Apply clang-format on repository

Code is formatted as per a revised clang-format configuration
file (not part of this delivery). clang-format version 14.0.6 is used.

Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, SConscript, ...)
And the following directories:
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/

There will be a follow-up change to format the .cl files and the
files under tests/ and compute_kernel_writer/validation/.

Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index f324b1a..c035644e 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClActivation.h"
 
@@ -35,18 +36,17 @@
 {
 struct CLActivationLayer::Impl
 {
-    const ICLTensor                      *src{ nullptr };
-    ICLTensor                            *dst{ nullptr };
-    CLRuntimeContext                     *ctx{ nullptr };
-    std::unique_ptr<opencl::ClActivation> op{ nullptr };
+    const ICLTensor                      *src{nullptr};
+    ICLTensor                            *dst{nullptr};
+    CLRuntimeContext                     *ctx{nullptr};
+    std::unique_ptr<opencl::ClActivation> op{nullptr};
 };
 
-CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx)
-    : _impl(std::make_unique<Impl>())
+CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
 {
     _impl->ctx = ctx;
 }
-CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default;
+CLActivationLayer::CLActivationLayer(CLActivationLayer &&)            = default;
 CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default;
 CLActivationLayer::~CLActivationLayer()                               = default;
 
@@ -55,7 +55,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info);
 }
 
-void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+void CLActivationLayer::configure(const CLCompileContext &compile_context,
+                                  ICLTensor              *input,
+                                  ICLTensor              *output,
+                                  ActivationLayerInfo     act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
@@ -66,7 +69,8 @@
     _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info);
 }
 
-Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status
+CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     return opencl::ClActivation::validate(input, output, act_info);
 }
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index b30d739..f9bbd31 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -27,31 +27,39 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/runtime/Utils.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _not_reshaped_output(), _arg_min_max_kernel(), _reshape(), _reduction_axis()
+    : _memory_group(std::move(memory_manager)),
+      _not_reshaped_output(),
+      _arg_min_max_kernel(),
+      _reshape(),
+      _reduction_axis()
 {
 }
 
 CLArgMinMaxLayer::~CLArgMinMaxLayer() = default;
 
-Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+Status
+CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+                                    "Invalid reduction operation");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions),
+                                    "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
 
     DataType   output_data_type = DataType::S32;
@@ -59,17 +67,18 @@
     const auto input_num_channles = input->num_channels();
     const auto input_qinfo        = input->quantization_info();
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         output_data_type                       = output->data_type();
-        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false));
+        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+            arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
     }
 
     auto shape_before_reshape = input->tensor_shape();
     shape_before_reshape.set(axis, 1);
-    auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
-    {
+    auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels,
+                                    QuantizationInfo qinfo) {
         ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
     };
 
@@ -85,20 +94,36 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op);
 }
 
-void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+void CLArgMinMaxLayer::configure(const CLCompileContext   &compile_context,
+                                 const ICLTensor          *input,
+                                 int                       axis,
+                                 ICLTensor                *output,
+                                 const ReductionOperation &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, axis, output, op);
 
     _reduction_axis = axis;
 
-    const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
-    DataType          output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type();
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+    const TensorShape output_shape =
+        arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+    DataType output_data_type =
+        (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type();
+    auto_init_if_empty(*output->info(), input->info()
+                                            ->clone()
+                                            ->set_tensor_shape(output_shape)
+                                            .set_data_type(output_data_type)
+                                            .reset_padding()
+                                            .set_is_resizable(true));
 
-    TensorShape not_reshaped_output_shape{ input->info()->tensor_shape() };
+    TensorShape not_reshaped_output_shape{input->info()->tensor_shape()};
     not_reshaped_output_shape.set(axis, 1);
-    auto_init_if_empty(*_not_reshaped_output.info(), input->info()->clone()->set_tensor_shape(not_reshaped_output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+    auto_init_if_empty(*_not_reshaped_output.info(), input->info()
+                                                         ->clone()
+                                                         ->set_tensor_shape(not_reshaped_output_shape)
+                                                         .set_data_type(output_data_type)
+                                                         .reset_padding()
+                                                         .set_is_resizable(true));
 
     _arg_min_max_kernel = std::make_unique<CLArgMinMaxLayerKernel>();
     _arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op);
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index e8affc0..0c371c4 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -30,9 +30,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 
 namespace arm_compute
 {
@@ -43,24 +42,40 @@
 
 CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default;
 
-void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon,
+void CLBatchNormalizationLayer::configure(ICLTensor          *input,
+                                          ICLTensor          *output,
+                                          const ICLTensor    *mean,
+                                          const ICLTensor    *var,
+                                          const ICLTensor    *beta,
+                                          const ICLTensor    *gamma,
+                                          float               epsilon,
                                           ActivationLayerInfo act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
-void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
-                                          const ICLTensor *gamma, float epsilon,
-                                          ActivationLayerInfo act_info)
+void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context,
+                                          ICLTensor              *input,
+                                          ICLTensor              *output,
+                                          const ICLTensor        *mean,
+                                          const ICLTensor        *var,
+                                          const ICLTensor        *beta,
+                                          const ICLTensor        *gamma,
+                                          float                   epsilon,
+                                          ActivationLayerInfo     act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info);
     _norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
-Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                           const ITensorInfo *mean, const ITensorInfo *var,
-                                           const ITensorInfo *beta, const ITensorInfo *gamma,
-                                           float epsilon, ActivationLayerInfo act_info)
+Status CLBatchNormalizationLayer::validate(const ITensorInfo  *input,
+                                           const ITensorInfo  *output,
+                                           const ITensorInfo  *mean,
+                                           const ITensorInfo  *var,
+                                           const ITensorInfo  *beta,
+                                           const ITensorInfo  *gamma,
+                                           float               epsilon,
+                                           ActivationLayerInfo act_info)
 {
     return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info);
 }
@@ -69,4 +84,4 @@
 {
     CLScheduler::get().enqueue(*_norm_kernel, true);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
index d7a4091..a3798da 100644
--- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
@@ -30,14 +30,12 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
 
 namespace arm_compute
 {
-CLBatchToSpaceLayer::CLBatchToSpaceLayer()
-    : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>())
+CLBatchToSpaceLayer::CLBatchToSpaceLayer() : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>())
 {
 }
 
@@ -49,29 +47,43 @@
     _batch_to_space_kernel->configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
 }
 
-void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    const ICLTensor        *block_shape,
+                                    ICLTensor              *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input, block_shape, output);
     _batch_to_space_kernel->configure(compile_context, input, block_shape, output);
 }
 
-void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
+void CLBatchToSpaceLayer::configure(
+    const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info);
 }
 
-void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    int32_t                 block_shape_x,
+                                    int32_t                 block_shape_y,
+                                    ICLTensor              *output,
+                                    const CropInfo         &crop_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, output);
     _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output, crop_info);
 }
 
-Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
 {
     return CLBatchToSpaceLayerKernel::validate(input, block_shape, output);
 }
 
-Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status CLBatchToSpaceLayer::validate(const ITensorInfo *input,
+                                     int32_t            block_shape_x,
+                                     int32_t            block_shape_y,
+                                     const ITensorInfo *output,
+                                     const CropInfo    &crop_info)
 {
     return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info);
 }
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
index a4712ed..7bfd0e3 100644
--- a/src/runtime/CL/functions/CLBitwiseAnd.cpp
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
 
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
 
 #include <utility>
 
@@ -36,11 +35,14 @@
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
 }
 
-void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseAnd::configure(const CLCompileContext &compile_context,
+                             const ICLTensor        *input1,
+                             const ICLTensor        *input2,
+                             ICLTensor              *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
     auto k = std::make_unique<CLBitwiseKernel>();
     k->configure(compile_context, input1, input2, output, BitwiseOperation::AND);
     _kernel = std::move(k);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
index 5964b92..9763915 100644
--- a/src/runtime/CL/functions/CLBitwiseNot.cpp
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
 
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
 
 #include <utility>
 
@@ -43,4 +42,4 @@
     k->configure(compile_context, input, nullptr, output, BitwiseOperation::NOT);
     _kernel = std::move(k);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
index a07bf17..dd3171b 100644
--- a/src/runtime/CL/functions/CLBitwiseOr.cpp
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
 
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
 
 #include <utility>
 
@@ -36,11 +35,14 @@
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
 }
 
-void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseOr::configure(const CLCompileContext &compile_context,
+                            const ICLTensor        *input1,
+                            const ICLTensor        *input2,
+                            ICLTensor              *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
     auto k = std::make_unique<CLBitwiseKernel>();
     k->configure(compile_context, input1, input2, output, BitwiseOperation::OR);
     _kernel = std::move(k);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
index f65e2e4..5bee4b3 100644
--- a/src/runtime/CL/functions/CLBitwiseXor.cpp
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
 
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
 
 #include <utility>
 
@@ -36,7 +35,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
 }
 
-void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseXor::configure(const CLCompileContext &compile_context,
+                             const ICLTensor        *input1,
+                             const ICLTensor        *input2,
+                             ICLTensor              *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
     auto k = std::make_unique<CLBitwiseKernel>();
diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
index 48583bf..76e626f 100644
--- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
+++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
@@ -23,18 +23,24 @@
  */
 #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h"
 
-#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
 
 namespace arm_compute
 {
-void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransform::configure(const ICLTensor                *boxes,
+                                       ICLTensor                      *pred_boxes,
+                                       const ICLTensor                *deltas,
+                                       const BoundingBoxTransformInfo &info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
 }
 
-void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransform::configure(const CLCompileContext         &compile_context,
+                                       const ICLTensor                *boxes,
+                                       ICLTensor                      *pred_boxes,
+                                       const ICLTensor                *deltas,
+                                       const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info);
 
@@ -44,7 +50,10 @@
     _kernel = std::move(k);
 }
 
-Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status CLBoundingBoxTransform::validate(const ITensorInfo              *boxes,
+                                        const ITensorInfo              *pred_boxes,
+                                        const ITensorInfo              *deltas,
+                                        const BoundingBoxTransformInfo &info)
 {
     return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
 }
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
index 10f7cc2..42ec8f7 100644
--- a/src/runtime/CL/functions/CLCast.cpp
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -26,10 +26,10 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCast.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCast.h"
 
 #include <utility>
 
@@ -37,16 +37,15 @@
 {
 struct CLCast::Impl
 {
-    const ICLTensor                *src{ nullptr };
-    ICLTensor                      *dst{ nullptr };
-    std::unique_ptr<opencl::ClCast> op{ nullptr };
+    const ICLTensor                *src{nullptr};
+    ICLTensor                      *dst{nullptr};
+    std::unique_ptr<opencl::ClCast> op{nullptr};
 };
 
-CLCast::CLCast()
-    : _impl(std::make_unique<Impl>())
+CLCast::CLCast() : _impl(std::make_unique<Impl>())
 {
 }
-CLCast::CLCast(CLCast &&) = default;
+CLCast::CLCast(CLCast &&)            = default;
 CLCast &CLCast::operator=(CLCast &&) = default;
 CLCast::~CLCast()                    = default;
 
@@ -55,7 +54,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, policy);
 }
 
-void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+void CLCast::configure(const CLCompileContext &compile_context,
+                       const ICLTensor        *input,
+                       ICLTensor              *output,
+                       ConvertPolicy           policy)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, output, policy);
@@ -74,7 +76,7 @@
 
 void CLCast::run()
 {
-    ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+    ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
     _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
index 021f28f..1ee4789 100644
--- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
+++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
@@ -24,9 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
 
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
 
 namespace arm_compute
 {
@@ -35,7 +35,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
 }
 
-void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context,
+                                      const ICLTensor        *input,
+                                      ICLTensor              *output,
+                                      unsigned int            num_groups)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, num_groups);
     auto k = std::make_unique<CLChannelShuffleLayerKernel>();
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
index 192a266..2f54371 100644
--- a/src/runtime/CL/functions/CLComparison.cpp
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -25,10 +25,10 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLComparisonKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLComparisonKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 namespace arm_compute
 {
@@ -37,25 +37,33 @@
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
 }
 
-void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparison::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input1,
+                             ICLTensor              *input2,
+                             ICLTensor              *output,
+                             ComparisonOperation     operation)
 {
     ARM_COMPUTE_LOG_PARAMS(input2, input2, output, operation);
     auto k = std::make_unique<CLComparisonKernel>();
     k->configure(compile_context, input1, input2, output, operation);
     _kernel = std::move(k);
 
-    if(output->info()->dimension(0) > 1)
+    if (output->info()->dimension(0) > 1)
     {
         ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if (broadcasted_info->info()->dimension(0) == 1)
         {
-            _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+            _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(),
+                                       BorderMode::REPLICATE);
         }
     }
 }
 
-Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparison::validate(const ITensorInfo  *input1,
+                              const ITensorInfo  *input2,
+                              const ITensorInfo  *output,
+                              ComparisonOperation operation)
 {
     return CLComparisonKernel::validate(input1, input2, output, operation);
 }
@@ -67,25 +75,30 @@
 }
 
 template <ComparisonOperation COP>
-void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context,
+                                        ICLTensor              *input1,
+                                        ICLTensor              *input2,
+                                        ICLTensor              *output)
 {
     auto k = std::make_unique<CLComparisonKernel>();
     k->configure(compile_context, input1, input2, output, COP);
     _kernel = std::move(k);
 
-    if(output->info()->dimension(0) > 1)
+    if (output->info()->dimension(0) > 1)
     {
         ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if (broadcasted_info->info()->dimension(0) == 1)
         {
-            _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+            _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(),
+                                       BorderMode::REPLICATE);
         }
     }
 }
 
 template <ComparisonOperation COP>
-Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status
+CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
     return CLComparisonKernel::validate(input1, input2, output, COP);
 }
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 0a8884f..9df1c34 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -24,24 +24,23 @@
 #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClConcatenate.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClConcatenate.h"
 
 namespace arm_compute
 {
 struct CLConcatenateLayer::Impl
 {
     std::vector<const ICLTensor *>         srcs{};
-    ICLTensor                             *dst{ nullptr };
-    unsigned int                           num_inputs{ 0 };
-    unsigned int                           axis{ 0 };
-    std::unique_ptr<opencl::ClConcatenate> op{ nullptr };
+    ICLTensor                             *dst{nullptr};
+    unsigned int                           num_inputs{0};
+    unsigned int                           axis{0};
+    std::unique_ptr<opencl::ClConcatenate> op{nullptr};
 };
 
-CLConcatenateLayer::CLConcatenateLayer()
-    : _impl(std::make_unique<Impl>())
+CLConcatenateLayer::CLConcatenateLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
@@ -56,7 +55,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
 }
 
-void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+void CLConcatenateLayer::configure(const CLCompileContext         &compile_context,
+                                   std::vector<const ICLTensor *> &inputs_vector,
+                                   ICLTensor                      *output,
+                                   size_t                          axis)
 {
     ARM_COMPUTE_ERROR_ON(output == nullptr);
     ARM_COMPUTE_LOG_PARAMS(inputs_vector, output, axis);
@@ -68,7 +70,7 @@
     _impl->op         = std::make_unique<opencl::ClConcatenate>();
 
     std::vector<ITensorInfo *> inputs_vector_info;
-    for(unsigned int i = 0; i < inputs_vector.size(); ++i)
+    for (unsigned int i = 0; i < inputs_vector.size(); ++i)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
@@ -76,7 +78,9 @@
     _impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis);
 }
 
-Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector,
+                                    const ITensorInfo                      *output,
+                                    size_t                                  axis)
 {
     return opencl::ClConcatenate::validate(inputs_vector, output, axis);
 }
@@ -84,7 +88,7 @@
 void CLConcatenateLayer::run()
 {
     ITensorPack pack;
-    for(unsigned i = 0; i < _impl->num_inputs; ++i)
+    for (unsigned i = 0; i < _impl->num_inputs; ++i)
     {
         pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i));
     }
diff --git a/src/runtime/CL/functions/CLConv3D.cpp b/src/runtime/CL/functions/CLConv3D.cpp
index 729b973..9d1b368 100644
--- a/src/runtime/CL/functions/CLConv3D.cpp
+++ b/src/runtime/CL/functions/CLConv3D.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLConv3D.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/gpu/cl/operators/ClDirectConv3d.h"
 
 namespace arm_compute
@@ -32,29 +33,38 @@
 
 struct CLConv3D::Impl
 {
-    const ICLTensor                        *src{ nullptr };
-    const ICLTensor                        *weights{ nullptr };
-    const ICLTensor                        *biases{ nullptr };
-    ICLTensor                              *dst{ nullptr };
-    std::unique_ptr<opencl::ClDirectConv3d> op{ nullptr };
+    const ICLTensor                        *src{nullptr};
+    const ICLTensor                        *weights{nullptr};
+    const ICLTensor                        *biases{nullptr};
+    ICLTensor                              *dst{nullptr};
+    std::unique_ptr<opencl::ClDirectConv3d> op{nullptr};
 };
 
-CLConv3D::CLConv3D()
-    : _impl(std::make_unique<Impl>())
+CLConv3D::CLConv3D() : _impl(std::make_unique<Impl>())
 {
 }
 
 CLConv3D::~CLConv3D() = default;
 
-void CLConv3D::configure(const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info)
+void CLConv3D::configure(const ICLTensor  *src,
+                         const ICLTensor  *weights,
+                         const ICLTensor  *biases,
+                         ICLTensor        *dst,
+                         const Conv3dInfo &conv3d_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv3d_info);
 }
 
-void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info)
+void CLConv3D::configure(const CLCompileContext &compile_context,
+                         const ICLTensor        *src,
+                         const ICLTensor        *weights,
+                         const ICLTensor        *biases,
+                         ICLTensor              *dst,
+                         const Conv3dInfo       &conv3d_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(
+        src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info));
 
     _impl->src     = src;
     _impl->weights = weights;
@@ -62,10 +72,15 @@
     _impl->dst     = dst;
 
     _impl->op = std::make_unique<opencl::ClDirectConv3d>();
-    _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info);
+    _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(),
+                         _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info);
 }
 
-Status CLConv3D::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv3dInfo &conv3d_info)
+Status CLConv3D::validate(const ITensorInfo *src,
+                          const ITensorInfo *weights,
+                          const ITensorInfo *biases,
+                          const ITensorInfo *dst,
+                          const Conv3dInfo  &conv3d_info)
 {
     return opencl::ClDirectConv3d::validate(src, weights, biases, dst, conv3d_info);
 }
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
index b3efe5c..2298f2a 100644
--- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -27,33 +27,37 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
 
 namespace arm_compute
 {
 struct CLConvertFullyConnectedWeights::Impl
 {
-    const ICLTensor                                        *src{ nullptr };
-    ICLTensor                                              *dst{ nullptr };
-    std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{ nullptr };
+    const ICLTensor                                        *src{nullptr};
+    ICLTensor                                              *dst{nullptr};
+    std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{nullptr};
 };
-CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights()
-    : _impl(std::make_unique<Impl>())
+CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>())
 {
 }
 CLConvertFullyConnectedWeights::~CLConvertFullyConnectedWeights() = default;
 
-void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
-                                               DataLayout data_layout)
+void CLConvertFullyConnectedWeights::configure(const ICLTensor   *input,
+                                               ICLTensor         *output,
+                                               const TensorShape &original_input_shape,
+                                               DataLayout         data_layout)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout);
 }
 
-void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
-                                               DataLayout data_layout)
+void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context,
+                                               const ICLTensor        *input,
+                                               ICLTensor              *output,
+                                               const TensorShape      &original_input_shape,
+                                               DataLayout              data_layout)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, output, original_input_shape, data_layout);
@@ -63,8 +67,10 @@
     _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), original_input_shape, data_layout);
 }
 
-Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
-                                                DataLayout data_layout)
+Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input,
+                                                const ITensorInfo *output,
+                                                const TensorShape &original_input_shape,
+                                                DataLayout         data_layout)
 {
     return opencl::ClConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout);
 }
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index f3c05ad..7767b45 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -28,11 +28,11 @@
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClConv2d.h"
-
-#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 
 namespace arm_compute
@@ -43,41 +43,59 @@
 {
     MemoryGroup                          memory_group{};
     std::shared_ptr<IMemoryManager>      memory_manager{};
-    std::unique_ptr<opencl::IClOperator> op{ nullptr };
+    std::unique_ptr<opencl::IClOperator> op{nullptr};
     ITensorPack                          run_pack{};
     ITensorPack                          prep_pack{};
     WorkspaceData<CLTensor>              workspace{};
     experimental::MemoryRequirements     aux_mem_req{};
-    std::unique_ptr<IFunction>           func{ nullptr };
+    std::unique_ptr<IFunction>           func{nullptr};
 };
 
-CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _impl(std::make_unique<Impl>())
+CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
 {
     _impl->memory_manager = std::move(memory_manager);
 }
 
 CLConvolutionLayer::~CLConvolutionLayer() = default;
 
-void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CLConvolutionLayer::configure(ICLTensor                 *input,
+                                   const ICLTensor           *weights,
+                                   const ICLTensor           *biases,
+                                   ICLTensor                 *output,
+                                   const PadStrideInfo       &conv_info,
+                                   const WeightsInfo         &weights_info,
+                                   const Size2D              &dilation,
+                                   const ActivationLayerInfo &act_info,
+                                   bool                       enable_fast_math,
+                                   unsigned int               num_groups)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info,
+              dilation, act_info, enable_fast_math, num_groups);
 }
 
-void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                   const WeightsInfo &weights_info,
-                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CLConvolutionLayer::configure(const CLCompileContext    &compile_context,
+                                   ICLTensor                 *input,
+                                   const ICLTensor           *weights,
+                                   const ICLTensor           *biases,
+                                   ICLTensor                 *output,
+                                   const PadStrideInfo       &conv_info,
+                                   const WeightsInfo         &weights_info,
+                                   const Size2D              &dilation,
+                                   const ActivationLayerInfo &act_info,
+                                   bool                       enable_fast_math,
+                                   unsigned int               num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
-                                                            enable_fast_math, num_groups));
-    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+    ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(
+        input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+        weights_info, dilation, act_info, enable_fast_math, num_groups));
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+                           enable_fast_math, num_groups);
 
     const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
 
-    switch(opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
-                                                    weights_info, CLScheduler::get().target()))
+    switch (opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
+                                                     weights_info, CLScheduler::get().target()))
     {
         case ConvolutionMethod::WINOGRAD:
         case ConvolutionMethod::DIRECT:
@@ -85,7 +103,8 @@
         case ConvolutionMethod::GEMM:
         {
             auto f = std::make_unique<opencl::ClConv2d>();
-            f->configure(compile_context, input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
+            f->configure(compile_context, input->info(), weights->info(),
+                         ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
             _impl->op = std::move(f);
             break;
         }
@@ -101,40 +120,52 @@
             break;
     }
 
-    if(_impl->op)
+    if (_impl->op)
     {
         _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
         _impl->aux_mem_req  = _impl->op->workspace();
-        _impl->run_pack     = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
-        _impl->prep_pack    = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
-        _impl->workspace    = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+        _impl->run_pack     = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+        _impl->prep_pack    = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+        _impl->workspace =
+            manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
     }
 }
 
-Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CLConvolutionLayer::validate(const ITensorInfo         *input,
+                                    const ITensorInfo         *weights,
+                                    const ITensorInfo         *biases,
+                                    const ITensorInfo         *output,
+                                    const PadStrideInfo       &conv_info,
+                                    const WeightsInfo         &weights_info,
+                                    const Size2D              &dilation,
+                                    const ActivationLayerInfo &act_info,
+                                    bool                       enable_fast_math,
+                                    unsigned int               num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW),
+                                    "Grouping (num_groups != 1) with NHWC data layout is not supported");
 
     const GPUTarget  gpu_target  = CLScheduler::get().target();
     const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
 
-    switch(opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
+    switch (opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
     {
         case ConvolutionMethod::WINOGRAD:
         case ConvolutionMethod::DIRECT:
         case ConvolutionMethod::INDIRECT:
         case ConvolutionMethod::GEMM:
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info));
             break;
         }
         case ConvolutionMethod::FFT:
         {
             // Validate FFT-based convolution layer
-            ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info,
+                                                                        act_info, enable_fast_math));
             break;
         }
         default:
@@ -145,8 +176,15 @@
     return Status{};
 }
 
-ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                             const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math)
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo         *input,
+                                                             const ITensorInfo         *weights,
+                                                             const ITensorInfo         *output,
+                                                             const PadStrideInfo       &conv_info,
+                                                             const WeightsInfo         &weights_info,
+                                                             const ActivationLayerInfo &act_info,
+                                                             const GPUTarget            gpu_target,
+                                                             const Size2D              &dilation,
+                                                             bool                       enable_fast_math)
 {
     const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, 1);
     return opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target);
@@ -158,7 +196,7 @@
 
     MemoryGroupResourceScope scope_mg(_impl->memory_group);
 
-    if(_impl->func)
+    if (_impl->func)
     {
         _impl->func->run();
     }
@@ -170,7 +208,7 @@
 
 void CLConvolutionLayer::prepare()
 {
-    if(_impl->func)
+    if (_impl->func)
     {
         _impl->func->prepare();
     }
diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp
index 56400b6..a4f2b06 100644
--- a/src/runtime/CL/functions/CLCopy.cpp
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -27,10 +27,10 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCopy.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCopy.h"
 
 #include <utility>
 
@@ -38,16 +38,15 @@
 {
 struct CLCopy::Impl
 {
-    const ICLTensor                *src{ nullptr };
-    ICLTensor                      *dst{ nullptr };
-    std::unique_ptr<opencl::ClCopy> op{ nullptr };
+    const ICLTensor                *src{nullptr};
+    ICLTensor                      *dst{nullptr};
+    std::unique_ptr<opencl::ClCopy> op{nullptr};
 };
 
-CLCopy::CLCopy()
-    : _impl(std::make_unique<Impl>())
+CLCopy::CLCopy() : _impl(std::make_unique<Impl>())
 {
 }
-CLCopy::CLCopy(CLCopy &&) = default;
+CLCopy::CLCopy(CLCopy &&)            = default;
 CLCopy &CLCopy::operator=(CLCopy &&) = default;
 CLCopy::~CLCopy()                    = default;
 
diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp
index 35ea17c..fc29c43 100644
--- a/src/runtime/CL/functions/CLCrop.cpp
+++ b/src/runtime/CL/functions/CLCrop.cpp
@@ -27,10 +27,10 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCrop.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCrop.h"
 
 #include <utility>
 
@@ -38,27 +38,38 @@
 {
 struct CLCrop::Impl
 {
-    const ICLTensor                *src{ nullptr };
-    ICLTensor                      *dst{ nullptr };
-    std::unique_ptr<opencl::ClCrop> op{ nullptr };
+    const ICLTensor                *src{nullptr};
+    ICLTensor                      *dst{nullptr};
+    std::unique_ptr<opencl::ClCrop> op{nullptr};
 };
 
-CLCrop::CLCrop()
-    : _impl(std::make_unique<Impl>())
+CLCrop::CLCrop() : _impl(std::make_unique<Impl>())
 {
 }
-CLCrop::CLCrop(CLCrop &&) = default;
+CLCrop::CLCrop(CLCrop &&)            = default;
 CLCrop &CLCrop::operator=(CLCrop &&) = default;
 CLCrop::~CLCrop()                    = default;
 
-void CLCrop::configure(const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
-                       Window *dst_window)
+void CLCrop::configure(const ICLTensor *src,
+                       ICLTensor       *dst,
+                       Coordinates2D    start,
+                       Coordinates2D    end,
+                       uint32_t         batch_index,
+                       float            extrapolation_value,
+                       Window          *dst_window)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, dst_window);
+    configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value,
+              dst_window);
 }
 
-void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
-                       Window *dst_window)
+void CLCrop::configure(const CLCompileContext &compile_context,
+                       const ICLTensor        *src,
+                       ICLTensor              *dst,
+                       Coordinates2D           start,
+                       Coordinates2D           end,
+                       uint32_t                batch_index,
+                       float                   extrapolation_value,
+                       Window                 *dst_window)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window);
@@ -67,10 +78,17 @@
     _impl->dst = dst;
 
     _impl->op = std::make_unique<opencl::ClCrop>();
-    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, extrapolation_value, dst_window);
+    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index,
+                         extrapolation_value, dst_window);
 }
 
-Status CLCrop::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
+Status CLCrop::validate(const ITensorInfo *input,
+                        const ITensorInfo *output,
+                        Coordinates2D      start,
+                        Coordinates2D      end,
+                        uint32_t           batch_index,
+                        float              extrapolation_value,
+                        Window            *dst_window)
 {
     return opencl::ClCrop::validate(input, output, start, end, batch_index, extrapolation_value, dst_window);
 }
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index d8fc38d..821412b 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -25,19 +25,26 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
-#include "src/common/utils/Log.h"
-
 #include <cstddef>
 
 namespace arm_compute
 {
 namespace
 {
-inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index)
+inline void configure_crop(const ICLTensor *input,
+                           ICLTensor       *crop_boxes,
+                           ICLTensor       *box_ind,
+                           ICLTensor       *output,
+                           uint32_t         crop_box_ind,
+                           Coordinates     &start,
+                           Coordinates     &end,
+                           uint32_t        &batch_index)
 {
     batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind))));
 
@@ -50,30 +57,48 @@
     // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers.
     start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
                         std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
-    end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
-                      std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
-    const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
+    end   = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+                        std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+    const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1,
+                                static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
     output->info()->set_tensor_shape(out_shape);
 }
 } // namespace
 
 CLCropResize::CLCropResize()
-    : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_functions()
+    : _input(nullptr),
+      _boxes(nullptr),
+      _box_ind(nullptr),
+      _output(nullptr),
+      _num_boxes(0),
+      _method(),
+      _extrapolation_value(0),
+      _scale(),
+      _copy(),
+      _crop_results(),
+      _scaled_results(),
+      _internal_functions()
 {
 }
 
 CLCropResize::~CLCropResize() = default;
 
-Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output,
-                              Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+Status CLCropResize::validate(const ITensorInfo  *input,
+                              ITensorInfo        *boxes,
+                              ITensorInfo        *box_ind,
+                              const ITensorInfo  *output,
+                              Coordinates2D       crop_size,
+                              InterpolationPolicy method,
+                              float               extrapolation_value)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
     ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
     ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4);
     ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
     TensorInfo temp_info;
-    ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value));
-    if(output->total_size() > 0)
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, {0, 0}, {1, 1},
+                                                 input->dimension(3) - 1, extrapolation_value));
+    if (output->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -83,20 +108,34 @@
     return Status{};
 }
 
-void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
-                             InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const ICLTensor    *input,
+                             ICLTensor          *boxes,
+                             ICLTensor          *box_ind,
+                             ICLTensor          *output,
+                             Coordinates2D       crop_size,
+                             InterpolationPolicy method,
+                             float               extrapolation_value)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value);
+    configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method,
+              extrapolation_value);
 }
 
-void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
-                             InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const CLCompileContext &compile_context,
+                             const ICLTensor        *input,
+                             ICLTensor              *boxes,
+                             ICLTensor              *box_ind,
+                             ICLTensor              *output,
+                             Coordinates2D           crop_size,
+                             InterpolationPolicy     method,
+                             float                   extrapolation_value)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind);
-    ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+    ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(),
+                                                      crop_size, method, extrapolation_value));
     ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value);
 
-    TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
+    TensorShape output_shape =
+        TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
     auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32);
 
     _num_boxes = boxes->info()->tensor_shape()[1];
@@ -122,7 +161,7 @@
     // kernels used for cropping and scaling.
     _boxes->map(CLScheduler::get().queue());
     _box_ind->map(CLScheduler::get().queue());
-    for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box)
+    for (unsigned int num_box = 0; num_box < _num_boxes; ++num_box)
     {
         auto       crop_tensor = std::make_unique<CLTensor>();
         TensorInfo crop_result_info(1, DataType::F32);
@@ -143,7 +182,9 @@
         configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index);
 
         auto scale_kernel = std::make_unique<CLScale>();
-        scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT });
+        scale_kernel->configure(
+            compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(),
+            ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT});
         _scale.emplace_back(std::move(scale_kernel));
 
         Window win = calculate_max_window(*_output->info());
@@ -159,28 +200,50 @@
         bool is_width_flipped  = end[0] < start[0];
         bool is_height_flipped = end[1] < start[1];
         /** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */
-        std::array<int32_t, 2> rows_out_of_bounds{ 0 };
+        std::array<int32_t, 2> rows_out_of_bounds{0};
         /** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). */
-        std::array<int32_t, 2> cols_out_of_bounds{ 0 };
-        if(is_height_flipped)
+        std::array<int32_t, 2> cols_out_of_bounds{0};
+        if (is_height_flipped)
         {
-            rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0;
-            rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0;
+            rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+                                        ? std::min(start[1] - _input->info()->dimension(2) + 1,
+                                                   _crop_results[num_box].get()->info()->dimension(2))
+                                        : 0;
+            rows_out_of_bounds[1] =
+                end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)))
+                           : 0;
         }
         else
         {
-            rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0;
-            rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0;
+            rows_out_of_bounds[0] =
+                start[1] < 0
+                    ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)))
+                    : 0;
+            rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+                                        ? std::min(end[1] - _input->info()->dimension(2) + 1,
+                                                   _crop_results[num_box].get()->info()->dimension(2))
+                                        : 0;
         }
-        if(is_width_flipped)
+        if (is_width_flipped)
         {
-            cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0;
-            cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0;
+            cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+                                        ? std::min(start[0] - _input->info()->dimension(1) + 1,
+                                                   _crop_results[num_box].get()->info()->dimension(1))
+                                        : 0;
+            cols_out_of_bounds[1] =
+                end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)))
+                           : 0;
         }
         else
         {
-            cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0;
-            cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0;
+            cols_out_of_bounds[0] =
+                start[0] < 0
+                    ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)))
+                    : 0;
+            cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+                                        ? std::min(end[0] - _input->info()->dimension(1) + 1,
+                                                   _crop_results[num_box].get()->info()->dimension(1))
+                                        : 0;
         }
 
         Window full_window = calculate_max_window(*_crop_results[num_box].get()->info());
@@ -203,67 +266,84 @@
         // Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds
         // with the extrapolation value using memset.
         // First for the rows before the in bounds rows.
-        if(rows_out_of_bounds[0] > 0)
+        if (rows_out_of_bounds[0] > 0)
         {
             Window slice_fill_rows_before(full_window);
             slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1));
             auto kernel = std::make_unique<CLFill>();
-            kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before);
+            kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+                              &slice_fill_rows_before);
             //_internal_functions.emplace_back(std::move(kernel));
             _internal_functions.push_back(std::move(kernel));
         }
 
         Window slice_in(full_window);
-        slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1));
-        slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1));
+        slice_in.set(2,
+                     Window::Dimension(rows_out_of_bounds[0],
+                                       _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1));
+        slice_in.set(1,
+                     Window::Dimension(cols_out_of_bounds[0],
+                                       _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1));
 
-        int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1];
-        if(rows_in_bounds > 0)
+        int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) -
+                             rows_out_of_bounds[0] - rows_out_of_bounds[1];
+        if (rows_in_bounds > 0)
         {
             // Fill all elements that share a row with an in bounds element with the extrapolation value.
-            if(cols_out_of_bounds[0] > 0)
+            if (cols_out_of_bounds[0] > 0)
             {
                 Window slice_fill_cols_before(slice_in);
                 slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1));
                 auto kernel = std::make_unique<CLFill>();
-                kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before);
+                kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+                                  &slice_fill_cols_before);
                 //_internal_functions.emplace_back(std::move(kernel));
                 _internal_functions.push_back(std::move(kernel));
             }
 
-            if(cols_out_of_bounds[1] > 0)
+            if (cols_out_of_bounds[1] > 0)
             {
                 Window slice_fill_cols_after(slice_in);
-                slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1));
+                slice_fill_cols_after.set(
+                    1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1],
+                                         _crop_results[num_box].get()->info()->dimension(1), 1));
                 auto kernel = std::make_unique<CLFill>();
-                kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after);
+                kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+                                  &slice_fill_cols_after);
                 //_internal_functions.emplace_back(std::move(kernel));
                 _internal_functions.push_back(std::move(kernel));
             }
 
             // Copy all elements within the input bounds from the input tensor.
-            int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1];
-            if(cols_in_bounds > 0)
+            int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) -
+                                 cols_out_of_bounds[0] - cols_out_of_bounds[1];
+            if (cols_in_bounds > 0)
             {
-                Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
-                                        is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] };
-                Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
-                                      is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 };
+                Coordinates2D start_in{
+                    is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
+                    is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0]};
+                Coordinates2D end_in{
+                    is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
+                    is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1};
                 auto kernel = std::make_unique<CLCrop>();
 
-                kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in);
+                kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index,
+                                  extrapolation_value, &slice_in);
                 //_internal_functions.emplace_back(std::move(kernel));
                 _internal_functions.push_back(std::move(kernel));
             }
         }
 
         // Fill all rows after the in bounds elements with the extrapolation value.
-        if(rows_out_of_bounds[1] > 0)
+        if (rows_out_of_bounds[1] > 0)
         {
             Window slice_fill_rows_after(full_window);
-            slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1));
+            slice_fill_rows_after.set(
+                2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1],
+                                     _crop_results[num_box].get()->info()->dimension(2), 1));
             auto kernel = std::make_unique<CLFill>();
-            kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after);
+            kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+                              &slice_fill_rows_after);
             //_internal_functions.emplace_back(std::move(kernel));
             _internal_functions.push_back(std::move(kernel));
         }
@@ -277,18 +357,18 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
 
-    for(unsigned int i = 0; i < _internal_functions.size(); ++i)
+    for (unsigned int i = 0; i < _internal_functions.size(); ++i)
     {
         _internal_functions[i]->run();
     }
 
     CLScheduler::get().sync();
-    for(auto &kernel : _scale)
+    for (auto &kernel : _scale)
     {
         kernel->run();
     }
     CLScheduler::get().sync();
-    for(auto &kernel : _copy)
+    for (auto &kernel : _copy)
     {
         kernel->run();
     }
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 4421a18..e988ab0 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -25,16 +25,16 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/IClOperator.h"
 #include "src/gpu/cl/operators/ClTransposedConvolution.h"
 
-#include "src/common/utils/Log.h"
-
 #include <cmath>
 #include <memory>
 #include <tuple>
@@ -44,11 +44,11 @@
 
 struct CLDeconvolutionLayer::Impl
 {
-    const ICLTensor                     *src{ nullptr };
-    const ICLTensor                     *weights{ nullptr };
-    const ICLTensor                     *biases{ nullptr };
-    ICLTensor                           *dst{ nullptr };
-    std::unique_ptr<opencl::IClOperator> op{ nullptr };
+    const ICLTensor                     *src{nullptr};
+    const ICLTensor                     *weights{nullptr};
+    const ICLTensor                     *biases{nullptr};
+    ICLTensor                           *dst{nullptr};
+    std::unique_ptr<opencl::IClOperator> op{nullptr};
 };
 
 CLDeconvolutionLayer::~CLDeconvolutionLayer() = default;
@@ -58,24 +58,35 @@
 {
 }
 
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
-                                     const WeightsInfo &weights_info)
+void CLDeconvolutionLayer::configure(ICLTensor           *input,
+                                     ICLTensor           *weights,
+                                     const ICLTensor     *bias,
+                                     ICLTensor           *output,
+                                     const PadStrideInfo &deconv_info,
+                                     const WeightsInfo   &weights_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info);
 }
 
-void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
-                                     const WeightsInfo &weights_info)
+void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+                                     ICLTensor              *input,
+                                     ICLTensor              *weights,
+                                     const ICLTensor        *bias,
+                                     ICLTensor              *output,
+                                     const PadStrideInfo    &deconv_info,
+                                     const WeightsInfo      &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info, weights_info);
 
-    switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
+    switch (CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(),
+                                                           deconv_info, weights_info))
     {
         case DeconvolutionMethod::DIRECT:
         {
             auto op = std::make_unique<opencl::ClTransposedConvolution>();
-            op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info);
+            op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr,
+                          output->info(), deconv_info);
 
             _impl->src     = input;
             _impl->weights = weights;
@@ -105,22 +116,28 @@
     }
 }
 
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
-                                      const WeightsInfo &weights_info)
+Status CLDeconvolutionLayer::validate(const ITensorInfo   *input,
+                                      const ITensorInfo   *weights,
+                                      const ITensorInfo   *bias,
+                                      ITensorInfo         *output,
+                                      const PadStrideInfo &deconv_info,
+                                      const WeightsInfo   &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
+    switch (CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
     {
         case DeconvolutionMethod::DIRECT:
         {
             // Validate transposed convolution operator
-            ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info));
             break;
         }
         case DeconvolutionMethod::UPSCALE_CONV2D:
         {
             // Validate direct convolution layer
-            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
             break;
         }
         case DeconvolutionMethod::GEMM:
@@ -137,12 +154,16 @@
     return Status{};
 }
 
-DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
-                                                                   const WeightsInfo &weights_info)
+DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo   *input,
+                                                                   const ITensorInfo   *weights,
+                                                                   const ITensorInfo   *bias,
+                                                                   ITensorInfo         *output,
+                                                                   const PadStrideInfo &deconv_info,
+                                                                   const WeightsInfo   &weights_info)
 {
     ARM_COMPUTE_UNUSED(output, bias, weights_info);
 
-    if(is_data_type_quantized_per_channel(weights->data_type()))
+    if (is_data_type_quantized_per_channel(weights->data_type()))
     {
         return DeconvolutionMethod::UPSCALE_CONV2D;
     }
@@ -154,11 +175,12 @@
     const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
     const size_t ofm   = weights->tensor_shape()[idx_n];
 
-    if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second)
+    if (weights->dimension(idx_w) != deconv_info.stride().first ||
+        weights->dimension(idx_h) != deconv_info.stride().second)
     {
         // We observe better performance for FP32 types only when ofm <= 16.
         // A better heuristic is required for selecting the method for FP16 data types.
-        if(input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)))
+        if (input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)))
         {
             return DeconvolutionMethod::DIRECT;
         }
@@ -175,7 +197,7 @@
 {
     prepare();
 
-    if(_impl->op != nullptr)
+    if (_impl->op != nullptr)
     {
         // Optimized Operator will be used
         ITensorPack pack;
@@ -195,7 +217,7 @@
 
 void CLDeconvolutionLayer::prepare()
 {
-    if(_impl->op == nullptr)
+    if (_impl->op == nullptr)
     {
         _function->prepare();
     }
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index 0b428f5..b92bf90 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -27,22 +27,21 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
-#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
 
 namespace arm_compute
 {
 CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT
-    : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()),
-      _fill(),
-      _output(nullptr)
+    : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()), _fill(), _output(nullptr)
 {
 }
 
 CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default;
 
-Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info)
+Status
+CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info)
 {
     return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info);
 }
@@ -52,13 +51,17 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
 }
 
-void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context,
+                                             ICLTensor              *input,
+                                             ICLTensor              *output,
+                                             const PadStrideInfo    &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, output, info);
 
     _output = output;
-    _fill.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
+    _fill.configure(compile_context, _output,
+                    PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
     _upsample->configure(compile_context, input, _output, info);
 }
 
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index cac3f51..6d2fea9 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -26,10 +26,10 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCast.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCast.h"
 
 #include <utility>
 
@@ -37,16 +37,15 @@
 {
 struct CLDepthConvertLayer::Impl
 {
-    const ICLTensor                *src{ nullptr };
-    ICLTensor                      *dst{ nullptr };
-    std::unique_ptr<opencl::ClCast> op{ nullptr };
+    const ICLTensor                *src{nullptr};
+    ICLTensor                      *dst{nullptr};
+    std::unique_ptr<opencl::ClCast> op{nullptr};
 };
 
-CLDepthConvertLayer::CLDepthConvertLayer()
-    : _impl(std::make_unique<Impl>())
+CLDepthConvertLayer::CLDepthConvertLayer() : _impl(std::make_unique<Impl>())
 {
 }
-CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default;
+CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&)            = default;
 CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default;
 CLDepthConvertLayer::~CLDepthConvertLayer()                                 = default;
 
@@ -55,7 +54,11 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift);
 }
 
-void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+void CLDepthConvertLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    ICLTensor              *output,
+                                    ConvertPolicy           policy,
+                                    uint32_t                shift)
 {
     ARM_COMPUTE_UNUSED(shift);
     ARM_COMPUTE_LOG_PARAMS(input, output, policy, shift);
@@ -70,7 +73,8 @@
     _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy);
 }
 
-Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+Status
+CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(shift != 0);
     return opencl::ClCast::validate(input, output, policy);
@@ -78,7 +82,7 @@
 
 void CLDepthConvertLayer::run()
 {
-    ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+    ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
     _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
index 98531e7..9477c7f 100644
--- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h"
 
-#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
 
 #include <utility>
 
@@ -36,7 +35,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
 }
 
-void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    ICLTensor              *output,
+                                    int32_t                 block_shape)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
     auto k = std::make_unique<CLDepthToSpaceLayerKernel>();
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index dcb982f..873601b 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -29,12 +29,12 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
 #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
 #include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 using namespace arm_compute::misc;
@@ -63,25 +63,33 @@
 
 CLDepthwiseConvolutionLayer::~CLDepthwiseConvolutionLayer() = default;
 
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor           *input,
+                                            const ICLTensor     *weights,
+                                            const ICLTensor     *biases,
+                                            ICLTensor           *output,
+                                            const PadStrideInfo &conv_info,
+                                            unsigned int         depth_multiplier,
+                                            ActivationLayerInfo  act_info,
+                                            const Size2D        &dilation)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier,
+              act_info, dilation);
 }
 
-void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
-                                            ICLTensor *output, const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context,
+                                            ICLTensor              *input,
+                                            const ICLTensor        *weights,
+                                            const ICLTensor        *biases,
+                                            ICLTensor              *output,
+                                            const PadStrideInfo    &conv_info,
+                                            unsigned int            depth_multiplier,
+                                            ActivationLayerInfo     act_info,
+                                            const Size2D           &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
-                                                                     weights->info(),
-                                                                     biases != nullptr ? biases->info() : nullptr,
-                                                                     output != nullptr ? output->info() : input->info(),
-                                                                     conv_info,
-                                                                     depth_multiplier,
-                                                                     act_info,
-                                                                     dilation));
+    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(
+        input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr,
+        output != nullptr ? output->info() : input->info(), conv_info, depth_multiplier, act_info, dilation));
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
 
     _is_quantized     = is_data_type_quantized(input->info()->data_type());
@@ -96,7 +104,7 @@
     ICLTensor       *input_to_use   = input;
     const ICLTensor *weights_to_use = weights;
     ICLTensor       *output_to_use  = output;
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _memory_group.manage(&_permuted_input);
         _memory_group.manage(&_permuted_output);
@@ -119,10 +127,12 @@
 
     CLTensor *output_multipliers_to_use = nullptr;
     CLTensor *output_shifts_to_use      = nullptr;
-    if(_is_quantized)
+    if (_is_quantized)
     {
-        const size_t idx_c       = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
-        const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
+        const size_t idx_c =
+            get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t num_filters =
+            (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
 
         _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
         _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
@@ -132,16 +142,18 @@
     }
 
     // Get the depthwise convolution compute parameters
-    auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
-    const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
+    auto                       t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+    const DWCComputeKernelInfo dwc_native_compute_info =
+        t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
 
-    const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation };
+    const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
 
     _dwc_native_kernel->set_target(gpu_target);
     _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
-                                  dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, output_shifts_to_use);
+                                  dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use,
+                                  output_shifts_to_use);
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permuted_input.allocator()->allocate();
 
@@ -151,22 +163,27 @@
         _permuted_output.allocator()->allocate();
     }
 
-    if(_is_quantized)
+    if (_is_quantized)
     {
         _output_multipliers.allocator()->allocate();
         _output_shifts.allocator()->allocate();
     }
 }
 
-Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo   *input,
+                                             const ITensorInfo   *weights,
+                                             const ITensorInfo   *biases,
+                                             const ITensorInfo   *output,
                                              const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+                                             unsigned int         depth_multiplier,
+                                             ActivationLayerInfo  act_info,
+                                             const Size2D        &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
 
     const bool in_place = input == output || output == nullptr;
-    if(in_place)
+    if (in_place)
     {
         output = input;
     }
@@ -174,21 +191,23 @@
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) >
+                                input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) >
+                                input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
     const GPUTarget gpu_target = CLScheduler::get().target();
 
-    const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation };
+    const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
 
     const bool needs_permute = input->data_layout() == DataLayout::NCHW;
 
     const bool is_quantized = is_data_type_quantized(input->data_type());
 
     TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
-    if(is_quantized)
+    if (is_quantized)
     {
-        if(is_data_type_quantized_per_channel(weights->data_type()))
+        if (is_data_type_quantized_per_channel(weights->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
 
@@ -201,40 +220,57 @@
         }
     }
 
-    if(needs_permute)
+    if (needs_permute)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout");
         TensorShape           permuted_input_shape   = input->tensor_shape();
         TensorShape           permuted_weights_shape = weights->tensor_shape();
-        const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
-        TensorShape           permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+        const ConvolutionInfo info{conv_info, depth_multiplier, ActivationLayerInfo(), dilation};
+        TensorShape           permuted_output_shape =
+            shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
 
         permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
         permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
         permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
 
-        const TensorInfo permuted_input   = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC);
-        const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC);
-        const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_input = input->clone()
+                                              ->set_is_resizable(true)
+                                              .reset_padding()
+                                              .set_tensor_shape(permuted_input_shape)
+                                              .set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_weights = weights->clone()
+                                                ->set_is_resizable(true)
+                                                .reset_padding()
+                                                .set_tensor_shape(permuted_weights_shape)
+                                                .set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_output = output->clone()
+                                               ->set_is_resizable(true)
+                                               .reset_padding()
+                                               .set_tensor_shape(permuted_output_shape)
+                                               .set_data_layout(DataLayout::NHWC);
 
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
 
         // Get the depthwise convolution compute parameters
-        auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
-        const DWCComputeKernelInfo dwc_native_compute_info = t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
+        auto                       t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+        const DWCComputeKernelInfo dwc_native_compute_info =
+            t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output,
-                                                                                      dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+            &permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info,
+            &output_multipliers_shifts_info, &output_multipliers_shifts_info));
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
     }
     else
     {
         // Get the depthwise convolution compute parameters
-        auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
-        const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input, weights, conv_info, dilation, depth_multiplier);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
-                                                                                      &output_multipliers_shifts_info));
+        auto                       t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+        const DWCComputeKernelInfo dwc_native_compute_info =
+            t->configure(input, weights, conv_info, dilation, depth_multiplier);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+            input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
+            &output_multipliers_shifts_info));
     }
     return Status{};
 }
@@ -245,12 +281,12 @@
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_input_to_nhwc.run();
     }
     CLScheduler::get().enqueue(*_dwc_native_kernel);
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_output_to_nchw.run();
     }
@@ -258,22 +294,21 @@
 
 void CLDepthwiseConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
-        if(_is_quantized)
+        if (_is_quantized)
         {
             _output_multipliers.map();
             _output_shifts.map();
-            quantization::compute_quantized_multipliers_and_shifts(_input->info(),
-                                                                   _original_weights->info(),
-                                                                   _output != nullptr ? _output->info() : _input->info(),
-                                                                   reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
-                                                                   reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
+            quantization::compute_quantized_multipliers_and_shifts(
+                _input->info(), _original_weights->info(), _output != nullptr ? _output->info() : _input->info(),
+                reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
+                reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
             _output_multipliers.unmap();
             _output_shifts.unmap();
         }
 
-        if(_needs_permute)
+        if (_needs_permute)
         {
             ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 64c6b5d..20162a0 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -26,22 +26,21 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClDequantize.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClDequantize.h"
 
 namespace arm_compute
 {
 struct CLDequantizationLayer::Impl
 {
-    const ICLTensor                      *src{ nullptr };
-    ICLTensor                            *dst{ nullptr };
-    std::unique_ptr<opencl::ClDequantize> op{ nullptr };
+    const ICLTensor                      *src{nullptr};
+    ICLTensor                            *dst{nullptr};
+    std::unique_ptr<opencl::ClDequantize> op{nullptr};
 };
 
-CLDequantizationLayer::CLDequantizationLayer()
-    : _impl(std::make_unique<Impl>())
+CLDequantizationLayer::CLDequantizationLayer() : _impl(std::make_unique<Impl>())
 {
 }
 CLDequantizationLayer::~CLDequantizationLayer() = default;
@@ -51,7 +50,9 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
 }
 
-void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+void CLDequantizationLayer::configure(const CLCompileContext &compile_context,
+                                      const ICLTensor        *input,
+                                      ICLTensor              *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output);
     _impl->src = input;
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 752e0e4..d6dae0d 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -28,37 +28,46 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/gpu/cl/operators/ClActivation.h"
-#include "src/gpu/cl/operators/ClDirectConv2d.h"
 
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClActivation.h"
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
 
 namespace arm_compute
 {
 struct CLDirectConvolutionLayer::Impl
 {
-    const ICLTensor                        *src{ nullptr };
-    const ICLTensor                        *weights{ nullptr };
-    const ICLTensor                        *biases{ nullptr };
-    ICLTensor                              *dst{ nullptr };
-    std::unique_ptr<opencl::ClDirectConv2d> op{ nullptr };
+    const ICLTensor                        *src{nullptr};
+    const ICLTensor                        *weights{nullptr};
+    const ICLTensor                        *biases{nullptr};
+    ICLTensor                              *dst{nullptr};
+    std::unique_ptr<opencl::ClDirectConv2d> op{nullptr};
 };
 
-CLDirectConvolutionLayer::CLDirectConvolutionLayer()
-    : _impl(std::make_unique<Impl>())
+CLDirectConvolutionLayer::CLDirectConvolutionLayer() : _impl(std::make_unique<Impl>())
 {
 }
-CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default;
+CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&)            = default;
 CLDirectConvolutionLayer &CLDirectConvolutionLayer::operator=(CLDirectConvolutionLayer &&) = default;
 CLDirectConvolutionLayer::~CLDirectConvolutionLayer()                                      = default;
 
-void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLDirectConvolutionLayer::configure(ICLTensor                 *input,
+                                         const ICLTensor           *weights,
+                                         const ICLTensor           *biases,
+                                         ICLTensor                 *output,
+                                         const PadStrideInfo       &conv_info,
+                                         const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
 }
 
-void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                         const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLDirectConvolutionLayer::configure(const CLCompileContext    &compile_context,
+                                         ICLTensor                 *input,
+                                         const ICLTensor           *weights,
+                                         const ICLTensor           *biases,
+                                         ICLTensor                 *output,
+                                         const PadStrideInfo       &conv_info,
+                                         const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info);
@@ -69,10 +78,15 @@
     _impl->dst     = output;
 
     _impl->op = std::make_unique<opencl::ClDirectConv2d>();
-    _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
+    _impl->op->configure(compile_context, input->info(), weights->info(),
+                         (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
 }
 
-Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status CLDirectConvolutionLayer::validate(const ITensorInfo         *input,
+                                          const ITensorInfo         *weights,
+                                          const ITensorInfo         *biases,
+                                          const ITensorInfo         *output,
+                                          const PadStrideInfo       &conv_info,
                                           const ActivationLayerInfo &act_info)
 {
     return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
@@ -87,4 +101,4 @@
     pack.add_tensor(TensorType::ACL_DST, _impl->dst);
     _impl->op->run(pack);
 }
-}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index 88c3c61..3717f30 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -26,15 +26,15 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
-#include "src/common/utils/Log.h"
-
 #include <memory>
 #include <tuple>
 
@@ -55,11 +55,16 @@
 {
 }
 
-Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
-                                            const WeightsInfo &weights_info)
+Status CLDirectDeconvolutionLayer::validate(const ITensorInfo   *input,
+                                            const ITensorInfo   *weights,
+                                            const ITensorInfo   *bias,
+                                            ITensorInfo         *output,
+                                            const PadStrideInfo &info,
+                                            const WeightsInfo   &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
     const DataLayout data_layout = input->data_layout();
 
@@ -70,20 +75,22 @@
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) < 1);
 
-    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), info);
+    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h),
+                                                    weights->dimension(idx_w), weights->dimension(idx_h), info);
 
     const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
-    if(input->data_type() != weights->data_type())
+    if (input->data_type() != weights->data_type())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || !is_data_type_quantized_asymmetric(input->data_type()));
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL ||
+                                    !is_data_type_quantized_asymmetric(input->data_type()));
     }
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
-        if(is_data_type_quantized_asymmetric(input->data_type()))
+        if (is_data_type_quantized_asymmetric(input->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         }
@@ -102,24 +109,39 @@
     unsigned int        deconv_pad_y    = 0;
     const unsigned int  stride_x        = info.stride().first;
     const unsigned int  stride_y        = info.stride().second;
-    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
-    TensorInfo          scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
+    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y,
+                                                                                out_dims, deconv_pad_x, deconv_pad_y);
+    TensorInfo          scale_out_info(input->clone()
+                                           ->set_is_resizable(true)
+                                           .reset_padding()
+                                           .set_tensor_shape(scale_out_shape)
+                                           .set_data_layout(data_layout));
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
 
     return Status{};
 }
 
-void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
-                                           const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(ICLTensor           *input,
+                                           ICLTensor           *weights,
+                                           const ICLTensor     *bias,
+                                           ICLTensor           *output,
+                                           const PadStrideInfo &info,
+                                           const WeightsInfo   &weights_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info);
 }
 
-void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
-                                           const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+                                           ICLTensor              *input,
+                                           ICLTensor              *weights,
+                                           const ICLTensor        *bias,
+                                           ICLTensor              *output,
+                                           const PadStrideInfo    &info,
+                                           const WeightsInfo      &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, weights_info);
@@ -141,15 +163,19 @@
     _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
     _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
 
-    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
+    auto out_dims =
+        deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+                                        weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
 
     const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+    auto_init_if_empty(*output->info(),
+                       input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(
+        input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
 
     _is_prepared = weights_info.retain_internal_weights();
 
@@ -158,7 +184,8 @@
     // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
     unsigned int      deconv_pad_x    = 0;
     unsigned int      deconv_pad_y    = 0;
-    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
+        *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
 
     unsigned int deconv_pad_left  = pad_right > pad_left ? pad_right - pad_left : 0;
     unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
@@ -179,7 +206,8 @@
     _scaled_output.allocator()->init(scale_out_info);
 
     // configure scale function
-    const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
+    const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top,
+                                      deconv_pad_bottom, DimensionRoundingType::FLOOR);
     _scale_f.configure(compile_context, input, &_scaled_output, upsample_info);
 
     // Setup the function to convolve the upscaled output
@@ -191,7 +219,7 @@
     _flip_axis.allocator()->allocate();
     _flip_axis.map(true);
     auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
-    if(weights->info()->data_layout() == DataLayout::NHWC)
+    if (weights->info()->data_layout() == DataLayout::NHWC)
     {
         axis_data[0] = 1;
         axis_data[1] = 2;
@@ -216,7 +244,7 @@
 
 void CLDirectDeconvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
@@ -229,7 +257,7 @@
         _conv_f.prepare();
 
         // Free flipped weights
-        if(!_weights_flipped.is_used())
+        if (!_weights_flipped.is_used())
         {
             _weights_flipped.allocator()->free();
         }
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 936b37f..d9529f0 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
 
+#include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClAdd.h"
 #include "src/gpu/cl/operators/ClElementwiseOperations.h"
 #include "src/gpu/cl/operators/ClSub.h"
@@ -36,26 +36,30 @@
 {
 struct CLArithmeticAddition::Impl
 {
-    const ICLTensor               *src_0{ nullptr };
-    const ICLTensor               *src_1{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClAdd> op{ nullptr };
+    const ICLTensor               *src_0{nullptr};
+    const ICLTensor               *src_1{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClAdd> op{nullptr};
 };
 
-CLArithmeticAddition::CLArithmeticAddition()
-    : _impl(std::make_unique<Impl>())
+CLArithmeticAddition::CLArithmeticAddition() : _impl(std::make_unique<Impl>())
 {
 }
-CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default;
+CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&)            = default;
 CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default;
 CLArithmeticAddition::~CLArithmeticAddition()                                  = default;
 
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::configure(
+    ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
 }
 
-void CLArithmeticAddition::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+void CLArithmeticAddition::configure(const CLCompileContext    &compile_context,
+                                     const ICLTensor           *input1,
+                                     const ICLTensor           *input2,
+                                     ICLTensor                 *output,
+                                     ConvertPolicy              policy,
                                      const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
@@ -65,7 +69,11 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
 }
 
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CLArithmeticAddition::validate(const ITensorInfo         *input1,
+                                      const ITensorInfo         *input2,
+                                      const ITensorInfo         *output,
+                                      ConvertPolicy              policy,
+                                      const ActivationLayerInfo &act_info)
 {
     return opencl::ClAdd::validate(input1, input2, output, policy, act_info);
 }
@@ -82,26 +90,33 @@
 
 struct CLArithmeticSubtraction::Impl
 {
-    const ICLTensor               *src_0{ nullptr };
-    const ICLTensor               *src_1{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClSub> op{ nullptr };
+    const ICLTensor               *src_0{nullptr};
+    const ICLTensor               *src_1{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClSub> op{nullptr};
 };
 
-CLArithmeticSubtraction::CLArithmeticSubtraction()
-    : _impl(std::make_unique<Impl>())
+CLArithmeticSubtraction::CLArithmeticSubtraction() : _impl(std::make_unique<Impl>())
 {
 }
-CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default;
+CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&)            = default;
 CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default;
 CLArithmeticSubtraction::~CLArithmeticSubtraction()                                     = default;
 
-void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticSubtraction::configure(const ICLTensor           *input1,
+                                        const ICLTensor           *input2,
+                                        ICLTensor                 *output,
+                                        ConvertPolicy              policy,
+                                        const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
 }
 
-void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+void CLArithmeticSubtraction::configure(const CLCompileContext    &compile_context,
+                                        const ICLTensor           *input1,
+                                        const ICLTensor           *input2,
+                                        ICLTensor                 *output,
+                                        ConvertPolicy              policy,
                                         const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
@@ -111,7 +126,11 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
 }
 
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CLArithmeticSubtraction::validate(const ITensorInfo         *input1,
+                                         const ITensorInfo         *input2,
+                                         const ITensorInfo         *output,
+                                         ConvertPolicy              policy,
+                                         const ActivationLayerInfo &act_info)
 {
     return opencl::ClSub::validate(input1, input2, output, policy, act_info);
 }
@@ -128,26 +147,32 @@
 
 struct CLArithmeticDivision::Impl
 {
-    const ICLTensor                               *src_0{ nullptr };
-    const ICLTensor                               *src_1{ nullptr };
-    ICLTensor                                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClElementwiseDivision> op{ nullptr };
+    const ICLTensor                               *src_0{nullptr};
+    const ICLTensor                               *src_1{nullptr};
+    ICLTensor                                     *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwiseDivision> op{nullptr};
 };
 
-CLArithmeticDivision::CLArithmeticDivision()
-    : _impl(std::make_unique<Impl>())
+CLArithmeticDivision::CLArithmeticDivision() : _impl(std::make_unique<Impl>())
 {
 }
-CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default;
+CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&)            = default;
 CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default;
 CLArithmeticDivision::~CLArithmeticDivision()                                  = default;
 
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(ICLTensor                 *input1,
+                                     ICLTensor                 *input2,
+                                     ICLTensor                 *output,
+                                     const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLArithmeticDivision::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(const CLCompileContext    &compile_context,
+                                     const ICLTensor           *input1,
+                                     const ICLTensor           *input2,
+                                     ICLTensor                 *output,
+                                     const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
@@ -156,7 +181,10 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLArithmeticDivision::validate(const ITensorInfo         *input1,
+                                      const ITensorInfo         *input2,
+                                      const ITensorInfo         *output,
+                                      const ActivationLayerInfo &act_info)
 {
     return opencl::ClElementwiseDivision::validate(input1, input2, output, act_info);
 }
@@ -173,26 +201,32 @@
 
 struct CLElementwiseMax::Impl
 {
-    const ICLTensor                          *src_0{ nullptr };
-    const ICLTensor                          *src_1{ nullptr };
-    ICLTensor                                *dst{ nullptr };
-    std::unique_ptr<opencl::ClElementwiseMax> op{ nullptr };
+    const ICLTensor                          *src_0{nullptr};
+    const ICLTensor                          *src_1{nullptr};
+    ICLTensor                                *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwiseMax> op{nullptr};
 };
 
-CLElementwiseMax::CLElementwiseMax()
-    : _impl(std::make_unique<Impl>())
+CLElementwiseMax::CLElementwiseMax() : _impl(std::make_unique<Impl>())
 {
 }
-CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default;
+CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&)            = default;
 CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default;
 CLElementwiseMax::~CLElementwiseMax()                              = default;
 
-void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::configure(ICLTensor                 *input1,
+                                 ICLTensor                 *input2,
+                                 ICLTensor                 *output,
+                                 const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::configure(const CLCompileContext    &compile_context,
+                                 ICLTensor                 *input1,
+                                 ICLTensor                 *input2,
+                                 ICLTensor                 *output,
+                                 const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
@@ -201,7 +235,10 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseMax::validate(const ITensorInfo         *input1,
+                                  const ITensorInfo         *input2,
+                                  const ITensorInfo         *output,
+                                  const ActivationLayerInfo &act_info)
 {
     return opencl::ClElementwiseMax::validate(input1, input2, output, act_info);
 }
@@ -218,26 +255,32 @@
 
 struct CLElementwiseMin::Impl
 {
-    const ICLTensor                          *src_0{ nullptr };
-    const ICLTensor                          *src_1{ nullptr };
-    ICLTensor                                *dst{ nullptr };
-    std::unique_ptr<opencl::ClElementwiseMin> op{ nullptr };
+    const ICLTensor                          *src_0{nullptr};
+    const ICLTensor                          *src_1{nullptr};
+    ICLTensor                                *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwiseMin> op{nullptr};
 };
 
-CLElementwiseMin::CLElementwiseMin()
-    : _impl(std::make_unique<Impl>())
+CLElementwiseMin::CLElementwiseMin() : _impl(std::make_unique<Impl>())
 {
 }
-CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default;
+CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&)            = default;
 CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default;
 CLElementwiseMin::~CLElementwiseMin()                              = default;
 
-void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMin::configure(ICLTensor                 *input1,
+                                 ICLTensor                 *input2,
+                                 ICLTensor                 *output,
+                                 const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMin::configure(const CLCompileContext    &compile_context,
+                                 ICLTensor                 *input1,
+                                 ICLTensor                 *input2,
+                                 ICLTensor                 *output,
+                                 const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
@@ -246,7 +289,10 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseMin::validate(const ITensorInfo         *input1,
+                                  const ITensorInfo         *input2,
+                                  const ITensorInfo         *output,
+                                  const ActivationLayerInfo &act_info)
 {
     return opencl::ClElementwiseMin::validate(input1, input2, output, act_info);
 }
@@ -263,26 +309,32 @@
 
 struct CLElementwiseSquaredDiff::Impl
 {
-    const ICLTensor                                  *src_0{ nullptr };
-    const ICLTensor                                  *src_1{ nullptr };
-    ICLTensor                                        *dst{ nullptr };
-    std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{ nullptr };
+    const ICLTensor                                  *src_0{nullptr};
+    const ICLTensor                                  *src_1{nullptr};
+    ICLTensor                                        *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{nullptr};
 };
 
-CLElementwiseSquaredDiff::CLElementwiseSquaredDiff()
-    : _impl(std::make_unique<Impl>())
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() : _impl(std::make_unique<Impl>())
 {
 }
-CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default;
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&)            = default;
 CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default;
 CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff()                                      = default;
 
-void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseSquaredDiff::configure(ICLTensor                 *input1,
+                                         ICLTensor                 *input2,
+                                         ICLTensor                 *output,
+                                         const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseSquaredDiff::configure(const CLCompileContext    &compile_context,
+                                         ICLTensor                 *input1,
+                                         ICLTensor                 *input2,
+                                         ICLTensor                 *output,
+                                         const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
@@ -291,7 +343,10 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo         *input1,
+                                          const ITensorInfo         *input2,
+                                          const ITensorInfo         *output,
+                                          const ActivationLayerInfo &act_info)
 {
     return opencl::ClElementwiseSquaredDiff::validate(input1, input2, output, act_info);
 }
@@ -308,26 +363,32 @@
 
 struct CLElementwisePower::Impl
 {
-    const ICLTensor                            *src_0{ nullptr };
-    const ICLTensor                            *src_1{ nullptr };
-    ICLTensor                                  *dst{ nullptr };
-    std::unique_ptr<opencl::ClElementwisePower> op{ nullptr };
+    const ICLTensor                            *src_0{nullptr};
+    const ICLTensor                            *src_1{nullptr};
+    ICLTensor                                  *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwisePower> op{nullptr};
 };
 
-CLElementwisePower::CLElementwisePower()
-    : _impl(std::make_unique<Impl>())
+CLElementwisePower::CLElementwisePower() : _impl(std::make_unique<Impl>())
 {
 }
-CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default;
+CLElementwisePower::CLElementwisePower(CLElementwisePower &&)            = default;
 CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default;
 CLElementwisePower::~CLElementwisePower()                                = default;
 
-void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwisePower::configure(ICLTensor                 *input1,
+                                   ICLTensor                 *input2,
+                                   ICLTensor                 *output,
+                                   const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwisePower::configure(const CLCompileContext    &compile_context,
+                                   ICLTensor                 *input1,
+                                   ICLTensor                 *input2,
+                                   ICLTensor                 *output,
+                                   const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
@@ -336,7 +397,10 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwisePower::validate(const ITensorInfo         *input1,
+                                    const ITensorInfo         *input2,
+                                    const ITensorInfo         *output,
+                                    const ActivationLayerInfo &act_info)
 {
     return opencl::ClElementwisePower::validate(input1, input2, output, act_info);
 }
diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
index 9dcd2d1..3043c26 100644
--- a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClElementwiseUnary.h"
 
@@ -32,17 +33,16 @@
 {
 struct CLRsqrtLayer::Impl
 {
-    const ICLTensor                 *src{ nullptr };
-    ICLTensor                       *dst{ nullptr };
-    std::unique_ptr<opencl::ClRsqrt> op{ nullptr };
+    const ICLTensor                 *src{nullptr};
+    ICLTensor                       *dst{nullptr};
+    std::unique_ptr<opencl::ClRsqrt> op{nullptr};
 };
 
-CLRsqrtLayer::CLRsqrtLayer()
-    : _impl(std::make_unique<Impl>())
+CLRsqrtLayer::CLRsqrtLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default;
+CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&)            = default;
 CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = default;
 CLRsqrtLayer::~CLRsqrtLayer()                          = default;
 
@@ -74,17 +74,16 @@
 
 struct CLExpLayer::Impl
 {
-    const ICLTensor               *src{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClExp> op{ nullptr };
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClExp> op{nullptr};
 };
 
-CLExpLayer::CLExpLayer()
-    : _impl(std::make_unique<Impl>())
+CLExpLayer::CLExpLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLExpLayer::CLExpLayer(CLExpLayer &&) = default;
+CLExpLayer::CLExpLayer(CLExpLayer &&)            = default;
 CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default;
 CLExpLayer::~CLExpLayer()                        = default;
 
@@ -116,17 +115,16 @@
 
 struct CLNegLayer::Impl
 {
-    const ICLTensor               *src{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClNeg> op{ nullptr };
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClNeg> op{nullptr};
 };
 
-CLNegLayer::CLNegLayer()
-    : _impl(std::make_unique<Impl>())
+CLNegLayer::CLNegLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLNegLayer::CLNegLayer(CLNegLayer &&) = default;
+CLNegLayer::CLNegLayer(CLNegLayer &&)            = default;
 CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default;
 CLNegLayer::~CLNegLayer()                        = default;
 
@@ -157,17 +155,16 @@
 
 struct CLSinLayer::Impl
 {
-    const ICLTensor               *src{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClSin> op{ nullptr };
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClSin> op{nullptr};
 };
 
-CLSinLayer::CLSinLayer()
-    : _impl(std::make_unique<Impl>())
+CLSinLayer::CLSinLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLSinLayer::CLSinLayer(CLSinLayer &&) = default;
+CLSinLayer::CLSinLayer(CLSinLayer &&)            = default;
 CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default;
 CLSinLayer::~CLSinLayer()                        = default;
 
@@ -198,17 +195,16 @@
 
 struct CLAbsLayer::Impl
 {
-    const ICLTensor               *src{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClAbs> op{ nullptr };
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClAbs> op{nullptr};
 };
 
-CLAbsLayer::CLAbsLayer()
-    : _impl(std::make_unique<Impl>())
+CLAbsLayer::CLAbsLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default;
+CLAbsLayer::CLAbsLayer(CLAbsLayer &&)            = default;
 CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default;
 CLAbsLayer::~CLAbsLayer()                        = default;
 
@@ -239,17 +235,16 @@
 
 struct CLLogLayer::Impl
 {
-    const ICLTensor               *src{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClLog> op{ nullptr };
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClLog> op{nullptr};
 };
 
-CLLogLayer::CLLogLayer()
-    : _impl(std::make_unique<Impl>())
+CLLogLayer::CLLogLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLLogLayer::CLLogLayer(CLLogLayer &&) = default;
+CLLogLayer::CLLogLayer(CLLogLayer &&)            = default;
 CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default;
 CLLogLayer::~CLLogLayer()                        = default;
 
@@ -280,17 +275,16 @@
 
 struct CLRoundLayer::Impl
 {
-    const ICLTensor                 *src{ nullptr };
-    ICLTensor                       *dst{ nullptr };
-    std::unique_ptr<opencl::ClRound> op{ nullptr };
+    const ICLTensor                 *src{nullptr};
+    ICLTensor                       *dst{nullptr};
+    std::unique_ptr<opencl::ClRound> op{nullptr};
 };
 
-CLRoundLayer::CLRoundLayer()
-    : _impl(std::make_unique<Impl>())
+CLRoundLayer::CLRoundLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default;
+CLRoundLayer::CLRoundLayer(CLRoundLayer &&)            = default;
 CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default;
 CLRoundLayer::~CLRoundLayer()                          = default;
 
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
index bd0966b..48e9ae8 100644
--- a/src/runtime/CL/functions/CLFFT1D.cpp
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -26,13 +26,13 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
 #include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
 #include "src/core/CL/kernels/CLFFTScaleKernel.h"
 #include "src/core/utils/helpers/fft.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
@@ -54,7 +54,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
 }
 
-void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+void CLFFT1D::configure(const CLCompileContext &compile_context,
+                        const ICLTensor        *input,
+                        ICLTensor              *output,
+                        const FFT1DInfo        &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
@@ -77,13 +80,14 @@
     TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
     _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
     _memory_group.manage(&_digit_reversed_input);
-    _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+    _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices,
+                                     digit_reverse_config);
 
     // Create and configure FFT kernels
     unsigned int Nx = 1;
     _num_ffts       = decomposed_vector.size();
     _fft_kernels.reserve(_num_ffts);
-    for(unsigned int i = 0; i < _num_ffts; ++i)
+    for (unsigned int i = 0; i < _num_ffts; ++i)
     {
         const unsigned int radix_for_stage = decomposed_vector.at(i);
 
@@ -93,18 +97,20 @@
         fft_kernel_info.Nx             = Nx;
         fft_kernel_info.is_first_stage = (i == 0);
         _fft_kernels.emplace_back(std::make_unique<CLFFTRadixStageKernel>());
-        _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+        _fft_kernels.back()->configure(compile_context, &_digit_reversed_input,
+                                       ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
 
         Nx *= radix_for_stage;
     }
 
     // Configure scale kernel
-    if(_run_scale)
+    if (_run_scale)
     {
         FFTScaleKernelInfo scale_config;
         scale_config.scale     = static_cast<float>(N);
         scale_config.conjugate = config.direction == FFTDirection::Inverse;
-        is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config);
+        is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config)
+               : _scale_kernel->configure(output, nullptr, scale_config);
     }
 
     // Allocate tensors
@@ -123,7 +129,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
 
     // Check if FFT is decomposable
     const auto         supported_radix   = CLFFTRadixStageKernel::supported_radix();
@@ -132,7 +138,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
@@ -151,13 +157,13 @@
     CLScheduler::get().enqueue(*_digit_reverse_kernel, false);
 
     // Run radix kernels
-    for(unsigned int i = 0; i < _num_ffts; ++i)
+    for (unsigned int i = 0; i < _num_ffts; ++i)
     {
         CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
     }
 
     // Run output scaling
-    if(_run_scale)
+    if (_run_scale)
     {
         CLScheduler::get().enqueue(*_scale_kernel, true);
     }
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
index 94fc411..3857046 100644
--- a/src/runtime/CL/functions/CLFFT2D.cpp
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -26,16 +26,19 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
 #include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
 #include "src/core/CL/kernels/CLFFTScaleKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+    : _memory_group(memory_manager),
+      _first_pass_func(memory_manager),
+      _second_pass_func(memory_manager),
+      _first_pass_tensor()
 {
 }
 
@@ -46,7 +49,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
 }
 
-void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+void CLFFT2D::configure(const CLCompileContext &compile_context,
+                        const ICLTensor        *input,
+                        ICLTensor              *output,
+                        const FFT2DInfo        &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
@@ -88,7 +94,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config));
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
index d12e2de..3894b10 100644
--- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -25,10 +25,12 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
 #include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
 #include "src/core/CL/kernels/CLFFTScaleKernel.h"
@@ -38,8 +40,6 @@
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/utils/helpers/fft.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace
@@ -50,11 +50,11 @@
 
     int  pad           = 0;
     bool is_decomposed = false;
-    while(!is_decomposed)
+    while (!is_decomposed)
     {
         const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
         is_decomposed                = !decomposed_vector.empty();
-        if(!is_decomposed)
+        if (!is_decomposed)
         {
             ++pad;
         }
@@ -104,17 +104,31 @@
 {
 }
 
-void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                      const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CLFFTConvolutionLayer::configure(ICLTensor                 *input,
+                                      const ICLTensor           *weights,
+                                      const ICLTensor           *biases,
+                                      ICLTensor                 *output,
+                                      const PadStrideInfo       &conv_info,
+                                      const ActivationLayerInfo &act_info,
+                                      bool                       enable_fast_math)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info,
+              enable_fast_math);
 }
 
-void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                      const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CLFFTConvolutionLayer::configure(const CLCompileContext    &compile_context,
+                                      ICLTensor                 *input,
+                                      const ICLTensor           *weights,
+                                      const ICLTensor           *biases,
+                                      ICLTensor                 *output,
+                                      const PadStrideInfo       &conv_info,
+                                      const ActivationLayerInfo &act_info,
+                                      bool                       enable_fast_math)
 {
     ARM_COMPUTE_UNUSED(enable_fast_math);
-    ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math));
+    ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(),
+                                                               biases != nullptr ? biases->info() : nullptr,
+                                                               output->info(), conv_info, act_info, enable_fast_math));
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math);
 
     _original_weights = weights;
@@ -124,21 +138,24 @@
     _has_bias = biases != nullptr;
 
     // Get indices for the width and height
-    const size_t idx_width  = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height =
+        get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
 
     // Input shape, kernel size and output tile
-    const Size2D input_dims  = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
-    const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
-    const Size2D pad_valid   = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
-                                      pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+    const Size2D input_dims =
+        Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+    const Size2D kernel_size =
+        Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+    const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+                                    pad_decomposable(input_dims.y() + kernel_size.y() - 1));
     // Tensors to use
     ICLTensor       *input_to_use   = input;
     const ICLTensor *weights_to_use = weights;
     ICLTensor       *output_to_use  = _has_bias ? &_bias_output : output;
 
     // Permute bias
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         _permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
         _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
@@ -146,7 +163,7 @@
 
     // Permute input if needed
     _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _memory_group.manage(&_permuted_input);
         // Configure the function to transform the input tensor from NHWC -> NCHW
@@ -167,7 +184,7 @@
     _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis);
 
     // Pad weights
-    const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+    const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}};
     _pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w);
 
     // Transform weights
@@ -175,10 +192,10 @@
     _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo());
 
     // Pad input
-    const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+    const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}};
     _memory_group.manage(&_padded_input);
     _pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in);
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permuted_input.allocator()->allocate();
     }
@@ -202,7 +219,8 @@
     _memory_group.manage(&_itransformed_output);
     FFT2DInfo itranform_info;
     itranform_info.direction = FFTDirection::Inverse;
-    _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+    _itransformed_output.allocator()->init(
+        _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
     _itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info);
     _output_reduced.allocator()->allocate();
 
@@ -214,25 +232,28 @@
     // Extract correct region
     const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
     const int start_top  = kernel_size.y() - conv_info.pad_top() - 1;
-    const int end_right  = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
-    const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
-    if(_has_bias)
+    const int end_right =
+        _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+    const int end_botton =
+        _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+    if (_has_bias)
     {
         _memory_group.manage(&_bias_output);
     }
-    else if(_needs_permute)
+    else if (_needs_permute)
     {
         output_to_use = &_permuted_output;
         _memory_group.manage(&_permuted_output);
     }
-    _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+    _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use,
+                                   Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
     _itransformed_output.allocator()->allocate();
 
     // Add bias
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         output_to_use = output;
-        if(_needs_permute)
+        if (_needs_permute)
         {
             output_to_use = &_permuted_output;
             _memory_group.manage(&_permuted_output);
@@ -243,7 +264,7 @@
     }
 
     // Permute output
-    if(_needs_permute)
+    if (_needs_permute)
     {
         // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
         _permuted_output.info()->set_data_layout(DataLayout::NCHW);
@@ -255,7 +276,7 @@
 
     // Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activation_layer_func.configure(compile_context, output, nullptr, act_info);
     }
@@ -269,8 +290,13 @@
     _flip_axis.unmap();
 }
 
-Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                       const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CLFFTConvolutionLayer::validate(const ITensorInfo         *input,
+                                       const ITensorInfo         *weights,
+                                       const ITensorInfo         *biases,
+                                       const ITensorInfo         *output,
+                                       const PadStrideInfo       &conv_info,
+                                       const ActivationLayerInfo &act_info,
+                                       bool                       enable_fast_math)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON((input->data_type() == DataType::F16) && !enable_fast_math);
@@ -287,24 +313,27 @@
     const auto strides = conv_info.stride();
     ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) ||
+                                conv_info.pad_right() != (kernel_size.x() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) ||
+                                conv_info.pad_bottom() != (kernel_size.y() / 2));
 
     // Validate biases
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
         ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[3] != biases->tensor_shape().x());
     }
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) ||
+                                    (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
 
         // Validate Activation Layer
-        if(act_info.enabled())
+        if (act_info.enabled())
         {
             ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
         }
@@ -320,7 +349,7 @@
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Transform input
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_input_func.run();
     }
@@ -336,17 +365,17 @@
     _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer());
     _extract_output_func.run();
     // Add bias
-    if(_has_bias)
+    if (_has_bias)
     {
         _bias_add_func.run();
     }
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_output_func.run();
     }
 
     // Run activation layer
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activation_layer_func.run();
     }
@@ -354,10 +383,10 @@
 
 void CLFFTConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         // Permute bias to NCHW
-        if(_original_bias != nullptr)
+        if (_original_bias != nullptr)
         {
             _permuted_bias.allocator()->allocate();
             _permute_bias_func.run();
@@ -366,7 +395,7 @@
 
         const ICLTensor *cur_weights = _original_weights;
         // Permute weights
-        if(_needs_permute)
+        if (_needs_permute)
         {
             ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
 
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index 6019a84..9bd96a9 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClFill.h"
 
@@ -36,16 +37,15 @@
 {
 struct CLFill::Impl
 {
-    const ICLTensor                *src{ nullptr };
-    ICLTensor                      *dst{ nullptr };
-    std::unique_ptr<opencl::ClFill> op{ nullptr };
+    const ICLTensor                *src{nullptr};
+    ICLTensor                      *dst{nullptr};
+    std::unique_ptr<opencl::ClFill> op{nullptr};
 };
 
-CLFill::CLFill()
-    : _impl(std::make_unique<Impl>())
+CLFill::CLFill() : _impl(std::make_unique<Impl>())
 {
 }
-CLFill::CLFill(CLFill &&) = default;
+CLFill::CLFill(CLFill &&)            = default;
 CLFill &CLFill::operator=(CLFill &&) = default;
 CLFill::~CLFill()                    = default;
 
@@ -54,7 +54,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, dst_window);
 }
 
-void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *dst_window)
+void CLFill::configure(const CLCompileContext &compile_context,
+                       ICLTensor              *tensor,
+                       const PixelValue       &constant_value,
+                       Window                 *dst_window)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
 
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
index 32fc375..ba1b537 100644
--- a/src/runtime/CL/functions/CLFlattenLayer.cpp
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -26,8 +26,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/gpu/cl/operators/ClFlatten.h"
@@ -36,16 +37,15 @@
 {
 struct CLFlattenLayer::Impl
 {
-    const ICLTensor                   *src{ nullptr };
-    ICLTensor                         *dst{ nullptr };
-    std::unique_ptr<opencl::ClFlatten> op{ nullptr };
+    const ICLTensor                   *src{nullptr};
+    ICLTensor                         *dst{nullptr};
+    std::unique_ptr<opencl::ClFlatten> op{nullptr};
 };
 
-CLFlattenLayer::CLFlattenLayer()
-    : _impl(std::make_unique<Impl>())
+CLFlattenLayer::CLFlattenLayer() : _impl(std::make_unique<Impl>())
 {
 }
-CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default;
+CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&)            = default;
 CLFlattenLayer &CLFlattenLayer::operator=(CLFlattenLayer &&) = default;
 CLFlattenLayer::~CLFlattenLayer()                            = default;
 
@@ -59,7 +59,8 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     _impl->src = input;
     _impl->dst = output;
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info())));
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(
+                                            misc::shape_calculator::compute_flatten_shape(input->info())));
 
     _impl->op = std::make_unique<opencl::ClFlatten>();
     _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info());
@@ -68,9 +69,10 @@
 Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
+        const TensorInfo tensor_info_output =
+            input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
     }
     return opencl::ClFlatten::validate(input, output);
@@ -83,4 +85,4 @@
     pack.add_tensor(TensorType::ACL_DST, _impl->dst);
     _impl->op->run(pack);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
index 8739e18..4322219 100644
--- a/src/runtime/CL/functions/CLFloor.cpp
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClFloor.h"
 
@@ -34,16 +35,15 @@
 {
 struct CLFloor::Impl
 {
-    const ICLTensor                 *src{ nullptr };
-    ICLTensor                       *dst{ nullptr };
-    std::unique_ptr<opencl::ClFloor> op{ nullptr };
+    const ICLTensor                 *src{nullptr};
+    ICLTensor                       *dst{nullptr};
+    std::unique_ptr<opencl::ClFloor> op{nullptr};
 };
 
-CLFloor::CLFloor()
-    : _impl(std::make_unique<Impl>())
+CLFloor::CLFloor() : _impl(std::make_unique<Impl>())
 {
 }
-CLFloor::CLFloor(CLFloor &&) = default;
+CLFloor::CLFloor(CLFloor &&)            = default;
 CLFloor &CLFloor::operator=(CLFloor &&) = default;
 CLFloor::~CLFloor()                     = default;
 
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 1c162db..b30f9e7 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClFullyConnected.h"
 
@@ -35,21 +36,22 @@
 struct CLFullyConnectedLayer::Impl
 {
     MemoryGroup      memory_group{};
-    IWeightsManager *weights_manager{ nullptr };
+    IWeightsManager *weights_manager{nullptr};
 
-    std::unique_ptr<opencl::ClFullyConnected> op{ nullptr };
+    std::unique_ptr<opencl::ClFullyConnected> op{nullptr};
 
-    const ITensor *original_weights{ nullptr };
+    const ITensor *original_weights{nullptr};
 
     ITensorPack                      run_pack{};
     WorkspaceData<CLTensor>          workspace{};
     experimental::MemoryRequirements aux_mem_req{};
 
-    bool is_prepared{ false };
-    bool dynamic_weights{ false };
+    bool is_prepared{false};
+    bool dynamic_weights{false};
 };
 
-CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager,
+                                             IWeightsManager                *weights_manager)
     : _impl(std::make_unique<Impl>())
 {
     _impl->memory_group    = MemoryGroup(std::move(memory_manager));
@@ -58,39 +60,45 @@
 
 CLFullyConnectedLayer::~CLFullyConnectedLayer() = default;
 
-void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLFullyConnectedLayer::configure(const ICLTensor        *input,
+                                      const ICLTensor        *weights,
+                                      const ICLTensor        *biases,
+                                      ICLTensor              *output,
                                       FullyConnectedLayerInfo fc_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info);
 }
 
-void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context,
+                                      const ICLTensor        *input,
+                                      const ICLTensor        *weights,
+                                      const ICLTensor        *biases,
+                                      ICLTensor              *output,
                                       FullyConnectedLayerInfo fc_info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(),
-                                                               weights->info(),
-                                                               biases != nullptr ? biases->info() : nullptr,
-                                                               output->info(),
-                                                               fc_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(
+        input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info));
 
     _impl->op               = std::make_unique<opencl::ClFullyConnected>();
     _impl->original_weights = weights;
     _impl->is_prepared      = fc_info.retain_internal_weights;
 
-    _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info);
+    _impl->op->configure(compile_context, input->info(), weights->info(),
+                         (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info);
 
-    if(_impl->weights_manager != nullptr)
+    if (_impl->weights_manager != nullptr)
     {
         _impl->weights_manager->manage(_impl->original_weights);
     }
 
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->aux_mem_req = _impl->op->workspace();
-        _impl->run_pack    = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
-        _impl->workspace   = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
+        _impl->run_pack    = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+        _impl->workspace =
+            manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
     }
     else
     {
@@ -98,14 +106,14 @@
         _impl->run_pack.add_tensor(ACL_DST, output);
     }
 
-    _impl->dynamic_weights =
-        !weights->info()->are_values_constant() &&
-        fc_info.transpose_weights &&
-        !fc_info.are_weights_reshaped &&
-        !fc_info.retain_internal_weights;
+    _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights &&
+                             !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights;
 }
 
-Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status CLFullyConnectedLayer::validate(const ITensorInfo      *input,
+                                       const ITensorInfo      *weights,
+                                       const ITensorInfo      *biases,
+                                       const ITensorInfo      *output,
                                        FullyConnectedLayerInfo fc_info)
 {
     return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info);
@@ -113,7 +121,7 @@
 
 void CLFullyConnectedLayer::run()
 {
-    if(!_impl->dynamic_weights)
+    if (!_impl->dynamic_weights)
     {
         prepare();
     }
@@ -124,7 +132,7 @@
 
 void CLFullyConnectedLayer::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->run_pack);
 
@@ -133,13 +141,13 @@
         _impl->is_prepared = true;
 
         // Handle weights managed infrastructure
-        if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
+        if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
         {
             // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare
             // This is for cases where multiple functions share the same b (weights)
             // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference
             const ITensor *original_b = _impl->original_weights;
-            if(!original_b->is_used())
+            if (!original_b->is_used())
             {
                 _impl->weights_manager->pre_mark_as_unused(original_b);
             }
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
index 7379e9d..e4fbf78 100644
--- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
+++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
@@ -28,9 +28,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
 
 namespace arm_compute
 {
@@ -41,29 +41,52 @@
 
 CLFuseBatchNormalization::~CLFuseBatchNormalization() = default;
 
-void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
-                                         ICLTensor *fused_weights, ICLTensor *fused_bias,
-                                         const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
-                                         float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalization::configure(const ICLTensor           *input_weights,
+                                         const ICLTensor           *bn_mean,
+                                         const ICLTensor           *bn_var,
+                                         ICLTensor                 *fused_weights,
+                                         ICLTensor                 *fused_bias,
+                                         const ICLTensor           *input_bias,
+                                         const ICLTensor           *bn_beta,
+                                         const ICLTensor           *bn_gamma,
+                                         float                      epsilon,
+                                         FuseBatchNormalizationType fbn_type)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+              input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
-void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
-                                         ICLTensor *fused_weights, ICLTensor *fused_bias,
-                                         const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
-                                         float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalization::configure(const CLCompileContext    &compile_context,
+                                         const ICLTensor           *input_weights,
+                                         const ICLTensor           *bn_mean,
+                                         const ICLTensor           *bn_var,
+                                         ICLTensor                 *fused_weights,
+                                         ICLTensor                 *fused_bias,
+                                         const ICLTensor           *input_bias,
+                                         const ICLTensor           *bn_beta,
+                                         const ICLTensor           *bn_gamma,
+                                         float                      epsilon,
+                                         FuseBatchNormalizationType fbn_type)
 {
-    ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
-    _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+                           epsilon, fbn_type);
+    _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias,
+                               bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
-Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                                          const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                                          const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
-                                          float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalization::validate(const ITensorInfo         *input_weights,
+                                          const ITensorInfo         *bn_mean,
+                                          const ITensorInfo         *bn_var,
+                                          const ITensorInfo         *fused_weights,
+                                          const ITensorInfo         *fused_bias,
+                                          const ITensorInfo         *input_bias,
+                                          const ITensorInfo         *bn_beta,
+                                          const ITensorInfo         *bn_gamma,
+                                          float                      epsilon,
+                                          FuseBatchNormalizationType fbn_type)
 {
-    return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+                                                    input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
 void CLFuseBatchNormalization::run()
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 427ea51..871a1d6 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClGemm.h"
 
@@ -40,15 +41,15 @@
 
 struct CLGEMM::Impl
 {
-    const ICLTensor              *b{ nullptr };
-    std::unique_ptr<OperatorType> op{ nullptr };
+    const ICLTensor              *b{nullptr};
+    std::unique_ptr<OperatorType> op{nullptr};
     MemoryGroup                   memory_group{};
-    IWeightsManager              *weights_manager{ nullptr };
+    IWeightsManager              *weights_manager{nullptr};
     ITensorPack                   run_pack{};
     ITensorPack                   prep_pack{};
     MemoryRequirements            aux_mem_req{};
     WorkspaceData<CLTensor>       workspace_tensors{};
-    bool                          is_prepared{ false };
+    bool                          is_prepared{false};
 };
 
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
@@ -60,12 +61,25 @@
 
 CLGEMM::~CLGEMM() = default;
 
-void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure(const ICLTensor *a,
+                       const ICLTensor *b,
+                       const ICLTensor *c,
+                       ICLTensor       *output,
+                       float            alpha,
+                       float            beta,
+                       const GEMMInfo  &gemm_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info);
 }
 
-void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure(const CLCompileContext &compile_context,
+                       const ICLTensor        *a,
+                       const ICLTensor        *b,
+                       const ICLTensor        *c,
+                       ICLTensor              *output,
+                       float                   alpha,
+                       float                   beta,
+                       const GEMMInfo         &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
 
@@ -73,25 +87,33 @@
     _impl->op          = std::make_unique<OperatorType>();
     _impl->is_prepared = gemm_info.retain_internal_weights();
 
-    _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info);
+    _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(),
+                         alpha, beta, gemm_info);
     _impl->aux_mem_req = _impl->op->workspace();
 
-    // Manage/allocate auxilairy tensors
+    // Manage/allocate auxiliary tensors
-    if(_impl->is_prepared)
+    if (_impl->is_prepared)
     {
         _impl->run_pack.add_const_tensor(ACL_SRC_0, a);
         _impl->run_pack.add_tensor(ACL_DST, output);
     }
     else
     {
-        _impl->run_pack  = { { ACL_SRC_0, a }, { ACL_SRC_2, c }, { ACL_DST, output } };
-        _impl->prep_pack = { { ACL_SRC_1, _impl->b } };
+        _impl->run_pack  = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}};
+        _impl->prep_pack = {{ACL_SRC_1, _impl->b}};
 
-        _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+        _impl->workspace_tensors =
+            manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
     }
 }
 
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status CLGEMM::validate(const ITensorInfo *a,
+                        const ITensorInfo *b,
+                        const ITensorInfo *c,
+                        const ITensorInfo *output,
+                        float              alpha,
+                        float              beta,
+                        const GEMMInfo    &gemm_info)
 {
     return OperatorType::validate(a, b, c, output, alpha, beta, gemm_info);
 }
@@ -107,15 +129,15 @@
 
 void CLGEMM::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->prep_pack);
 
-        auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
-                                        _impl->aux_mem_req.end(),
-                                        [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+        auto has_reshape =
+            std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+                         [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
 
-        if(has_reshape != std::end(_impl->aux_mem_req))
+        if (has_reshape != std::end(_impl->aux_mem_req))
         {
             _impl->b->mark_as_unused();
         }
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index c8c18f3..aef7cdd 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -27,10 +27,11 @@
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClGemmConv2d.h"
 #include "support/Cast.h"
@@ -47,18 +48,19 @@
 
 struct CLGEMMConvolutionLayer::Impl
 {
-    const ITensor                        *weights{ nullptr };
-    std::unique_ptr<opencl::ClGemmConv2d> op{ nullptr };
+    const ITensor                        *weights{nullptr};
+    std::unique_ptr<opencl::ClGemmConv2d> op{nullptr};
     ITensorPack                           run_pack{};
     ITensorPack                           prep_pack{};
     MemoryGroup                           memory_group{};
-    IWeightsManager                      *weights_manager{ nullptr };
+    IWeightsManager                      *weights_manager{nullptr};
     MemoryRequirements                    aux_mem_req{};
     WorkspaceData<CLTensor>               workspace_tensors{};
-    bool                                  is_prepared{ false };
+    bool                                  is_prepared{false};
 };
 
-CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager,
+                                               IWeightsManager                *weights_manager)
     : _impl(std::make_unique<Impl>())
 {
     _impl->memory_group    = MemoryGroup(memory_manager);
@@ -67,40 +69,60 @@
 
 CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default;
 
-void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                       const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+void CLGEMMConvolutionLayer::configure(const ICLTensor           *input,
+                                       const ICLTensor           *weights,
+                                       const ICLTensor           *biases,
+                                       ICLTensor                 *output,
+                                       const PadStrideInfo       &conv_info,
+                                       const WeightsInfo         &weights_info,
+                                       const Size2D              &dilation,
+                                       const ActivationLayerInfo &act_info,
+                                       unsigned int               num_groups)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info,
+              dilation, act_info, num_groups);
 }
 
-void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                       const PadStrideInfo &conv_info,
-                                       const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+void CLGEMMConvolutionLayer::configure(const CLCompileContext    &compile_context,
+                                       const ICLTensor           *input,
+                                       const ICLTensor           *weights,
+                                       const ICLTensor           *biases,
+                                       ICLTensor                 *output,
+                                       const PadStrideInfo       &conv_info,
+                                       const WeightsInfo         &weights_info,
+                                       const Size2D              &dilation,
+                                       const ActivationLayerInfo &act_info,
+                                       unsigned int               num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     _impl->weights               = weights;
     _impl->op                    = std::make_unique<opencl::ClGemmConv2d>();
     const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
-    _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
+    _impl->op->configure(compile_context, input->info(), weights->info(),
+                         (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
 
-    _impl->run_pack =
-    {
-        { TensorType::ACL_SRC_0, input },
-        { TensorType::ACL_SRC_1, weights },
-        { TensorType::ACL_SRC_2, biases },
-        { TensorType::ACL_DST, output }
+    _impl->run_pack  = {{TensorType::ACL_SRC_0, input},
+                        {TensorType::ACL_SRC_1, weights},
+                        {TensorType::ACL_SRC_2, biases},
+                        {TensorType::ACL_DST, output}};
+    _impl->prep_pack = {
+        {TensorType::ACL_SRC_1, weights},
+        {TensorType::ACL_SRC_2, biases},
     };
-    _impl->prep_pack =
-    {
-        { TensorType::ACL_SRC_1, weights },
-        { TensorType::ACL_SRC_2, biases },
-    };
-    _impl->aux_mem_req       = _impl->op->workspace();
-    _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+    _impl->aux_mem_req = _impl->op->workspace();
+    _impl->workspace_tensors =
+        manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
 }
 
-Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+Status CLGEMMConvolutionLayer::validate(const ITensorInfo         *input,
+                                        const ITensorInfo         *weights,
+                                        const ITensorInfo         *biases,
+                                        const ITensorInfo         *output,
+                                        const PadStrideInfo       &conv_info,
+                                        const WeightsInfo         &weights_info,
+                                        const Size2D              &dilation,
+                                        const ActivationLayerInfo &act_info,
+                                        unsigned int               num_groups)
 {
     const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
     return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info);
@@ -115,14 +137,14 @@
 
 void CLGEMMConvolutionLayer::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->prep_pack);
-        auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
-                                        _impl->aux_mem_req.end(),
-                                        [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+        auto has_reshape =
+            std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+                         [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
 
-        if(has_reshape != std::end(_impl->aux_mem_req))
+        if (has_reshape != std::end(_impl->aux_mem_req))
         {
             _impl->weights->mark_as_unused();
         }
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 9fc81c1..7d40cf1 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -24,15 +24,15 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
 
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 #include <tuple>
 
@@ -40,12 +40,13 @@
 {
 namespace
 {
-std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
+std::pair<Coordinates, Coordinates>
+compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
 {
     Coordinates start;
     Coordinates end;
 
-    if(is_nchw)
+    if (is_nchw)
     {
         start.set(0, deconv_info.pad_left());
         start.set(1, deconv_info.pad_top());
@@ -63,13 +64,16 @@
         end.set(2, output_info.dimension(2) - deconv_info.pad_bottom());
     }
 
-    return { start, end };
+    return {start, end};
 }
-Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info)
+Status construct_gemmlowp_output_stage(const ITensorInfo       *input,
+                                       const ITensorInfo       *weights,
+                                       const ITensorInfo       *output,
+                                       GEMMLowpOutputStageInfo &output_stage_info)
 {
     const auto data_type = input->data_type();
 
-    if(is_data_type_quantized_asymmetric(data_type))
+    if (is_data_type_quantized_asymmetric(data_type))
     {
         const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
         const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
@@ -78,7 +82,8 @@
         float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
         int   output_multiplier(0);
         int   output_shift(0);
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
 
         output_stage_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
         output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -122,15 +127,21 @@
 
 CLGEMMDeconvolutionLayer::~CLGEMMDeconvolutionLayer() = default;
 
-Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info)
+Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo   *input,
+                                          const ITensorInfo   *weights,
+                                          const ITensorInfo   *bias,
+                                          const ITensorInfo   *output,
+                                          const PadStrideInfo &deconv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
 
     DataLayout data_layout  = input->data_layout();
-    const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+    const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 ||
+                              deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
     const bool is_nchw      = input->data_layout() == DataLayout::NCHW;
     const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
 
@@ -144,21 +155,31 @@
     TensorShape nhwc_weights_shape = weights->tensor_shape();
     TensorShape nhwc_input_shape   = input->tensor_shape();
 
-    if(is_nchw)
+    if (is_nchw)
     {
         permute(nhwc_weights_shape, PermutationVector(2, 0, 1));
         permute(nhwc_input_shape, PermutationVector(2, 0, 1));
 
-        TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW);
+        TensorInfo nhwc_input_info = input->clone()
+                                         ->set_is_resizable(true)
+                                         .reset_padding()
+                                         .set_tensor_shape(nhwc_input_shape)
+                                         .set_data_layout(DataLayout::NCHW);
 
-        TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW);
+        TensorInfo nhwc_weights_info = weights->clone()
+                                           ->set_is_resizable(true)
+                                           .reset_padding()
+                                           .set_tensor_shape(nhwc_weights_shape)
+                                           .set_data_layout(DataLayout::NCHW);
 
         CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1));
         CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1));
     }
 
-    const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
-    const TensorInfo  reshaped_info  = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
+    const TensorShape reshaped_shape =
+        TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
+    const TensorInfo reshaped_info =
+        weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
     ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info));
 
     TensorShape      transposed_shape(reshaped_shape[1], reshaped_shape[0]);
@@ -166,77 +187,95 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info));
 
     TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b),
-                                  input->dimension(idx_w),
-                                  input->dimension(idx_h),
-                                  input->dimension(idx_b));
+                                  input->dimension(idx_w), input->dimension(idx_h), input->dimension(idx_b));
 
     TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true);
     GEMMInfo   gemm_info(false, false, true, input->dimension(idx_h), true);
 
     GEMMLowpOutputStageInfo output_stage_info;
 
-    if(is_quantized)
+    if (is_quantized)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32),
-                                                                           gemm_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(
+            &input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr,
+            &gemm_output_info.set_data_type(DataType::S32), gemm_info));
         ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true),
+                             &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
     }
 
     const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
-    auto                out_dims           = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
-    const TensorShape   deconv_shape       = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
-    TensorInfo          col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
+    auto                out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h),
+                                                                   weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
+    const TensorShape   deconv_shape =
+        misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+    TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
 
-    if(padded_input && is_quantized)
+    if (padded_input && is_quantized)
     {
         const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+            &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(
+            &col2im_output_info, nullptr,
+            &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()),
+                              output, start_end.first, start_end.second));
     }
-    else if(padded_input)
+    else if (padded_input)
     {
         const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+            &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
         ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second));
     }
-    else if(is_quantized)
+    else if (is_quantized)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+            &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
     }
 
     return Status{};
 }
 
-void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info)
+void CLGEMMDeconvolutionLayer::configure(const ICLTensor     *input,
+                                         const ICLTensor     *weights,
+                                         const ICLTensor     *bias,
+                                         ICLTensor           *output,
+                                         const PadStrideInfo &deconv_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info);
 }
 
-void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
-                                         const PadStrideInfo &deconv_info)
+void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+                                         const ICLTensor        *input,
+                                         const ICLTensor        *weights,
+                                         const ICLTensor        *bias,
+                                         ICLTensor              *output,
+                                         const PadStrideInfo    &deconv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(),
-                                                                  weights->info(),
-                                                                  bias != nullptr ? bias->info() : nullptr,
-                                                                  output->info(),
-                                                                  deconv_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(
+        input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info));
     ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info);
 
     _original_weights = weights;
-    _padded_input     = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
-    _is_nchw          = input->info()->data_layout() == DataLayout::NCHW;
-    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _padded_input     = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 ||
+                    deconv_info.pad_top() > 0;
+    _is_nchw      = input->info()->data_layout() == DataLayout::NCHW;
+    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
 
     const ICLTensor *input_to_use   = input;
     const ICLTensor *weights_to_use = weights;
@@ -245,7 +284,7 @@
     // do an outer product in NCHW and then an accumulation through a reduction. This would have two
     // drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction
     // might be slower than GEMM.
-    if(_is_nchw)
+    if (_is_nchw)
     {
         _memory_group.manage(&_permuted_input);
         _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U));
@@ -257,10 +296,11 @@
     }
 
     // Reshape the input weights. The weights will be reshaped only once during the call to prepare()
-    _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0),
-                                                               weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)),
-                                                   1,
-                                                   input->info()->data_type(), weights->info()->quantization_info()));
+    _reshaped_weights.allocator()->init(
+        TensorInfo(TensorShape(weights_to_use->info()->dimension(0), weights_to_use->info()->dimension(1) *
+                                                                         weights_to_use->info()->dimension(2) *
+                                                                         weights_to_use->info()->dimension(3)),
+                   1, input->info()->data_type(), weights->info()->quantization_info()));
 
     _reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights);
     _transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t);
@@ -269,15 +309,17 @@
     GEMMInfo     gemm_info(false, false, true, input->info()->dimension(idx_h), true);
 
     // Configure output stage for asymmetric quantized types
-    if(_is_quantized)
+    if (_is_quantized)
     {
         // gemmlowp adds the offsets (instead of subtracting them). Thus, we need to negate the original
         // and restore them back to make it work properly.
         QuantizationInfo iq_info = input->info()->quantization_info();
         QuantizationInfo wq_info = weights->info()->quantization_info();
 
-        input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
-        _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
+        input_to_use->info()->set_quantization_info(
+            QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
+        _reshaped_weights_t.info()->set_quantization_info(
+            QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
 
         _mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
 
@@ -286,10 +328,11 @@
     }
     else
     {
-        _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
+        _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f,
+                           gemm_info);
     }
 
-    if(_is_nchw)
+    if (_is_nchw)
     {
         _permuted_input.allocator()->allocate();
     }
@@ -298,7 +341,7 @@
     ICLTensor *slice_output          = nullptr;
     ICLTensor *output_stage_output   = nullptr;
 
-    if(_padded_input && _is_quantized)
+    if (_padded_input && _is_quantized)
     {
         _memory_group.manage(&_slice_gemm_input);
         _memory_group.manage(&_gemmlowp_final);
@@ -306,13 +349,13 @@
         output_stage_output   = &_slice_gemm_input;
         slice_output          = output;
     }
-    else if(_padded_input)
+    else if (_padded_input)
     {
         _memory_group.manage(&_slice_gemm_input);
         deconv_reshape_output = &_slice_gemm_input;
         slice_output          = output;
     }
-    else if(_is_quantized)
+    else if (_is_quantized)
     {
         _memory_group.manage(&_gemmlowp_final);
         deconv_reshape_output = &_gemmlowp_final;
@@ -324,21 +367,24 @@
     }
 
     // Configure a Col2Im call to reshape the output of GEMM
-    _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+    _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(),
+                               weights->info(), deconv_info);
     _gemm_output.allocator()->allocate();
 
-    if(_is_quantized)
+    if (_is_quantized)
     {
         GEMMLowpOutputStageInfo output_stage_info;
         construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info);
-        _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info);
+        _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output,
+                                         output_stage_info);
         _gemmlowp_final.allocator()->allocate();
     }
 
     // If the input was padded, the output needs to be sliced.
-    if(_padded_input)
+    if (_padded_input)
     {
-        const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
+        const auto start_end =
+            compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
         _slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second);
         _slice_gemm_input.allocator()->allocate();
     }
@@ -350,12 +396,12 @@
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(_is_nchw)
+    if (_is_nchw)
     {
         _permute_input_to_nhwc.run();
     }
 
-    if(_is_quantized)
+    if (_is_quantized)
     {
         _mm_gemmlowp.run();
     }
@@ -366,12 +412,12 @@
 
     CLScheduler::get().enqueue(*_deconv_reshape, false);
 
-    if(_is_quantized)
+    if (_is_quantized)
     {
         _gemmlowp_output_stage.run();
     }
 
-    if(_padded_input)
+    if (_padded_input)
     {
         _slice_gemm.run();
     }
@@ -379,11 +425,11 @@
 
 void CLGEMMDeconvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
-        if(_is_nchw)
+        if (_is_nchw)
         {
             _permuted_weights.allocator()->allocate();
             _permute_weights_to_nhwc.run();
@@ -392,7 +438,7 @@
         _reshaped_weights.allocator()->allocate();
         _reshape_weights.run();
 
-        if(_is_nchw)
+        if (_is_nchw)
         {
             _permuted_weights.allocator()->free();
         }
@@ -401,7 +447,7 @@
         _transpose_weights.run();
 
         // Prepare gemm
-        if(!_is_quantized)
+        if (!_is_quantized)
         {
             _mm_gemm.prepare();
         }
@@ -411,7 +457,7 @@
         }
 
         // Free resources
-        if(!_reshaped_weights_t.is_used())
+        if (!_reshaped_weights_t.is_used())
         {
             _reshaped_weights_t.allocator()->free();
         }
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index d902947..8bad198 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -31,12 +31,12 @@
 #include "arm_compute/core/Log.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/IMemoryManager.h"
-#include "src/core/helpers/MemoryHelpers.h"
 
+#include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
 
 namespace arm_compute
@@ -46,13 +46,13 @@
 
 struct CLGEMMLowpMatrixMultiplyCore::Impl
 {
-    const ICLTensor              *b{ nullptr };
-    std::unique_ptr<OperatorType> op{ nullptr };
+    const ICLTensor              *b{nullptr};
+    std::unique_ptr<OperatorType> op{nullptr};
     MemoryGroup                   memory_group{};
     ITensorPack                   run_pack{};
     MemoryRequirements            aux_mem_req{};
     WorkspaceData<CLTensor>       workspace_tensors{};
-    bool                          is_prepared{ false };
+    bool                          is_prepared{false};
 };
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
@@ -63,12 +63,18 @@
 
 CLGEMMLowpMatrixMultiplyCore::~CLGEMMLowpMatrixMultiplyCore() = default;
 
-void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
+void CLGEMMLowpMatrixMultiplyCore::configure(
+    const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info);
 }
 
-void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
+void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
+                                             const ICLTensor        *a,
+                                             const ICLTensor        *b,
+                                             const ICLTensor        *c,
+                                             ICLTensor              *output,
+                                             const GEMMInfo         &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
 
@@ -76,23 +82,29 @@
     _impl->op          = std::make_unique<OperatorType>();
     _impl->is_prepared = gemm_info.retain_internal_weights();
 
-    _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info);
+    _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(),
+                         gemm_info);
     _impl->aux_mem_req = _impl->op->workspace();
 
     // Manage/allocate auxilairy tensors
-    if(_impl->is_prepared)
+    if (_impl->is_prepared)
     {
         _impl->run_pack.add_const_tensor(ACL_SRC_0, a);
         _impl->run_pack.add_tensor(ACL_DST, output);
     }
     else
     {
-        _impl->run_pack          = { { ACL_SRC_0, a }, { ACL_SRC_1, _impl->b }, { ACL_SRC_2, c }, { ACL_DST, output } };
-        _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
+        _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}};
+        _impl->workspace_tensors =
+            manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
     }
 }
 
-Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+                                              const ITensorInfo *b,
+                                              const ITensorInfo *c,
+                                              const ITensorInfo *output,
+                                              const GEMMInfo    &gemm_info)
 {
     return OperatorType::validate(a, b, c, output, gemm_info);
 }
@@ -108,7 +120,7 @@
 
 void CLGEMMLowpMatrixMultiplyCore::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->run_pack);
 
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index 6feed0d..3dd8c5f 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -40,27 +40,33 @@
 {
 struct CLGEMMLowpOutputStage::Impl
 {
-    const ICLTensor                               *src{ nullptr };
-    const ICLTensor                               *bias{ nullptr };
-    ICLTensor                                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClGemmLowpOutputStage> op{ nullptr };
+    const ICLTensor                               *src{nullptr};
+    const ICLTensor                               *bias{nullptr};
+    ICLTensor                                     *dst{nullptr};
+    std::unique_ptr<opencl::ClGemmLowpOutputStage> op{nullptr};
     ITensorPack                                    run_pack{};
 };
 
-CLGEMMLowpOutputStage::CLGEMMLowpOutputStage()
-    : _impl(std::make_unique<Impl>())
+CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() : _impl(std::make_unique<Impl>())
 {
 }
-CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default;
+CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&)            = default;
 CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default;
 CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage()                                   = default;
 
-void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
+void CLGEMMLowpOutputStage::configure(const ICLTensor               *input,
+                                      const ICLTensor               *bias,
+                                      ICLTensor                     *output,
+                                      const GEMMLowpOutputStageInfo &info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
 }
 
-void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
+void CLGEMMLowpOutputStage::configure(const CLCompileContext        &compile_context,
+                                      const ICLTensor               *input,
+                                      const ICLTensor               *bias,
+                                      ICLTensor                     *output,
+                                      const GEMMLowpOutputStageInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -69,11 +75,15 @@
     _impl->dst  = output;
 
     _impl->op = std::make_unique<opencl::ClGemmLowpOutputStage>();
-    _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info);
-    _impl->run_pack = { { ACL_SRC, _impl->src }, { ACL_BIAS, _impl->bias }, { ACL_DST, _impl->dst } };
+    _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(),
+                         info);
+    _impl->run_pack = {{ACL_SRC, _impl->src}, {ACL_BIAS, _impl->bias}, {ACL_DST, _impl->dst}};
 }
 
-Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info)
+Status CLGEMMLowpOutputStage::validate(const ITensorInfo             *input,
+                                       const ITensorInfo             *bias,
+                                       const ITensorInfo             *output,
+                                       const GEMMLowpOutputStageInfo &info)
 {
     return opencl::ClGemmLowpOutputStage::validate(input, bias, output, info);
 }
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
index 033c117..2610cb1 100644
--- a/src/runtime/CL/functions/CLGather.cpp
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -24,9 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLGather.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/CL/kernels/CLGatherKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLGatherKernel.h"
 
 namespace arm_compute
 {
@@ -35,7 +35,11 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
 }
 
-void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+void CLGather::configure(const CLCompileContext &compile_context,
+                         const ICLTensor        *input,
+                         const ICLTensor        *indices,
+                         ICLTensor              *output,
+                         int                     axis)
 {
     ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis);
     auto k = std::make_unique<CLGatherKernel>();
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 9cb7d61..b2c1d26 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -27,13 +27,13 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
 #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
 #include "src/core/CL/kernels/CLPadLayerKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -71,48 +71,67 @@
 
 CLGenerateProposalsLayer::~CLGenerateProposalsLayer() = default;
 
-void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals,
+void CLGenerateProposalsLayer::configure(const ICLTensor             *scores,
+                                         const ICLTensor             *deltas,
+                                         const ICLTensor             *anchors,
+                                         ICLTensor                   *proposals,
+                                         ICLTensor                   *scores_out,
+                                         ICLTensor                   *num_valid_proposals,
                                          const GenerateProposalsInfo &info)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
+    configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out,
+              num_valid_proposals, info);
 }
 
-void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals,
-                                         ICLTensor *scores_out,
-                                         ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info)
+void CLGenerateProposalsLayer::configure(const CLCompileContext      &compile_context,
+                                         const ICLTensor             *scores,
+                                         const ICLTensor             *deltas,
+                                         const ICLTensor             *anchors,
+                                         ICLTensor                   *proposals,
+                                         ICLTensor                   *scores_out,
+                                         ICLTensor                   *num_valid_proposals,
+                                         const GenerateProposalsInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
-    ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(),
+                                                                  proposals->info(), scores_out->info(),
+                                                                  num_valid_proposals->info(), info));
     ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
 
     _is_nhwc                        = scores->info()->data_layout() == DataLayout::NHWC;
     const DataType scores_data_type = scores->info()->data_type();
     _is_qasymm8                     = scores_data_type == DataType::QASYMM8;
-    const int    num_anchors        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
-    const int    feat_width         = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
-    const int    feat_height        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
-    const int    total_num_anchors  = num_anchors * feat_width * feat_height;
-    const int    pre_nms_topN       = info.pre_nms_topN();
-    const int    post_nms_topN      = info.post_nms_topN();
-    const size_t values_per_roi     = info.values_per_roi();
+    const int num_anchors           = scores->info()->dimension(
+                  get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+    const int feat_width = scores->info()->dimension(
+        get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+    const int feat_height = scores->info()->dimension(
+        get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+    const int    total_num_anchors = num_anchors * feat_width * feat_height;
+    const int    pre_nms_topN      = info.pre_nms_topN();
+    const int    post_nms_topN     = info.post_nms_topN();
+    const size_t values_per_roi    = info.values_per_roi();
 
     const QuantizationInfo scores_qinfo   = scores->info()->quantization_info();
     const DataType         rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
-    const QuantizationInfo rois_qinfo     = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
+    const QuantizationInfo rois_qinfo =
+        (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
 
     // Compute all the anchors
     _memory_group.manage(&_all_anchors);
-    _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+    _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors,
+                                       ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
 
     const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
-    _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
+    _deltas_flattened.allocator()->init(
+        TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
 
     // Permute and reshape deltas
     _memory_group.manage(&_deltas_flattened);
-    if(!_is_nhwc)
+    if (!_is_nhwc)
     {
         _memory_group.manage(&_deltas_permuted);
-        _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+        _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{2, 0, 1});
         _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened);
         _deltas_permuted.allocator()->allocate();
     }
@@ -126,10 +145,10 @@
 
     // Permute and reshape scores
     _memory_group.manage(&_scores_flattened);
-    if(!_is_nhwc)
+    if (!_is_nhwc)
     {
         _memory_group.manage(&_scores_permuted);
-        _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+        _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{2, 0, 1});
         _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened);
         _scores_permuted.allocator()->allocate();
     }
@@ -140,7 +159,7 @@
 
     CLTensor *anchors_to_use = &_all_anchors;
     CLTensor *deltas_to_use  = &_deltas_flattened;
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
         _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
@@ -163,11 +182,12 @@
     anchors_to_use->allocator()->allocate();
 
     _all_proposals_to_use = &_all_proposals;
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _memory_group.manage(&_all_proposals_quantized);
         // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
-        _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+        _all_proposals_quantized.allocator()->init(
+            TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
         _quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized);
         _all_proposals.allocator()->allocate();
         _all_proposals_to_use = &_all_proposals_quantized;
@@ -183,7 +203,8 @@
 
     // Note that NMS needs outputs preinitialized.
     auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
-    auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
+    auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type,
+                       rois_qinfo);
     auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
 
     // Initialize temporaries (unused) outputs
@@ -195,20 +216,27 @@
     _num_valid_proposals = num_valid_proposals;
 
     _memory_group.manage(&_proposals_4_roi_values);
-    _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
-                       BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()));
+    _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values,
+                       &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
+                       BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f,
+                                       true, min_size_scaled, info.im_width(), info.im_height()));
     _keeps_nms_unused.allocator()->allocate();
     _classes_nms_unused.allocator()->allocate();
     _all_proposals_to_use->allocator()->allocate();
     _scores_flattened.allocator()->allocate();
 
     // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
-    _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+    _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{{1, 0}});
     _proposals_4_roi_values.allocator()->allocate();
 }
 
-Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
-                                          const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+Status CLGenerateProposalsLayer::validate(const ITensorInfo           *scores,
+                                          const ITensorInfo           *deltas,
+                                          const ITensorInfo           *anchors,
+                                          const ITensorInfo           *proposals,
+                                          const ITensorInfo           *scores_out,
+                                          const ITensorInfo           *num_valid_proposals,
+                                          const GenerateProposalsInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -216,9 +244,12 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
 
-    const int num_anchors       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
-    const int feat_width        = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
-    const int feat_height       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
+    const int num_anchors =
+        scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+    const int feat_width =
+        scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+    const int feat_height =
+        scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
     const int num_images        = scores->dimension(3);
     const int total_num_anchors = num_anchors * feat_width * feat_height;
     const int values_per_roi    = info.values_per_roi();
@@ -227,76 +258,101 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
 
-    if(is_qasymm8)
+    if (is_qasymm8)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
         const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
         ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
     }
 
-    TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+    TensorInfo all_anchors_info(
+        anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(
+        anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
 
-    TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
-    TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
-    if(scores->data_layout() == DataLayout::NHWC)
+    TensorInfo deltas_permuted_info =
+        deltas->clone()
+            ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height))
+            .set_is_resizable(true);
+    TensorInfo scores_permuted_info =
+        scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+    if (scores->data_layout() == DataLayout::NHWC)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1}));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1}));
     }
 
-    TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    TensorInfo deltas_flattened_info(
+        deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
     ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
 
-    TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
-    TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    TensorInfo scores_flattened_info(
+        scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+    TensorInfo proposals_4_roi_values(
+        deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
 
     TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
-    TensorInfo  proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
-    proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
-    if(is_qasymm8)
+    TensorInfo  proposals_4_roi_values_quantized(
+         deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16)
+        .set_quantization_info(QuantizationInfo(0.125f, 0));
+    if (is_qasymm8)
     {
-        TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+        TensorInfo all_anchors_f32_info(anchors->clone()
+                                            ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+                                            .set_is_resizable(true)
+                                            .set_data_type(DataType::F32));
         ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info));
 
-        TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+        TensorInfo deltas_flattened_f32_info(deltas->clone()
+                                                 ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+                                                 .set_is_resizable(true)
+                                                 .set_data_type(DataType::F32));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
 
-        TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
-                                                                           BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+        TensorInfo proposals_4_roi_values_f32(deltas->clone()
+                                                  ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+                                                  .set_is_resizable(true)
+                                                  .set_data_type(DataType::F32));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(
+            &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+            BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
         proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
-                                                                           BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+                                                   BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}}));
 
-    if(num_valid_proposals->total_size() > 0)
+    if (num_valid_proposals->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
     }
 
-    if(proposals->total_size() > 0)
+    if (proposals->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
-        if(is_qasymm8)
+        if (is_qasymm8)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
             const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
@@ -309,7 +365,7 @@
         }
     }
 
-    if(scores_out->total_size() > 0)
+    if (scores_out->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
@@ -356,7 +412,7 @@
     CLScheduler::get().enqueue(*_compute_anchors_kernel, false);
 
     // Transpose and reshape the inputs
-    if(!_is_nhwc)
+    if (!_is_nhwc)
     {
         _permute_deltas.run();
         _permute_scores.run();
@@ -364,7 +420,7 @@
     _flatten_deltas.run();
     _flatten_scores.run();
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _dequantize_anchors->run();
         _dequantize_deltas->run();
@@ -373,7 +429,7 @@
     // Build the boxes
     CLScheduler::get().enqueue(*_bounding_box_kernel, false);
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _quantize_all_proposals->run();
     }
diff --git a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
index 90af36a..1a2369c 100644
--- a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
@@ -26,36 +26,45 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "src/gpu/cl/operators/ClIndirectConv2d.h"
 
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClIndirectConv2d.h"
 
 namespace arm_compute
 {
 struct CLIndirectConvolutionLayer::Impl
 {
-    const ICLTensor                          *src{ nullptr };
-    const ICLTensor                          *weights{ nullptr };
-    const ICLTensor                          *biases{ nullptr };
-    ICLTensor                                *dst{ nullptr };
-    std::unique_ptr<opencl::ClIndirectConv2d> op{ nullptr };
+    const ICLTensor                          *src{nullptr};
+    const ICLTensor                          *weights{nullptr};
+    const ICLTensor                          *biases{nullptr};
+    ICLTensor                                *dst{nullptr};
+    std::unique_ptr<opencl::ClIndirectConv2d> op{nullptr};
 };
 
-CLIndirectConvolutionLayer::CLIndirectConvolutionLayer()
-    : _impl(std::make_unique<Impl>())
+CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() : _impl(std::make_unique<Impl>())
 {
 }
-CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default;
+CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&)            = default;
 CLIndirectConvolutionLayer &CLIndirectConvolutionLayer::operator=(CLIndirectConvolutionLayer &&) = default;
 CLIndirectConvolutionLayer::~CLIndirectConvolutionLayer()                                        = default;
 
-void CLIndirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLIndirectConvolutionLayer::configure(ICLTensor                 *input,
+                                           const ICLTensor           *weights,
+                                           const ICLTensor           *biases,
+                                           ICLTensor                 *output,
+                                           const PadStrideInfo       &conv_info,
+                                           const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
 }
 
-void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                           const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLIndirectConvolutionLayer::configure(const CLCompileContext    &compile_context,
+                                           ICLTensor                 *input,
+                                           const ICLTensor           *weights,
+                                           const ICLTensor           *biases,
+                                           ICLTensor                 *output,
+                                           const PadStrideInfo       &conv_info,
+                                           const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info);
@@ -65,10 +74,15 @@
     _impl->biases  = biases;
     _impl->dst     = output;
     _impl->op      = std::make_unique<opencl::ClIndirectConv2d>();
-    _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
+    _impl->op->configure(compile_context, input->info(), weights->info(),
+                         (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
 }
 
-Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status CLIndirectConvolutionLayer::validate(const ITensorInfo         *input,
+                                            const ITensorInfo         *weights,
+                                            const ITensorInfo         *biases,
+                                            const ITensorInfo         *output,
+                                            const PadStrideInfo       &conv_info,
                                             const ActivationLayerInfo &act_info)
 {
     return opencl::ClIndirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
@@ -83,4 +97,4 @@
     pack.add_tensor(TensorType::ACL_DST, _impl->dst);
     _impl->op->run(pack);
 }
-}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
index 5feafe1..0e994e1 100644
--- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -27,50 +27,62 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/ICLKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLInstanceNormalizationLayer::CLInstanceNormalizationLayer(CLRuntimeContext *ctx) // NOLINT
-    : _inst_norm_kernel(),
-      _mean_var_kernel(),
-      _mean_var_tensor(),
-      _ctx(ctx)
+    : _inst_norm_kernel(), _mean_var_kernel(), _mean_var_tensor(), _ctx(ctx)
 {
 }
 CLInstanceNormalizationLayer::~CLInstanceNormalizationLayer()
 {
 }
 
-void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+void CLInstanceNormalizationLayer::configure(
+    ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision);
 }
 
-void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context,
+                                             ICLTensor              *input,
+                                             ICLTensor              *output,
+                                             float                   gamma,
+                                             float                   beta,
+                                             float                   epsilon,
+                                             bool                    use_mixed_precision)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon, use_mixed_precision);
     auto w = std::make_unique<CLComputeMeanVariance>();
     w->configure(compile_context, input, &_mean_var_tensor, use_mixed_precision);
     _mean_var_kernel = std::move(w);
     auto k           = std::make_unique<CLInstanceNormalizationLayerKernel>();
-    k->configure(compile_context, input, &_mean_var_tensor, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
+    k->configure(compile_context, input, &_mean_var_tensor, output,
+                 InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
     _inst_norm_kernel = std::move(k);
     _mean_var_tensor.allocator()->allocate();
 }
 
-Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input,
+                                              const ITensorInfo *output,
+                                              float              gamma,
+                                              float              beta,
+                                              float              epsilon,
+                                              bool               use_mixed_precision)
 {
-    return CLInstanceNormalizationLayerKernel::validate(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
+    return CLInstanceNormalizationLayerKernel::validate(
+        input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
 }
 
 void CLInstanceNormalizationLayer::run()
 {
-    ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, "The child class didn't set the CL kernel or function isn't configured");
+    ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel,
+                             "The child class didn't set the CL kernel or function isn't configured");
     schedule_kernel_on_ctx(_ctx, _mean_var_kernel.get());
     schedule_kernel_on_ctx(_ctx, _inst_norm_kernel.get());
 }
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 1278385..4fe1d9b 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -29,12 +29,12 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
 #include "src/core/CL/kernels/CLReductionOperationKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace
@@ -57,7 +57,8 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon);
 }
 
-void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayer::configure(
+    const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon);
 
@@ -86,7 +87,8 @@
     sum_sq.set_tensor_shape(shape);
 
     const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
 
     // Reduce shape on axis
     shape.set(actual_axis, 1);
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index ea08bec..3b50234 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -24,15 +24,15 @@
 #include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
 
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/gpu/cl/kernels/ClTransposeKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/kernels/ClTransposeKernel.h"
 
 namespace arm_compute
 {
@@ -40,54 +40,155 @@
 using namespace arm_compute::utils::info_helpers;
 
 CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
-      _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(),
-      _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(),
-      _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(),
-      _fully_connected_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(),
-      _concat_weights_input_gate(), _concat_weights_output(), _ones_fill(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(),
-      _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(),
-      _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(),
-      _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(),
-      _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(),
-      _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false),
-      _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false)
+    : _memory_group(std::move(memory_manager)),
+      _fully_connected_input_gate(),
+      _accum_input_gate1(),
+      _subtract_input_gate(),
+      _pixelwise_mul_input_gate(),
+      _activation_input_gate(),
+      _fully_connected_forget_gate(),
+      _accum_forget_gate1(),
+      _pixelwise_mul_forget_gate(),
+      _activation_forget_gate(),
+      _fully_connected_cell_state(),
+      _gemm_cell_state1(),
+      _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()),
+      _accum_cell_state1(),
+      _accum_cell_state2(),
+      _pixelwise_mul_cell_state1(),
+      _activation_cell_state(),
+      _cell_clip(),
+      _pixelwise_mul_cell_state2(),
+      _fully_connected_output(),
+      _pixelwise_mul_output_state1(),
+      _accum_output1(),
+      _activation_output(),
+      _activation_output_state(),
+      _pixelwise_mul_output_state2(),
+      _fully_connected_output_state(),
+      _projection_clip(),
+      _copy_cell_state(),
+      _copy_output(),
+      _concat_scratch_buffer(),
+      _concat_inputs_forget_gate(),
+      _concat_weights_forget_gate(),
+      _concat_weights_input_gate(),
+      _concat_weights_output(),
+      _ones_fill(),
+      _mean_std_norm_input_gate(),
+      _pixelwise_mul_input_gate_coeff(),
+      _accum_input_gate_bias(),
+      _mean_std_norm_forget_gate(),
+      _pixelwise_mul_forget_gate_coeff(),
+      _accum_forget_gate_bias(),
+      _mean_std_norm_cell_gate(),
+      _pixelwise_mul_cell_gate_coeff(),
+      _accum_cell_gate_bias(),
+      _mean_std_norm_output_gate(),
+      _pixelwise_mul_output_gate_coeff(),
+      _accum_output_gate_bias(),
+      _input_gate_out1(),
+      _input_gate_out2(),
+      _input_gate_out3(),
+      _input_gate_out4(),
+      _forget_gate_out1(),
+      _forget_gate_out2(),
+      _forget_gate_out3(),
+      _forget_gate_out4(),
+      _forget_gate_out5(),
+      _forget_gate_out6(),
+      _cell_state_out1(),
+      _cell_state_out2(),
+      _cell_state_out3(),
+      _cell_state_out4(),
+      _cell_state_out5(),
+      _output1(),
+      _output2(),
+      _output3(),
+      _output4(),
+      _cell_state_activation(),
+      _output_state1(),
+      _ones(),
+      _input_layer_norm_out1(),
+      _input_layer_norm_out2(),
+      _forget_layer_norm_out1(),
+      _forget_layer_norm_out2(),
+      _cell_layer_norm_out1(),
+      _cell_layer_norm_out2(),
+      _output_layer_norm_out1(),
+      _output_layer_norm_out2(),
+      _run_peephole_opt(false),
+      _run_cifg_opt(false),
+      _perform_cell_clipping(false),
+      _has_projection_weights(false),
+      _perform_projection_clipping(false),
+      _is_prepared(false),
+      _is_layer_norm_lstm(false)
 {
 }
 
 CLLSTMLayer::~CLLSTMLayer() = default;
 
-void CLLSTMLayer::configure(const ICLTensor *input,
-                            const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
-                            const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
-                            const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                            const ICLTensor *output_state_in, ICLTensor *cell_state_in,
-                            ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
-                            const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void CLLSTMLayer::configure(const ICLTensor             *input,
+                            const ICLTensor             *input_to_forget_weights,
+                            const ICLTensor             *input_to_cell_weights,
+                            const ICLTensor             *input_to_output_weights,
+                            const ICLTensor             *recurrent_to_forget_weights,
+                            const ICLTensor             *recurrent_to_cell_weights,
+                            const ICLTensor             *recurrent_to_output_weights,
+                            const ICLTensor             *forget_gate_bias,
+                            const ICLTensor             *cell_bias,
+                            const ICLTensor             *output_gate_bias,
+                            const ICLTensor             *output_state_in,
+                            ICLTensor                   *cell_state_in,
+                            ICLTensor                   *scratch_buffer,
+                            ICLTensor                   *output_state_out,
+                            ICLTensor                   *cell_state_out,
+                            ICLTensor                   *output,
+                            const LSTMParams<ICLTensor> &lstm_params,
+                            const ActivationLayerInfo   &activation_info,
+                            float                        cell_threshold,
+                            float                        projection_threshold)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
-              recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights,
+              input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+              recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in,
+              cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
               cell_threshold, projection_threshold);
 }
 
-void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
-                            const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
-                            const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
-                            const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                            const ICLTensor *output_state_in, ICLTensor *cell_state_in,
-                            ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
-                            const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void CLLSTMLayer::configure(const CLCompileContext      &compile_context,
+                            const ICLTensor             *input,
+                            const ICLTensor             *input_to_forget_weights,
+                            const ICLTensor             *input_to_cell_weights,
+                            const ICLTensor             *input_to_output_weights,
+                            const ICLTensor             *recurrent_to_forget_weights,
+                            const ICLTensor             *recurrent_to_cell_weights,
+                            const ICLTensor             *recurrent_to_output_weights,
+                            const ICLTensor             *forget_gate_bias,
+                            const ICLTensor             *cell_bias,
+                            const ICLTensor             *output_gate_bias,
+                            const ICLTensor             *output_state_in,
+                            ICLTensor                   *cell_state_in,
+                            ICLTensor                   *scratch_buffer,
+                            ICLTensor                   *output_state_out,
+                            ICLTensor                   *cell_state_out,
+                            ICLTensor                   *output,
+                            const LSTMParams<ICLTensor> &lstm_params,
+                            const ActivationLayerInfo   &activation_info,
+                            float                        cell_threshold,
+                            float                        projection_threshold)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input,
-                                 input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
                                  recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                 forget_gate_bias, cell_bias, output_gate_bias,
-                                 output_state_in, cell_state_in,
+                                 forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
                                  scratch_buffer, output_state_out, cell_state_out, output);
 
-    ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
-                           recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out,
-                           output, lstm_params, activation_info, cell_threshold, projection_threshold);
+    ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                           recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                           forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
+                           scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+                           cell_threshold, projection_threshold);
 
     _is_layer_norm_lstm = lstm_params.use_layer_norm();
 
@@ -96,13 +197,12 @@
     build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
 
     // Validate
-    ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(),
-                                                     input_to_cell_weights->info(), input_to_output_weights->info(),
-                                                     recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
-                                                     forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
-                                                     output_state_in->info(), cell_state_in->info(),
-                                                     scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
-                                                     lstm_params_info, activation_info, cell_threshold, projection_threshold));
+    ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(
+        input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+        recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+        forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(),
+        cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
+        lstm_params_info, activation_info, cell_threshold, projection_threshold));
 
     const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
     // Configure block that calculates the forget gate
@@ -126,26 +226,31 @@
 
     weights_vector.emplace_back(input_to_forget_weights);
     weights_vector.emplace_back(recurrent_to_forget_weights);
-    const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
+    const TensorShape weights_concat_shape =
+        arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
     _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
 
     _concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX);
 
     _memory_group.manage(&_forget_gate_out5);
-    _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
+    _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6,
+                                           (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
     _memory_group.manage(&_forget_gate_out1);
     _memory_group.manage(&_forget_gate_out3);
     _forget_gate_out6.allocator()->allocate();
 
     CLTensor *forget_gate_out = &_forget_gate_out5;
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
         _run_peephole_opt = true;
         _memory_group.manage(&_forget_gate_out4);
-        _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-        _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+        _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(),
+                                             &_forget_gate_out4, 1, ConvertPolicy::SATURATE,
+                                             RoundingPolicy::TO_NEAREST_EVEN);
+        _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3,
+                                      ConvertPolicy::SATURATE);
         _forget_gate_out4.allocator()->allocate();
         _forget_gate_out5.allocator()->allocate();
         forget_gate_out = &_forget_gate_out3;
@@ -154,22 +259,25 @@
     {
         _forget_gate_out3.allocator()->allocate();
     }
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _memory_group.manage(&_forget_layer_norm_out1);
         _memory_group.manage(&_forget_layer_norm_out2);
         _mean_std_norm_forget_gate.configure(compile_context, forget_gate_out);
-        _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE,
-                                                   RoundingPolicy::TO_NEAREST_EVEN);
+        _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out,
+                                                   lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1,
+                                                   ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
         // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
         forget_gate_out->allocator()->allocate();
-        _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias,
+                                          &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
         _forget_layer_norm_out1.allocator()->allocate();
         forget_gate_out = &_forget_layer_norm_out2;
     }
-    _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr,
+                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
 
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
@@ -178,12 +286,13 @@
     // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     CLTensor *input_gate_out = &_input_gate_out1;
-    if(lstm_params.has_cifg_opt())
+    if (lstm_params.has_cifg_opt())
     {
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _ones_fill.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type()));
-        _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1,
+                                       ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
         _run_cifg_opt = true;
     }
@@ -195,7 +304,8 @@
         std::vector<const ICLTensor *> lstm_weights;
         lstm_weights.emplace_back(lstm_params.input_to_input_weights());
         lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
-        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+        TensorShape lstm_weights_concat_shape =
+            arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
         _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
 
         _concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX);
@@ -203,15 +313,20 @@
         _memory_group.manage(&_input_gate_out1);
 
         _memory_group.manage(&_input_gate_out3);
-        _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
+        _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2,
+                                              (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(),
+                                              &_input_gate_out3);
         _input_gate_out2.allocator()->allocate();
 
         input_gate_out = &_input_gate_out3;
-        if(_run_peephole_opt)
+        if (_run_peephole_opt)
         {
             _memory_group.manage(&_input_gate_out4);
-            _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-            _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(),
+                                                &_input_gate_out4, 1, ConvertPolicy::SATURATE,
+                                                RoundingPolicy::TO_NEAREST_EVEN);
+            _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1,
+                                         ConvertPolicy::SATURATE);
             _input_gate_out3.allocator()->allocate();
             _input_gate_out4.allocator()->allocate();
             input_gate_out = &_input_gate_out1;
@@ -221,22 +336,25 @@
             _input_gate_out1.allocator()->allocate();
         }
 
-        if(_is_layer_norm_lstm)
+        if (_is_layer_norm_lstm)
         {
             _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
             _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
             _memory_group.manage(&_input_layer_norm_out1);
             _memory_group.manage(&_input_layer_norm_out2);
             _mean_std_norm_input_gate.configure(compile_context, input_gate_out);
-            _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE,
-                                                      RoundingPolicy::TO_NEAREST_EVEN);
+            _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out,
+                                                      lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1,
+                                                      1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
             // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
             input_gate_out->allocator()->allocate();
-            _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+            _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(),
+                                             &_input_layer_norm_out2, ConvertPolicy::SATURATE);
             _input_layer_norm_out1.allocator()->allocate();
             input_gate_out = &_input_layer_norm_out2;
         }
-        _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        _activation_input_gate.configure(compile_context, input_gate_out, nullptr,
+                                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     }
 
     // Configure block that calculates the cell state
@@ -249,44 +367,54 @@
     _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
     _memory_group.manage(&_cell_state_out1);
-    _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
+    _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights,
+                                          (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
     _memory_group.manage(&_cell_state_out2);
     _transpose_cell_state->configure(compile_context, recurrent_to_cell_weights->info(), _cell_state_out2.info());
     _recurrent_to_cell_weights = recurrent_to_cell_weights;
     _memory_group.manage(&_cell_state_out3);
-    _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+    _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f,
+                                0.f);
     _cell_state_out2.allocator()->allocate();
     _memory_group.manage(&_cell_state_out4);
-    _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+    _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4,
+                                 ConvertPolicy::SATURATE);
     CLTensor *cell_state_out_ptr = &_cell_state_out4;
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _memory_group.manage(&_cell_layer_norm_out1);
         _memory_group.manage(&_cell_layer_norm_out2);
         _mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr);
-        _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE,
-                                                 RoundingPolicy::TO_NEAREST_EVEN);
+        _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr,
+                                                 lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1,
+                                                 ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
         // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
         cell_state_out_ptr->allocator()->allocate();
-        _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2,
+                                        ConvertPolicy::SATURATE);
         _cell_layer_norm_out1.allocator()->allocate();
         cell_state_out_ptr = &_cell_layer_norm_out2;
     }
     _activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info);
     _memory_group.manage(&_cell_state_out5);
-    _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1,
+                                         ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     cell_state_out_ptr->allocator()->allocate();
-    _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-    _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+    _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1,
+                                         ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1,
+                                 ConvertPolicy::SATURATE);
     _cell_state_out3.allocator()->allocate();
     _cell_state_out5.allocator()->allocate();
     // Perform clipping
-    if(cell_threshold != 0.f)
+    if (cell_threshold != 0.f)
     {
         _perform_cell_clipping = true;
-        _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold));
+        _cell_clip.configure(compile_context, &_cell_state_out1, nullptr,
+                             ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                 cell_threshold, -cell_threshold));
     }
 
     // Configure block that calculates the output
@@ -298,7 +426,8 @@
     std::vector<const ICLTensor *> in_out_weights;
     in_out_weights.emplace_back(input_to_output_weights);
     in_out_weights.emplace_back(recurrent_to_output_weights);
-    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+    TensorShape in_out_weights_concat_shape =
+        arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
     _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
 
     _concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX);
@@ -306,18 +435,20 @@
     _memory_group.manage(&_output1);
     _memory_group.manage(&_output4);
 
-    _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
+    _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2,
+                                      (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
 
     _output2.allocator()->allocate();
     _forget_gate_out2.allocator()->allocate();
 
     CLTensor *output_gate_out = &_output4;
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
 
         _memory_group.manage(&_output3);
-        _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(),
+                                               &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
         _accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
         _output4.allocator()->allocate();
         output_gate_out = &_output1;
@@ -329,22 +460,25 @@
     {
         _output1.allocator()->allocate();
     }
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _memory_group.manage(&_output_layer_norm_out1);
         _memory_group.manage(&_output_layer_norm_out2);
         _mean_std_norm_output_gate.configure(compile_context, output_gate_out);
-        _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE,
-                                                   RoundingPolicy::TO_NEAREST_EVEN);
+        _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out,
+                                                   lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1,
+                                                   ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
         // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
         output_gate_out->allocator()->allocate();
-        _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias,
+                                          &_output_layer_norm_out2, ConvertPolicy::SATURATE);
         _output_layer_norm_out1.allocator()->allocate();
         output_gate_out = &_output_layer_norm_out2;
     }
-    _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _activation_output.configure(compile_context, output_gate_out, nullptr,
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
 
     // Configure block that calculates the output state
     /** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -361,19 +495,24 @@
 
     _memory_group.manage(&_cell_state_activation);
     _activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info);
-    _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out,
+                                           output_state_out_tmp, 1, ConvertPolicy::SATURATE,
+                                           RoundingPolicy::TO_NEAREST_EVEN);
     _cell_state_activation.allocator()->allocate();
 
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
         _has_projection_weights = true;
-        _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+        _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(),
+                                                lstm_params.projection_bias(), output_state_out);
         _output_state1.allocator()->allocate();
         // Perform clipping
-        if(projection_threshold != 0.f)
+        if (projection_threshold != 0.f)
         {
             _perform_projection_clipping = true;
-            _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+            _projection_clip.configure(compile_context, output_state_out, nullptr,
+                                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                           -projection_threshold, projection_threshold));
         }
     }
 
@@ -383,7 +522,7 @@
 
     // Vector for holding the tensors to store in scratch buffer
     std::vector<const ICLTensor *> scratch_inputs;
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
         scratch_inputs.emplace_back(input_gate_out);
     }
@@ -397,29 +536,38 @@
     output_gate_out->allocator()->allocate();
 }
 
-Status CLLSTMLayer::validate(const ITensorInfo *input,
-                             const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                             const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                             const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                             const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
-                             const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
-                             const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+Status CLLSTMLayer::validate(const ITensorInfo             *input,
+                             const ITensorInfo             *input_to_forget_weights,
+                             const ITensorInfo             *input_to_cell_weights,
+                             const ITensorInfo             *input_to_output_weights,
+                             const ITensorInfo             *recurrent_to_forget_weights,
+                             const ITensorInfo             *recurrent_to_cell_weights,
+                             const ITensorInfo             *recurrent_to_output_weights,
+                             const ITensorInfo             *forget_gate_bias,
+                             const ITensorInfo             *cell_bias,
+                             const ITensorInfo             *output_gate_bias,
+                             const ITensorInfo             *output_state_in,
+                             const ITensorInfo             *cell_state_in,
+                             const ITensorInfo             *scratch_buffer,
+                             const ITensorInfo             *output_state_out,
+                             const ITensorInfo             *cell_state_out,
+                             const ITensorInfo             *output,
+                             const LSTMParams<ITensorInfo> &lstm_params,
+                             const ActivationLayerInfo     &activation_info,
+                             float                          cell_threshold,
+                             float                          projection_threshold)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
-                                        input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                        forget_gate_bias, cell_bias, output_gate_bias,
-                                        output_state_in, cell_state_in,
-                                        scratch_buffer, output_state_out, cell_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+        input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+        recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+        output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
 
     // Check data types
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
-                                                       input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                                       forget_gate_bias, cell_bias, output_gate_bias,
-                                                       output_state_in, cell_state_in,
-                                                       scratch_buffer, output_state_out, cell_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+        input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+        recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+        output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
 
     // Check dimensions
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
@@ -438,16 +586,16 @@
     ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
-                                && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) &&
+                                cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
 
     const unsigned int num_batches = input->dimension(1);
     const unsigned int num_cells   = input_to_output_weights->dimension(1);
 
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         // If CIFG is used, input layer normalization weights tensor is omitted
-        if(lstm_params.has_cifg_opt())
+        if (lstm_params.has_cifg_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
         }
@@ -459,8 +607,12 @@
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
         }
 
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
+                                            lstm_params.cell_layer_norm_weights(),
+                                            lstm_params.output_layer_norm_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(),
+                                                           lstm_params.cell_layer_norm_weights(),
+                                                           lstm_params.output_layer_norm_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
@@ -470,7 +622,7 @@
     }
 
     // Check peephole optimization
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
@@ -488,36 +640,42 @@
     TensorInfo cell_state_tmp  = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
 
     // Validate forget gate
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+        input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
 
     std::vector<const ITensorInfo *> inputs_vector;
     inputs_vector.emplace_back(input);
     inputs_vector.emplace_back(output_state_in);
-    const TensorShape concat_shape       = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+    const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
     TensorInfo        forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
 
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+        &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate input gate
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
-                                            lstm_params.recurrent_to_input_weights(),
-                                            lstm_params.input_gate_bias());
+                                            lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
@@ -525,88 +683,121 @@
         std::vector<const ITensorInfo *> lstm_weights;
         lstm_weights.emplace_back(lstm_params.input_to_input_weights());
         lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
-        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
-        TensorInfo  lstm_gate_concat          = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+        TensorShape lstm_weights_concat_shape =
+            arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+        TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
         ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+            input, lstm_params.input_to_input_weights(),
+            (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
 
-        if(lstm_params.has_peephole_opt())
+        if (lstm_params.has_peephole_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
-            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1,
+                                                    ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
         }
 
-        if(lstm_params.use_layer_norm())
+        if (lstm_params.use_layer_norm())
         {
             ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1,
+                                                    ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(),
+                                                                       &input_gate, ConvertPolicy::SATURATE));
         }
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+            &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
 
     // Validate cell state
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
-    if(lstm_params.use_layer_norm())
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+        input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+    if (lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp,
+                                                1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
     }
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
-    if(cell_threshold != 0.f)
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+    if (cell_threshold != 0.f)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold,
-                                                                                                              -cell_threshold)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLActivationLayer::validate(&cell_state_tmp, nullptr,
+                                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                            cell_threshold, -cell_threshold)));
     }
 
     std::vector<const ITensorInfo *> in_out_weights;
     in_out_weights.emplace_back(input_to_output_weights);
     in_out_weights.emplace_back(recurrent_to_output_weights);
-    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
-    TensorInfo  in_out_gate_concat          = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+    TensorShape in_out_weights_concat_shape =
+        arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+    TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
     ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX));
     // Validate output gate tmp
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+        input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
 
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp,
+                                                1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp,
+                                                                   ConvertPolicy::SATURATE));
     }
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+            &output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+            RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp,
+                                                                   ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+        &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate output state
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-    if(lstm_params.has_projection())
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp,
+                                                                    1, ConvertPolicy::SATURATE,
+                                                                    RoundingPolicy::TO_NEAREST_EVEN));
+    if (lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
-        if(projection_threshold != 0.f)
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(),
+                                                                    lstm_params.projection_bias(), output_state_out));
+        if (projection_threshold != 0.f)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, output_state_out,
-                                                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+                output_state_out, output_state_out,
+                ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+                                    projection_threshold)));
         }
     }
 
@@ -616,7 +807,7 @@
 
     // Validate scratch concatenation
     std::vector<const ITensorInfo *> inputs_vector_info_raw;
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
         inputs_vector_info_raw.push_back(&input_gate);
     }
@@ -638,12 +829,12 @@
 
     _fully_connected_forget_gate.run();
 
-    if(_run_peephole_opt)
+    if (_run_peephole_opt)
     {
         _pixelwise_mul_forget_gate.run();
         _accum_forget_gate1.run();
     }
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _mean_std_norm_forget_gate.run();
         _pixelwise_mul_forget_gate_coeff.run();
@@ -651,7 +842,7 @@
     }
     _activation_forget_gate.run();
 
-    if(_run_cifg_opt)
+    if (_run_cifg_opt)
     {
         _ones_fill.run();
         _subtract_input_gate.run();
@@ -660,13 +851,13 @@
     {
         _fully_connected_input_gate.run();
 
-        if(_run_peephole_opt)
+        if (_run_peephole_opt)
         {
             _pixelwise_mul_input_gate.run();
             _accum_input_gate1.run();
         }
 
-        if(_is_layer_norm_lstm)
+        if (_is_layer_norm_lstm)
         {
             _mean_std_norm_input_gate.run();
             _pixelwise_mul_input_gate_coeff.run();
@@ -679,12 +870,10 @@
     ITensorPack pack;
     pack.add_tensor(TensorType::ACL_SRC, _recurrent_to_cell_weights);
     pack.add_tensor(TensorType::ACL_DST, &_cell_state_out2);
-    CLScheduler::get().enqueue_op(*_transpose_cell_state,
-                                  pack,
-                                  false);
+    CLScheduler::get().enqueue_op(*_transpose_cell_state, pack, false);
     _gemm_cell_state1.run();
     _accum_cell_state1.run();
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _mean_std_norm_cell_gate.run();
         _pixelwise_mul_cell_gate_coeff.run();
@@ -695,19 +884,19 @@
     _pixelwise_mul_cell_state2.run();
     _accum_cell_state2.run();
 
-    if(_perform_cell_clipping)
+    if (_perform_cell_clipping)
     {
         _cell_clip.run();
     }
 
     _fully_connected_output.run();
 
-    if(_run_peephole_opt)
+    if (_run_peephole_opt)
     {
         _pixelwise_mul_output_state1.run();
         _accum_output1.run();
     }
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _mean_std_norm_output_gate.run();
         _pixelwise_mul_output_gate_coeff.run();
@@ -718,10 +907,10 @@
     _activation_output_state.run();
     _pixelwise_mul_output_state2.run();
 
-    if(_has_projection_weights)
+    if (_has_projection_weights)
     {
         _fully_connected_output_state.run();
-        if(_perform_projection_clipping)
+        if (_perform_projection_clipping)
         {
             _projection_clip.run();
         }
@@ -735,10 +924,10 @@
 
 void CLLSTMLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         _concat_weights_forget_gate.run();
-        if(!_run_cifg_opt)
+        if (!_run_cifg_opt)
         {
             _concat_weights_input_gate.run();
         }
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index d14c610..ea64eda 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -25,12 +25,12 @@
 #include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
 
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
+#include "arm_compute/core/Validate.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <memory>
 
@@ -46,48 +46,129 @@
 } // namespace
 
 CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
-      _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(),
-      _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(),
-      _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr),
-      _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr),
-      _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(),
-      _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(),
-      _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false)
+    : _memory_group(std::move(memory_manager)),
+      _gemmlowp(),
+      _output_stage(),
+      _transpose_weights(),
+      _concat_input_weights(),
+      _concat_recurrent_weights(),
+      _concat_weights(),
+      _concat_inputs(),
+      _concat_bias(),
+      _sigmoid_forget_gate(),
+      _sigmoid_input_gate(),
+      _sigmoid_output_gate(),
+      _tanh_modulation_gate(),
+      _tanh_output_state(),
+      _add_cell_state_tmps(),
+      _add2(),
+      _mul_forget_gate_cell_state(),
+      _mul_input_gate_input_mod_gate(),
+      _mul_output_state_tmp_output_gate(),
+      _slice_input_tensor(),
+      _slice_forget_tensor(),
+      _slice_cell_tensor(),
+      _slice_output_tensor(),
+      _dequantize(),
+      _quantize(),
+      _input_to_input_weights(nullptr),
+      _input_to_forget_weights(nullptr),
+      _input_to_cell_weights(nullptr),
+      _input_to_output_weights(nullptr),
+      _recurrent_to_input_weights(nullptr),
+      _recurrent_to_forget_weights(nullptr),
+      _recurrent_to_cell_weights(nullptr),
+      _recurrent_to_output_weights(nullptr),
+      _input_gate_bias(nullptr),
+      _forget_gate_bias(nullptr),
+      _cell_bias(nullptr),
+      _output_gate_bias(nullptr),
+      _recurrent_weights(),
+      _input_weights(),
+      _weights(),
+      _input(),
+      _weights_transposed(),
+      _output_highp(),
+      _output_lowp(),
+      _bias(),
+      _forget_gate_input(),
+      _input_gate_input(),
+      _output_gate_input(),
+      _input_modulation_gate_input(),
+      _forget_gate_output(),
+      _input_gate_output(),
+      _output_gate_output(),
+      _input_modulation_gate_output(),
+      _cell_state_tmp1(),
+      _cell_state_tmp2(),
+      _output_state_tmp(),
+      _output_state_out_symm(),
+      _output_state_out_f32(),
+      _is_prepared(false)
 {
 }
 
 void CLLSTMLayerQuantized::configure(const ICLTensor *input,
-                                     const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
-                                     const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
-                                     const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                                     ICLTensor *cell_state_in, const ICLTensor *output_state_in,
-                                     ICLTensor *cell_state_out, ICLTensor *output_state_out)
+                                     const ICLTensor *input_to_input_weights,
+                                     const ICLTensor *input_to_forget_weights,
+                                     const ICLTensor *input_to_cell_weights,
+                                     const ICLTensor *input_to_output_weights,
+                                     const ICLTensor *recurrent_to_input_weights,
+                                     const ICLTensor *recurrent_to_forget_weights,
+                                     const ICLTensor *recurrent_to_cell_weights,
+                                     const ICLTensor *recurrent_to_output_weights,
+                                     const ICLTensor *input_gate_bias,
+                                     const ICLTensor *forget_gate_bias,
+                                     const ICLTensor *cell_bias,
+                                     const ICLTensor *output_gate_bias,
+                                     ICLTensor       *cell_state_in,
+                                     const ICLTensor *output_state_in,
+                                     ICLTensor       *cell_state_out,
+                                     ICLTensor       *output_state_out)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
-              recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
-              output_state_out);
+    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights,
+              input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+              recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias,
+              output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
 }
 
-void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input,
-                                     const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
-                                     const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
-                                     const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                                     ICLTensor *cell_state_in, const ICLTensor *output_state_in,
-                                     ICLTensor *cell_state_out, ICLTensor *output_state_out)
+void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context,
+                                     const ICLTensor        *input,
+                                     const ICLTensor        *input_to_input_weights,
+                                     const ICLTensor        *input_to_forget_weights,
+                                     const ICLTensor        *input_to_cell_weights,
+                                     const ICLTensor        *input_to_output_weights,
+                                     const ICLTensor        *recurrent_to_input_weights,
+                                     const ICLTensor        *recurrent_to_forget_weights,
+                                     const ICLTensor        *recurrent_to_cell_weights,
+                                     const ICLTensor        *recurrent_to_output_weights,
+                                     const ICLTensor        *input_gate_bias,
+                                     const ICLTensor        *forget_gate_bias,
+                                     const ICLTensor        *cell_bias,
+                                     const ICLTensor        *output_gate_bias,
+                                     ICLTensor              *cell_state_in,
+                                     const ICLTensor        *output_state_in,
+                                     ICLTensor              *cell_state_out,
+                                     ICLTensor              *output_state_out)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                 recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                 input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+                                 input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+                                 recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias,
+                                 forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+                                 cell_state_out, output_state_out);
 
-    ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
-                           recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+    ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+                           input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+                           recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias,
+                           cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
                            output_state_out);
 
-    ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
-                                                              input_to_output_weights->info(),
-                                                              recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
-                                                              input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(
+        input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+        input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(),
+        recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(),
+        forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+        output_state_in->info(), cell_state_out->info(), output_state_out->info()));
 
     const int input_size  = input->info()->dimension(0);
     const int batch_size  = input->info()->dimension(1);
@@ -95,8 +176,10 @@
 
     const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization
 
-    auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
-    auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
+    auto_init_if_empty(*cell_state_out->info(),
+                       TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
+    auto_init_if_empty(*output_state_out->info(),
+                       TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
 
     _input_to_input_weights      = input_to_input_weights;
     _input_to_forget_weights     = input_to_forget_weights;
@@ -124,17 +207,20 @@
     recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
     recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
 
-    _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _input_weights.allocator()->init(
+        TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
     _concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY);
 
-    _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _recurrent_weights.allocator()->init(
+        TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
     _concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY);
 
     std::vector<const ICLTensor *> weights_vector;
     weights_vector.emplace_back(&_recurrent_weights);
     weights_vector.emplace_back(&_input_weights);
 
-    _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _weights.allocator()->init(
+        TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
     _concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX);
     _transpose_weights.configure(compile_context, &_weights, &_weights_transposed);
 
@@ -144,7 +230,8 @@
     input_vector.emplace_back(output_state_in);
 
     _memory_group.manage(&_input);
-    _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
+    _input.allocator()->init(
+        TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
     _concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX);
 
     // Bias concatenation
@@ -159,7 +246,8 @@
 
     // Invert the offset for gemmlowp
     _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
-    _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+    _weights_transposed.info()->set_quantization_info(
+        QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
 
     // Run gemmlowp
     _memory_group.manage(&_output_highp);
@@ -169,7 +257,8 @@
 
     // Set the offset back
     _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
-    _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+    _weights_transposed.info()->set_quantization_info(
+        QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
 
     // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
     _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3));
@@ -191,85 +280,111 @@
     _bias.allocator()->allocate();
 
     // Get the gate tensors
-    if(batch_size > 1)
+    if (batch_size > 1)
     {
         _memory_group.manage(&_input_gate_input);
-        _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+        _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0, 0},
+                                      {output_size, batch_size});
         _memory_group.manage(&_forget_gate_input);
-        _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+        _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size, 0},
+                                       {2 * output_size, batch_size});
         _memory_group.manage(&_input_modulation_gate_input);
-        _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+        _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input,
+                                     {2 * output_size, 0}, {3 * output_size, batch_size});
         _memory_group.manage(&_output_gate_input);
-        _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+        _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size, 0},
+                                       {4 * output_size, batch_size});
         _output_lowp.allocator()->allocate();
     }
     else
     {
         _memory_group.manage(&_input_gate_input);
-        _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size });
+        _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0}, {output_size});
         _memory_group.manage(&_forget_gate_input);
-        _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
+        _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size},
+                                       {2 * output_size});
         _memory_group.manage(&_input_modulation_gate_input);
-        _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
+        _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, {2 * output_size},
+                                     {3 * output_size});
         _memory_group.manage(&_output_gate_input);
-        _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
+        _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size},
+                                       {4 * output_size});
         _output_lowp.allocator()->allocate();
     }
 
     // Forget gate
     _memory_group.manage(&_forget_gate_output);
-    _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _forget_gate_output.allocator()->init(
+        TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     _forget_gate_input.allocator()->allocate();
 
     // Input gate
     _memory_group.manage(&_input_gate_output);
-    _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _input_gate_output.allocator()->init(
+        TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output,
+                                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     _input_gate_input.allocator()->allocate();
 
     // Input modulation gate equation
     _memory_group.manage(&_input_modulation_gate_output);
-    _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+    _input_modulation_gate_output.allocator()->init(
+        TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
     _input_modulation_gate_input.allocator()->allocate();
 
     // Output gate
     _memory_group.manage(&_output_gate_output);
-    _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _output_gate_output.allocator()->init(
+        TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     _output_gate_input.allocator()->allocate();
 
     // Long term memory
     _memory_group.manage(&_cell_state_tmp1);
-    _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
-    _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _cell_state_tmp1.allocator()->init(
+        TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+    _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1,
+                                          ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _forget_gate_output.allocator()->allocate();
 
     _memory_group.manage(&_cell_state_tmp2);
-    _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
-    _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _cell_state_tmp2.allocator()->init(
+        TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+    _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output,
+                                             &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _input_modulation_gate_output.allocator()->allocate();
     _input_gate_output.allocator()->allocate();
 
-    _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE);
+    _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out,
+                                   ConvertPolicy::SATURATE);
     _cell_state_tmp1.allocator()->allocate();
     _cell_state_tmp2.allocator()->allocate();
 
     // Short term memory
     _memory_group.manage(&_output_state_tmp);
-    _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+    _output_state_tmp.allocator()->init(
+        TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp,
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
 
     _memory_group.manage(&_output_state_out_symm);
-    _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _output_state_out_symm.allocator()->init(
+        TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output,
+                                                &_output_state_out_symm, 1, ConvertPolicy::SATURATE,
+                                                RoundingPolicy::TO_ZERO);
     _output_gate_output.allocator()->allocate();
     _output_state_tmp.allocator()->allocate();
 
     // Requantize the output state from QSYMM16 to QASYMM8
     _memory_group.manage(&_output_state_out_f32);
-    _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
+    _output_state_out_f32.allocator()->init(
+        TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
     _dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32);
     _output_state_out_symm.allocator()->allocate();
 
@@ -278,15 +393,28 @@
 }
 
 Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
-                                      const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                                      const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                                      const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                                      const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
-                                      const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out)
+                                      const ITensorInfo *input_to_input_weights,
+                                      const ITensorInfo *input_to_forget_weights,
+                                      const ITensorInfo *input_to_cell_weights,
+                                      const ITensorInfo *input_to_output_weights,
+                                      const ITensorInfo *recurrent_to_input_weights,
+                                      const ITensorInfo *recurrent_to_forget_weights,
+                                      const ITensorInfo *recurrent_to_cell_weights,
+                                      const ITensorInfo *recurrent_to_output_weights,
+                                      const ITensorInfo *input_gate_bias,
+                                      const ITensorInfo *forget_gate_bias,
+                                      const ITensorInfo *cell_bias,
+                                      const ITensorInfo *output_gate_bias,
+                                      const ITensorInfo *cell_state_in,
+                                      const ITensorInfo *output_state_in,
+                                      const ITensorInfo *cell_state_out,
+                                      const ITensorInfo *output_state_out)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
-                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
-                                        output_state_in, cell_state_out, output_state_out);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+        input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+        recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+        input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+        output_state_out);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8);
 
     const int input_size  = input->dimension(0);
@@ -299,29 +427,51 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1);
     ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
 
-    TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8));
-    TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8));
-    TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
-    TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
-    TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4));
+    TensorInfo input_weights_info(input_to_input_weights->clone()
+                                      ->set_tensor_shape(TensorShape(input_size, output_size))
+                                      .set_data_type(DataType::QASYMM8));
+    TensorInfo recurrent_weights_info(input_to_input_weights->clone()
+                                          ->set_tensor_shape(TensorShape(output_size, output_size))
+                                          .set_data_type(DataType::QASYMM8));
+    TensorInfo bias_info(
+        input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
+    TensorInfo output_state_info(cell_state_in->clone()
+                                     ->set_tensor_shape(TensorShape(output_size, batch_size))
+                                     .set_data_type(DataType::QASYMM8)
+                                     .set_quantization_info(qasymm));
+    TensorInfo cell_state_info(cell_state_in->clone()
+                                   ->set_tensor_shape(TensorShape(output_size, batch_size))
+                                   .set_data_type(DataType::QSYMM16)
+                                   .set_quantization_info(qsymm_4));
 
     // Shape checks
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights,
+                                                   input_to_cell_weights, input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights,
+                                                   recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                                   recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+                                                   output_gate_bias);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in);
 
     // Data type checks
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights,
+                                                       input_to_forget_weights, input_to_cell_weights,
+                                                       input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights,
+                                                       recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                                       recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+                                                       output_gate_bias);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in);
 
     // Quantization checks
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights,
+                                                              input_to_cell_weights, input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights,
+                                                              recurrent_to_cell_weights, recurrent_to_output_weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in);
 
@@ -343,7 +493,8 @@
     recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
     recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
     const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
 
     // _concat_weights
     std::vector<const ITensorInfo *> weights_vector;
@@ -353,7 +504,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX));
     // _transpose_weights
     const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]);
-    TensorInfo        weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
+    TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed));
 
     // _concat_inputs
@@ -379,7 +530,8 @@
 
     // _gemmlowp
     const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
 
     // Set the offset back
     input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
@@ -390,7 +542,8 @@
     const float multiplier        = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
     int         output_multiplier = 0;
     int         output_shift      = 0;
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
 
     // _output_stage
     GEMMLowpOutputStageInfo info{};
@@ -405,68 +558,91 @@
     TensorInfo input_modulation_gate_input;
     TensorInfo output_gate_input;
 
-    if(batch_size > 1)
+    if (batch_size > 1)
     {
         // _slice_input_tensor
         input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size}));
         // _slice_forget_tensor
         forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size}));
         // _slice_cell_tensor
         input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0},
+                                                      {3 * output_size, batch_size}));
         // _slice_output_tensor
         output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size}));
     }
     else
     {
         // _slice_input_tensor
         input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, {0}, {output_size}));
         // _slice_forget_tensor
         forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size}));
         // _slice_cell_tensor
         input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size}));
         // _slice_output_tensor
         output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size}));
     }
 
     // _sigmoid_forget_gate
     const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&forget_gate_input, &forget_gate_output,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     // _sigmoid_input_gate
     const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+        &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     // _tanh_modulation_gate
-    const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+    const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16,
+                                                  qsymm_0);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
     // _sigmoid_output_gate
     const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&output_gate_input, &output_gate_output,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // _mul_forget_gate_cell_state
     const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
 
     // _mul_input_gate_input_mod_gate
     const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output,
+                                                                    &cell_state_tmp2, 1, ConvertPolicy::SATURATE,
+                                                                    RoundingPolicy::TO_ZERO));
 
     // _add_cell_state_tmps
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
 
     // _tanh_modulation_gate
     const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(cell_state_out, &output_state_tmp,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
 
     // _mul_output_state_tmp_output_gate
     const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output,
+                                                                    &output_state_out_symm, 1, ConvertPolicy::SATURATE,
+                                                                    RoundingPolicy::TO_ZERO));
 
     // _dequantize
     const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32);
@@ -475,14 +651,14 @@
     // _quantize
     ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out));
 
-    if(cell_state_out->total_size() != 0)
+    if (cell_state_out->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out);
     }
 
-    if(output_state_out->total_size() != 0)
+    if (output_state_out->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out);
@@ -541,7 +717,7 @@
 
 void CLLSTMLayerQuantized::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         _input_weights.allocator()->allocate();
         _concat_input_weights.run();
diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp
index 696191c..ea21c54 100644
--- a/src/runtime/CL/functions/CLLogicalAnd.cpp
+++ b/src/runtime/CL/functions/CLLogicalAnd.cpp
@@ -22,10 +22,11 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
 #include <utility>
 
@@ -33,7 +34,10 @@
 {
 namespace experimental
 {
-void CLLogicalAnd::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+void CLLogicalAnd::configure(const CLCompileContext &compile_context,
+                             ITensorInfo            *input1,
+                             ITensorInfo            *input2,
+                             ITensorInfo            *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
     auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>();
@@ -54,17 +58,16 @@
 
 struct CLLogicalAnd::Impl
 {
-    const ICLTensor                            *src0{ nullptr };
-    const ICLTensor                            *src1{ nullptr };
-    ICLTensor                                  *dst{ nullptr };
-    std::unique_ptr<experimental::CLLogicalAnd> op{ nullptr };
+    const ICLTensor                            *src0{nullptr};
+    const ICLTensor                            *src1{nullptr};
+    ICLTensor                                  *dst{nullptr};
+    std::unique_ptr<experimental::CLLogicalAnd> op{nullptr};
 };
 
-CLLogicalAnd::CLLogicalAnd()
-    : _impl(std::make_unique<Impl>())
+CLLogicalAnd::CLLogicalAnd() : _impl(std::make_unique<Impl>())
 {
 }
-CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default;
+CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&)            = default;
 CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default;
 CLLogicalAnd::~CLLogicalAnd()                          = default;
 
@@ -73,7 +76,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
 }
 
-void CLLogicalAnd::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLLogicalAnd::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input1,
+                             ICLTensor              *input2,
+                             ICLTensor              *output)
 {
     _impl->src0 = input1;
     _impl->src1 = input2;
diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp
index a0504d7..71f9cce 100644
--- a/src/runtime/CL/functions/CLLogicalNot.cpp
+++ b/src/runtime/CL/functions/CLLogicalNot.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClLogicalNot.h"
 
@@ -32,16 +33,15 @@
 {
 struct CLLogicalNot::Impl
 {
-    const ICLTensor                      *src{ nullptr };
-    ICLTensor                            *dst{ nullptr };
-    std::unique_ptr<opencl::ClLogicalNot> op{ nullptr };
+    const ICLTensor                      *src{nullptr};
+    ICLTensor                            *dst{nullptr};
+    std::unique_ptr<opencl::ClLogicalNot> op{nullptr};
 };
 
-CLLogicalNot::CLLogicalNot()
-    : _impl(std::make_unique<Impl>())
+CLLogicalNot::CLLogicalNot() : _impl(std::make_unique<Impl>())
 {
 }
-CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default;
+CLLogicalNot::CLLogicalNot(CLLogicalNot &&)            = default;
 CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default;
 CLLogicalNot::~CLLogicalNot()                          = default;
 
@@ -72,4 +72,4 @@
     _impl->op->run(pack);
 }
 
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp
index f9a606e..3db4fda 100644
--- a/src/runtime/CL/functions/CLLogicalOr.cpp
+++ b/src/runtime/CL/functions/CLLogicalOr.cpp
@@ -22,10 +22,11 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLLogicalOr.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
 #include <utility>
 
@@ -33,7 +34,10 @@
 {
 namespace experimental
 {
-void CLLogicalOr::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+void CLLogicalOr::configure(const CLCompileContext &compile_context,
+                            ITensorInfo            *input1,
+                            ITensorInfo            *input2,
+                            ITensorInfo            *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
     auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>();
@@ -54,17 +58,16 @@
 
 struct CLLogicalOr::Impl
 {
-    const ICLTensor                           *src0{ nullptr };
-    const ICLTensor                           *src1{ nullptr };
-    ICLTensor                                 *dst{ nullptr };
-    std::unique_ptr<experimental::CLLogicalOr> op{ nullptr };
+    const ICLTensor                           *src0{nullptr};
+    const ICLTensor                           *src1{nullptr};
+    ICLTensor                                 *dst{nullptr};
+    std::unique_ptr<experimental::CLLogicalOr> op{nullptr};
 };
 
-CLLogicalOr::CLLogicalOr()
-    : _impl(std::make_unique<Impl>())
+CLLogicalOr::CLLogicalOr() : _impl(std::make_unique<Impl>())
 {
 }
-CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default;
+CLLogicalOr::CLLogicalOr(CLLogicalOr &&)            = default;
 CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default;
 CLLogicalOr::~CLLogicalOr()                         = default;
 
@@ -73,7 +76,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
 }
 
-void CLLogicalOr::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLLogicalOr::configure(const CLCompileContext &compile_context,
+                            ICLTensor              *input1,
+                            ICLTensor              *input2,
+                            ICLTensor              *output)
 {
     _impl->src0 = input1;
     _impl->src1 = input2;
diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp
index bef422f..e8bdad7 100644
--- a/src/runtime/CL/functions/CLMatMul.cpp
+++ b/src/runtime/CL/functions/CLMatMul.cpp
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLMatMul.h"
+
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTypes.h"
+
 #include "src/gpu/cl/operators/ClMatMul.h"
 
 namespace arm_compute
@@ -32,23 +34,32 @@
 
 struct CLMatMul::Impl
 {
-    std::unique_ptr<OperatorType> op{ nullptr };
+    std::unique_ptr<OperatorType> op{nullptr};
     ITensorPack                   run_pack{};
 };
-CLMatMul::CLMatMul()
-    : _impl(std::make_unique<Impl>())
+CLMatMul::CLMatMul() : _impl(std::make_unique<Impl>())
 {
 }
 
 CLMatMul::~CLMatMul() = default;
 
-void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+void CLMatMul::configure(ICLTensor                 *lhs,
+                         ICLTensor                 *rhs,
+                         ICLTensor                 *output,
+                         const MatMulInfo          &matmul_info,
+                         const GpuMatMulSettings   &settings,
+                         const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(settings);
     configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info);
 }
 
-void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings,
+void CLMatMul::configure(const CLCompileContext    &compile_context,
+                         ICLTensor                 *lhs,
+                         ICLTensor                 *rhs,
+                         ICLTensor                 *output,
+                         const MatMulInfo          &matmul_info,
+                         const GpuMatMulSettings   &settings,
                          const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
@@ -56,10 +67,14 @@
 
     _impl->op = std::make_unique<OperatorType>();
     _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info);
-    _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } };
+    _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}};
 }
 
-Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
+Status CLMatMul::validate(const ITensorInfo         *lhs,
+                          const ITensorInfo         *rhs,
+                          const ITensorInfo         *output,
+                          const MatMulInfo          &matmul_info,
+                          const ActivationLayerInfo &act_info)
 {
     return OperatorType::validate(lhs, rhs, output, matmul_info, act_info);
 }
diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
index 2786d32..7494f37 100644
--- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
@@ -27,26 +27,32 @@
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
 
 namespace arm_compute
 {
 CLMaxUnpoolingLayer::CLMaxUnpoolingLayer()
-    : _fill(),
-      _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>())
+    : _fill(), _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>())
 {
 }
 
 CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default;
 
-void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayer::configure(ICLTensor              *input,
+                                    ICLTensor              *indices,
+                                    ICLTensor              *output,
+                                    const PoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info);
 }
 
-void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context,
+                                    ICLTensor              *input,
+                                    ICLTensor              *indices,
+                                    ICLTensor              *output,
+                                    const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info);
     const PixelValue zero_value(0.f);
@@ -55,7 +61,10 @@
     _unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info);
 }
 
-Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLMaxUnpoolingLayer::validate(const ITensorInfo      *input,
+                                     const ITensorInfo      *indices,
+                                     const ITensorInfo      *output,
+                                     const PoolingLayerInfo &pool_info)
 {
     return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info);
 }
diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
index a81cbca..5892c0e 100644
--- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
@@ -24,9 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
 
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
 
 namespace arm_compute
 {
@@ -35,7 +35,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
 }
 
-void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context,
+                                               ICLTensor              *input,
+                                               ICLTensor              *output,
+                                               float                   epsilon)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, epsilon);
     auto k = std::make_unique<CLMeanStdDevNormalizationKernel>();
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index c0cc518..f93f82f 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -30,10 +30,10 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
 
 namespace arm_compute
 {
@@ -50,7 +50,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
 }
 
-void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info)
+void CLNormalizationLayer::configure(const CLCompileContext       &compile_context,
+                                     ICLTensor                    *input,
+                                     ICLTensor                    *output,
+                                     const NormalizationLayerInfo &norm_info)
 {
     ARM_COMPUTE_ERROR_ON(input == nullptr);
     ARM_COMPUTE_LOG_PARAMS(input, output, norm_info);
@@ -58,21 +61,24 @@
     // Configure normalization kernel
     _norm_kernel->configure(compile_context, input, output, norm_info);
 
-    if(!_norm_kernel->border_size().empty())
+    if (!_norm_kernel->border_size().empty())
     {
         // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
-        _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+        _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT,
+                                   PixelValue());
     }
 }
 
-Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status CLNormalizationLayer::validate(const ITensorInfo            *input,
+                                      const ITensorInfo            *output,
+                                      const NormalizationLayerInfo &norm_info)
 {
     return CLNormalizationLayerKernel::validate(input, output, norm_info);
 }
 
 void CLNormalizationLayer::run()
 {
-    if(!_norm_kernel->border_size().empty())
+    if (!_norm_kernel->border_size().empty())
     {
         // Run border handler
         CLScheduler::get().enqueue(*_border_handler, false);
diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
index 63c9164..939c95b 100644
--- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
@@ -24,20 +24,26 @@
 
 #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"
 
-#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
 
 #include <utility>
 
 namespace arm_compute
 {
-void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input,
+                                          ICLTensor       *output,
+                                          const ICLTensor *mean,
+                                          const ICLTensor *std)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
 }
 
-void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          ICLTensor              *output,
+                                          const ICLTensor        *mean,
+                                          const ICLTensor        *std)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, mean, std);
     auto k = std::make_unique<CLNormalizePlanarYUVLayerKernel>();
@@ -45,8 +51,10 @@
     _kernel = std::move(k);
 }
 
-Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                           const ITensorInfo *mean, const ITensorInfo *std)
+Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input,
+                                           const ITensorInfo *output,
+                                           const ITensorInfo *mean,
+                                           const ITensorInfo *std)
 {
     return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std);
 }
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
index 186e7b4..ce6d285 100644
--- a/src/runtime/CL/functions/CLPReluLayer.cpp
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
+
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/gpu/cl/IClKernel.h"
 #include "src/gpu/cl/operators/ClPRelu.h"
 
@@ -33,17 +35,16 @@
 
 struct CLPReluLayer::Impl
 {
-    const ICLTensor              *src_0{ nullptr };
-    const ICLTensor              *src_1{ nullptr };
-    ICLTensor                    *dst{ nullptr };
-    std::unique_ptr<OperatorType> op{ nullptr };
+    const ICLTensor              *src_0{nullptr};
+    const ICLTensor              *src_1{nullptr};
+    ICLTensor                    *dst{nullptr};
+    std::unique_ptr<OperatorType> op{nullptr};
 };
 
-CLPReluLayer::CLPReluLayer()
-    : _impl(std::make_unique<Impl>())
+CLPReluLayer::CLPReluLayer() : _impl(std::make_unique<Impl>())
 {
 }
-CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default;
+CLPReluLayer::CLPReluLayer(CLPReluLayer &&)            = default;
 CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default;
 CLPReluLayer::~CLPReluLayer()                          = default;
 
@@ -52,13 +53,17 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output);
 }
 
-void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+void CLPReluLayer::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input,
+                             ICLTensor              *alpha,
+                             ICLTensor              *output)
 {
     _impl->src_0 = input;
     _impl->src_1 = alpha;
     _impl->dst   = output;
     _impl->op    = std::make_unique<OperatorType>();
-    _impl->op->configure(compile_context, input->info(), alpha->info(), (output == nullptr ? input->info() : output->info()));
+    _impl->op->configure(compile_context, input->info(), alpha->info(),
+                         (output == nullptr ? input->info() : output->info()));
 }
 
 Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 0ed8f03..e788ded 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -22,37 +22,38 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLPadLayer.h"
-#include "src/core/CL/kernels/CLPadLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLPadLayerKernel.h"
 
 namespace arm_compute
 {
-CLPadLayer::CLPadLayer()
-    : _pad_kernel(std::make_unique<CLPadLayerKernel>()),
-      _copy(),
-      _perform_pad(false)
+CLPadLayer::CLPadLayer() : _pad_kernel(std::make_unique<CLPadLayerKernel>()), _copy(), _perform_pad(false)
 {
 }
 
 CLPadLayer::~CLPadLayer() = default;
 
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayer::configure(
+    ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
 }
 
-void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayer::configure(const CLCompileContext &compile_context,
+                           ICLTensor              *input,
+                           ICLTensor              *output,
+                           const PaddingList      &padding,
+                           PixelValue              constant_value,
+                           PaddingMode             mode)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
     ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode);
 
-    _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
-    {
-        return info.first > 0 || info.second > 0;
-    });
+    _perform_pad =
+        std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
 
-    if(_perform_pad)
+    if (_perform_pad)
     {
         _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode);
     }
@@ -62,14 +63,16 @@
         _copy.configure(compile_context, input, output);
     }
 }
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status CLPadLayer::validate(const ITensorInfo *input,
+                            const ITensorInfo *output,
+                            const PaddingList &padding,
+                            PixelValue         constant_value,
+                            PaddingMode        mode)
 {
-    bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
-    {
-        return info.first > 0 || info.second > 0;
-    });
+    bool perform_pad =
+        std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
 
-    if(perform_pad)
+    if (perform_pad)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode));
     }
@@ -81,7 +84,7 @@
 }
 void CLPadLayer::run()
 {
-    if(_perform_pad)
+    if (_perform_pad)
     {
         CLScheduler::get().enqueue(*_pad_kernel);
     }
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index a56afff..7f97eed 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -27,22 +27,21 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClPermute.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClPermute.h"
 
 namespace arm_compute
 {
 struct CLPermute::Impl
 {
-    const ICLTensor                   *src{ nullptr };
-    ICLTensor                         *dst{ nullptr };
-    std::unique_ptr<opencl::ClPermute> op{ nullptr };
+    const ICLTensor                   *src{nullptr};
+    ICLTensor                         *dst{nullptr};
+    std::unique_ptr<opencl::ClPermute> op{nullptr};
 };
 
-CLPermute::CLPermute()
-    : _impl(std::make_unique<Impl>())
+CLPermute::CLPermute() : _impl(std::make_unique<Impl>())
 {
 }
 
@@ -53,7 +52,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, perm);
 }
 
-void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
+void CLPermute::configure(const CLCompileContext  &compile_context,
+                          const ICLTensor         *input,
+                          ICLTensor               *output,
+                          const PermutationVector &perm)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, output, perm);
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 9d91e58..6aa9d9c 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClMul.h"
 
@@ -34,38 +35,55 @@
 {
 struct CLPixelWiseMultiplication::Impl
 {
-    const ICLTensor               *src_0{ nullptr };
-    const ICLTensor               *src_1{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClMul> op{ nullptr };
+    const ICLTensor               *src_0{nullptr};
+    const ICLTensor               *src_1{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClMul> op{nullptr};
 };
 
-CLPixelWiseMultiplication::CLPixelWiseMultiplication()
-    : _impl(std::make_unique<Impl>())
+CLPixelWiseMultiplication::CLPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
 {
 }
-CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default;
+CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&)            = default;
 CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default;
 CLPixelWiseMultiplication::~CLPixelWiseMultiplication()                                       = default;
 
-void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
-                                          ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void CLPixelWiseMultiplication::configure(ICLTensor                 *input1,
+                                          ICLTensor                 *input2,
+                                          ICLTensor                 *output,
+                                          float                      scale,
+                                          ConvertPolicy              overflow_policy,
+                                          RoundingPolicy             rounding_policy,
+                                          const ActivationLayerInfo &act_info)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy,
+              rounding_policy, act_info);
 }
 
-void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
-                                          ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void CLPixelWiseMultiplication::configure(const CLCompileContext    &compile_context,
+                                          ICLTensor                 *input1,
+                                          ICLTensor                 *input2,
+                                          ICLTensor                 *output,
+                                          float                      scale,
+                                          ConvertPolicy              overflow_policy,
+                                          RoundingPolicy             rounding_policy,
+                                          const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
     _impl->dst   = output;
     _impl->op    = std::make_unique<opencl::ClMul>();
-    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy,
+                         rounding_policy, act_info);
 }
 
-Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
-                                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status CLPixelWiseMultiplication::validate(const ITensorInfo         *input1,
+                                           const ITensorInfo         *input2,
+                                           const ITensorInfo         *output,
+                                           float                      scale,
+                                           ConvertPolicy              overflow_policy,
+                                           RoundingPolicy             rounding_policy,
+                                           const ActivationLayerInfo &act_info)
 {
     return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
 }
@@ -82,26 +100,33 @@
 
 struct CLComplexPixelWiseMultiplication::Impl
 {
-    const ICLTensor                      *src_0{ nullptr };
-    const ICLTensor                      *src_1{ nullptr };
-    ICLTensor                            *dst{ nullptr };
-    std::unique_ptr<opencl::ClComplexMul> op{ nullptr };
+    const ICLTensor                      *src_0{nullptr};
+    const ICLTensor                      *src_1{nullptr};
+    ICLTensor                            *dst{nullptr};
+    std::unique_ptr<opencl::ClComplexMul> op{nullptr};
 };
 
-CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication()
-    : _impl(std::make_unique<Impl>())
+CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
 {
 }
 CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default;
-CLComplexPixelWiseMultiplication &CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default;
-CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication()                                              = default;
+CLComplexPixelWiseMultiplication &
+CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default;
+CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication()            = default;
 
-void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLComplexPixelWiseMultiplication::configure(ICLTensor                 *input1,
+                                                 ICLTensor                 *input2,
+                                                 ICLTensor                 *output,
+                                                 const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLComplexPixelWiseMultiplication::configure(const CLCompileContext    &compile_context,
+                                                 ICLTensor                 *input1,
+                                                 ICLTensor                 *input2,
+                                                 ICLTensor                 *output,
+                                                 const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
@@ -110,7 +135,10 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo         *input1,
+                                                  const ITensorInfo         *input2,
+                                                  const ITensorInfo         *output,
+                                                  const ActivationLayerInfo &act_info)
 {
     return opencl::ClComplexMul::validate(input1, input2, output, act_info);
 }
diff --git a/src/runtime/CL/functions/CLPooling3dLayer.cpp b/src/runtime/CL/functions/CLPooling3dLayer.cpp
index 11ae1d0..ce1092a 100644
--- a/src/runtime/CL/functions/CLPooling3dLayer.cpp
+++ b/src/runtime/CL/functions/CLPooling3dLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClPool3d.h"
 
@@ -32,14 +33,13 @@
 {
 struct CLPooling3dLayer::Impl
 {
-    const ICLTensor                  *src{ nullptr };
-    ICLTensor                        *dst{ nullptr };
-    ICLTensor                        *indices{ nullptr };
-    std::unique_ptr<opencl::ClPool3d> op{ nullptr };
+    const ICLTensor                  *src{nullptr};
+    ICLTensor                        *dst{nullptr};
+    ICLTensor                        *indices{nullptr};
+    std::unique_ptr<opencl::ClPool3d> op{nullptr};
 };
 
-CLPooling3dLayer::CLPooling3dLayer()
-    : _impl(std::make_unique<Impl>())
+CLPooling3dLayer::CLPooling3dLayer() : _impl(std::make_unique<Impl>())
 {
 }
 CLPooling3dLayer::~CLPooling3dLayer() = default;
@@ -49,7 +49,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info);
 }
 
-void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info)
+void CLPooling3dLayer::configure(const CLCompileContext   &compile_context,
+                                 const ICLTensor          *input,
+                                 ICLTensor                *output,
+                                 const Pooling3dLayerInfo &pool_info)
 {
     _impl->src = input;
     _impl->dst = output;
@@ -58,7 +61,8 @@
     _impl->op->configure(compile_context, input->info(), output->info(), pool_info);
 }
 
-Status CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
+Status
+CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
 {
     return opencl::ClPool3d::validate(input, output, pool_info);
 }
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 0ebce31..65e53b9 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClPool2d.h"
 
@@ -32,34 +33,44 @@
 {
 struct CLPoolingLayer::Impl
 {
-    const ICLTensor                  *src{ nullptr };
-    ICLTensor                        *dst{ nullptr };
-    ICLTensor                        *indices{ nullptr };
-    std::unique_ptr<opencl::ClPool2d> op{ nullptr };
+    const ICLTensor                  *src{nullptr};
+    ICLTensor                        *dst{nullptr};
+    ICLTensor                        *indices{nullptr};
+    std::unique_ptr<opencl::ClPool2d> op{nullptr};
 };
 
-CLPoolingLayer::CLPoolingLayer()
-    : _impl(std::make_unique<Impl>())
+CLPoolingLayer::CLPoolingLayer() : _impl(std::make_unique<Impl>())
 {
 }
 CLPoolingLayer::~CLPoolingLayer() = default;
 
-void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
+void CLPoolingLayer::configure(ICLTensor              *input,
+                               ICLTensor              *output,
+                               const PoolingLayerInfo &pool_info,
+                               ICLTensor              *indices)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices);
 }
 
-void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
+void CLPoolingLayer::configure(const CLCompileContext &compile_context,
+                               ICLTensor              *input,
+                               ICLTensor              *output,
+                               const PoolingLayerInfo &pool_info,
+                               ICLTensor              *indices)
 {
     _impl->src     = input;
     _impl->dst     = output;
     _impl->indices = indices;
 
     _impl->op = std::make_unique<opencl::ClPool2d>();
-    _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
+    _impl->op->configure(compile_context, input->info(), output->info(), pool_info,
+                         (indices) ? indices->info() : nullptr);
 }
 
-Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CLPoolingLayer::validate(const ITensorInfo      *input,
+                                const ITensorInfo      *output,
+                                const PoolingLayerInfo &pool_info,
+                                const ITensorInfo      *indices)
 {
     return opencl::ClPool2d::validate(input, output, pool_info, indices);
 }
diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
index 019f0a7..cfd0ec4 100644
--- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp
+++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
@@ -29,31 +29,40 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"
 
-#include "src/common/utils/Log.h"
-
 using namespace arm_compute;
 
-CLPriorBoxLayer::CLPriorBoxLayer()
-    : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr)
+CLPriorBoxLayer::CLPriorBoxLayer() : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr)
 {
 }
 
-void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+void CLPriorBoxLayer::configure(const ICLTensor         *input1,
+                                const ICLTensor         *input2,
+                                ICLTensor               *output,
+                                const PriorBoxLayerInfo &info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info);
 }
 
-void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+void CLPriorBoxLayer::configure(const CLCompileContext  &compile_context,
+                                const ICLTensor         *input1,
+                                const ICLTensor         *input2,
+                                ICLTensor               *output,
+                                const PriorBoxLayerInfo &info)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info);
-    _min           = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float));
-    _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float));
-    if(!info.max_sizes().empty())
+    _min           = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+                                info.min_sizes().size() * sizeof(float));
+    _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+                                info.aspect_ratios().size() * sizeof(float));
+    if (!info.max_sizes().empty())
     {
-        _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float));
+        _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+                          info.max_sizes().size() * sizeof(float));
     }
 
     auto k = std::make_unique<CLPriorBoxLayerKernel>();
@@ -61,7 +70,10 @@
     _kernel = std::move(k);
 }
 
-Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status CLPriorBoxLayer::validate(const ITensorInfo       *input1,
+                                 const ITensorInfo       *input2,
+                                 const ITensorInfo       *output,
+                                 const PriorBoxLayerInfo &info)
 {
     return CLPriorBoxLayerKernel::validate(input1, input2, output, info);
 }
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index 7fbb866..12f6f89 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -26,29 +26,36 @@
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/QuantizationInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 using namespace arm_compute::utils::info_helpers;
 using namespace arm_compute::opencl::kernels;
 namespace
 {
-Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias,
-                   float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info)
+Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info,
+                   const ITensorInfo       *mm_input,
+                   const ITensorInfo       *mm_weights,
+                   const ITensorInfo       *bias,
+                   float                    gemmlowp_scale,
+                   const TensorInfo        *mm_res_info,
+                   const TensorInfo        *outstage_tensor_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+        gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
     return Status{};
 }
 } // namespace
@@ -78,14 +85,12 @@
     _src->map(q, true);
     _dst->map(q, true);
 
-    Iterator input_iter{ _src, _window };
-    Iterator output_iter{ _dst, _window };
+    Iterator input_iter{_src, _window};
+    Iterator output_iter{_dst, _window};
 
-    execute_window_loop(_window, [&](const Coordinates &)
-    {
-        memcpy(output_iter.ptr(), input_iter.ptr(), _row_size);
-    },
-    input_iter, output_iter);
+    execute_window_loop(
+        _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter,
+        output_iter);
 
     _src->unmap(q);
     _dst->unmap(q);
@@ -104,7 +109,7 @@
       _layer_norms(),
       _copy_output()
 {
-    for(auto &norm : _layer_norms)
+    for (auto &norm : _layer_norms)
     {
         norm = std::make_unique<CLQLSTMLayerNormalizationKernel>();
     }
@@ -129,17 +134,22 @@
 {
     // Output quantization scale will be different, but ignored here
     // since it will be configured at configure() stage.
-    const TensorInfo out
-    {
-        in
-    };
+    const TensorInfo out{in};
     return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
 }
 
-void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
-                                const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias,
-                                CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale,
-                                const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)
+void CLQLSTMLayer::configure_mm(const CLCompileContext       &compile_context,
+                                CLGEMMLowpMatrixMultiplyCore &mm,
+                                CLGEMMLowpOutputStage        &outstage,
+                                GEMMLowpOutputStageInfo      &gemmlowp_info,
+                                const ICLTensor              *mm_input,
+                                const ICLTensor              *mm_weights,
+                                const ICLTensor              *bias,
+                                CLTensor                     *mm_res,
+                                CLTensor                     *outstage_res,
+                                float                         gemmlowp_scale,
+                                const TensorInfo             &mm_res_info,
+                                const TensorInfo             &outstage_tensor_info)
 {
     _memory_group.manage(mm_res);
     _memory_group.manage(outstage_res);
@@ -151,30 +161,51 @@
     mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res);
 
     // Configure output stage
-    quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+    quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                 &gemmlowp_info.gemmlowp_shift);
     outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info);
     mm_res->allocator()->allocate();
 }
 
-void CLQLSTMLayer::configure(const ICLTensor *input,
-                             const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
-                             const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
-                             const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                             ICLTensor *cell_state_in, ICLTensor *output_state_in,
-                             ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+void CLQLSTMLayer::configure(const ICLTensor             *input,
+                             const ICLTensor             *input_to_forget_weights,
+                             const ICLTensor             *input_to_cell_weights,
+                             const ICLTensor             *input_to_output_weights,
+                             const ICLTensor             *recurrent_to_forget_weights,
+                             const ICLTensor             *recurrent_to_cell_weights,
+                             const ICLTensor             *recurrent_to_output_weights,
+                             const ICLTensor             *forget_gate_bias,
+                             const ICLTensor             *cell_bias,
+                             const ICLTensor             *output_gate_bias,
+                             ICLTensor                   *cell_state_in,
+                             ICLTensor                   *output_state_in,
+                             ICLTensor                   *cell_state_out,
+                             ICLTensor                   *output_state_out,
+                             ICLTensor                   *output,
                              const LSTMParams<ICLTensor> &lstm_params)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-              recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
-              cell_state_in, output_state_in, cell_state_out, output_state_out, output, lstm_params);
+    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights,
+              input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+              recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
+              output_state_in, cell_state_out, output_state_out, output, lstm_params);
 }
 
-void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
-                             const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
-                             const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
-                             const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                             ICLTensor *cell_state_in, ICLTensor *output_state_in,
-                             ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+void CLQLSTMLayer::configure(const CLCompileContext      &compile_context,
+                             const ICLTensor             *input,
+                             const ICLTensor             *input_to_forget_weights,
+                             const ICLTensor             *input_to_cell_weights,
+                             const ICLTensor             *input_to_output_weights,
+                             const ICLTensor             *recurrent_to_forget_weights,
+                             const ICLTensor             *recurrent_to_cell_weights,
+                             const ICLTensor             *recurrent_to_output_weights,
+                             const ICLTensor             *forget_gate_bias,
+                             const ICLTensor             *cell_bias,
+                             const ICLTensor             *output_gate_bias,
+                             ICLTensor                   *cell_state_in,
+                             ICLTensor                   *output_state_in,
+                             ICLTensor                   *cell_state_out,
+                             ICLTensor                   *output_state_out,
+                             ICLTensor                   *output,
                              const LSTMParams<ICLTensor> &lstm_params)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
@@ -191,11 +222,11 @@
     build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
 
     // Validate
-    ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
-                                                      recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
-                                                      forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
-                                                      cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
-                                                      lstm_params_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(
+        input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+        recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+        forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+        output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), lstm_params_info));
 
     const int batch_size  = input->info()->dimension(1);
     const int num_units   = input_to_output_weights->info()->dimension(1);
@@ -216,7 +247,7 @@
 
     // Layer normalization
     _has_layer_norm = lstm_params.use_layer_norm();
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget);
         set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell);
@@ -238,53 +269,75 @@
 
     // Calculate quantized parameters for clipping.
     int16_t quantized_cell_clip = 0;
-    if(lstm_params.cell_clip() > 0.0f)
+    if (lstm_params.cell_clip() > 0.0f)
     {
         quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
     }
     _has_cell_clipping = quantized_cell_clip > 0;
 
     // Precompute effective bias for optimizing the matmul computations.
-    if(!_has_cifg)
+    if (!_has_cifg)
     {
         _input_to_input_weights     = lstm_params.input_to_input_weights();
         _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
 
-        _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-        _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
-                                                 -qoutput_state_in.offset, true));
+        _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(),
+                                             _input_to_input_eff_bias.info(),
+                                             GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+        _recurrent_to_input_reduction->configure(
+            compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(),
+            GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
     }
-    _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
-                                              -qoutput_state_in.offset, true));
-    _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
-                                            true));
-    _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
-                                              -qoutput_state_in.offset, true));
-    if(_has_projection)
+    _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(),
+                                          _input_to_forget_eff_bias.info(),
+                                          GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_forget_reduction->configure(
+        compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(),
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(),
+                                        GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_cell_reduction->configure(
+        compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(),
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(),
+                                          _input_to_output_eff_bias.info(),
+                                          GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_output_reduction->configure(
+        compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(),
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    if (_has_projection)
     {
-        _projection_reduction->configure(compile_context, _projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
-        if(_projection_bias != nullptr)
+        _projection_reduction->configure(
+            compile_context, _projection_weights->info(), _projection_eff_bias.info(),
+            GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+        if (_projection_bias != nullptr)
         {
-            _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
+            _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias,
+                                           &_projection_eff_bias, ConvertPolicy::SATURATE);
         }
     }
 
     // Pre-transpose weights to be used in GEMM.
-    _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed);
-    _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed);
-    _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed);
-    _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
-    _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
-    _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
-    if(!_has_cifg)
+    _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights,
+                                                 &_input_to_forget_weights_transposed);
+    _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights,
+                                               &_input_to_cell_weights_transposed);
+    _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights,
+                                                 &_input_to_output_weights_transposed);
+    _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights,
+                                                     &_recurrent_to_forget_weights_transposed);
+    _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights,
+                                                   &_recurrent_to_cell_weights_transposed);
+    _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights,
+                                                     &_recurrent_to_output_weights_transposed);
+    if (!_has_cifg)
     {
-        _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
-        _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
+        _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(),
+                                                    &_input_to_input_weights_transposed);
+        _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(),
+                                                        &_recurrent_to_input_weights_transposed);
     }
-    if(_has_projection)
+    if (_has_projection)
     {
         _transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed);
     }
@@ -297,42 +350,55 @@
 
     const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
     // Forget gate.
-    const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
-    const float      input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
-    configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
-                 input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,
-                 &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,
-                 mm_out_info, forget_gate_outstage_info);
+    const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+                                               QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+    const float      input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale *
+                                        qinput.scale / lstm_params.forget_intermediate_scale();
+    configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input,
+                 &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res,
+                 &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info);
 
-    const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+    const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
     configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
                  output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,
                  &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
                  mm_out_info, forget_gate_outstage_info);
 
-    _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+    _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res,
+                                                 &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
                                                  ConvertPolicy::SATURATE);
     _input_to_forget_outstage_res.allocator()->allocate();
 
-    if(_has_peephole)
+    if (_has_peephole)
     {
         _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
         _memory_group.manage(&_mul_cell_to_forget_res);
-        _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-        _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
+        _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(),
+                                                &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE,
+                                                RoundingPolicy::TO_ZERO);
+        _cell_to_forget_outstage_res.allocator()->init(
+            TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+                       QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
         _memory_group.manage(&_cell_to_forget_outstage_res);
-        const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
-        quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-        _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
+        const float cell_to_forget_scale =
+            std::pow(2, cell_shift) *
+            lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale /
+            lstm_params.forget_intermediate_scale();
+        quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                     &gemmlowp_info.gemmlowp_shift);
+        _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr,
+                                           &_cell_to_forget_outstage_res, gemmlowp_info);
         _mul_cell_to_forget_res.allocator()->allocate();
-        _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+        _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res,
+                                          &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
                                           ConvertPolicy::SATURATE);
         _cell_to_forget_outstage_res.allocator()->allocate();
     }
 
     CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res;
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res);
         _recurrent_to_forget_outstage_res.allocator()->allocate();
@@ -345,30 +411,33 @@
     const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _memory_group.manage(&_forget_gate);
     _forget_gate.allocator()->init(forget_gate_info);
-    _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     forget_activation_input->allocator()->allocate();
 
     // Modulation gate.
-    const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
-    const float      input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
-    configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
-                 input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
-                 &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
-                 mm_out_info, cell_outstage_info);
+    const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+                                        QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+    const float      input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale *
+                                      qinput.scale / lstm_params.cell_intermediate_scale();
+    configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input,
+                 &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res,
+                 &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info);
 
-    const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
-    configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
-                 output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
-                 &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
-                 mm_out_info, cell_outstage_info);
+    const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale *
+                                          qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+    configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in,
+                 &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res,
+                 &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info);
 
-    _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+    _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res,
+                                                     &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
                                                      ConvertPolicy::SATURATE);
     _input_to_cell_outstage_res.allocator()->allocate();
 
     CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res;
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res);
         _recurrent_to_cell_outstage_res.allocator()->allocate();
@@ -378,14 +447,15 @@
     const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _memory_group.manage(&_cell_gate);
     _cell_gate.allocator()->init(cell_gate_info);
-    _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+    _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate,
+                              ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
     cell_activation_input->allocator()->allocate();
 
     // Input gate.
     const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _input_gate.allocator()->init(input_gate_info);
     _memory_group.manage(&_input_gate);
-    if(_has_cifg)
+    if (_has_cifg)
     {
         _ones.allocator()->init(*_forget_gate.info());
         _input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
@@ -393,107 +463,142 @@
     }
     else
     {
-        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
-        const float      input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
-        configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
-                     input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
-                     &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
-                     mm_out_info, input_outstage_info);
+        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                             QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+        const float      input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale *
+                                           qinput.scale / lstm_params.input_intermediate_scale();
+        configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input,
+                     &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res,
+                     &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info);
 
-        const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
+        const float recurrent_to_input_scale =
+            _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale /
+            lstm_params.input_intermediate_scale();
         configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
                      output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
                      &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
                      mm_out_info, input_outstage_info);
-        _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res,
-                                                    ConvertPolicy::SATURATE);
+        _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res,
+                                                    &_recurrent_to_input_outstage_res,
+                                                    &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
         _input_to_input_outstage_res.allocator()->allocate();
 
-        if(_has_peephole)
+        if (_has_peephole)
         {
-            _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
+            _mul_cell_to_input_res.allocator()->init(
+                TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
             _memory_group.manage(&_mul_cell_to_input_res);
-            _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-            const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
-            quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-            _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
+            _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(),
+                                                   &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE,
+                                                   RoundingPolicy::TO_ZERO);
+            const float cell_to_input_scale =
+                std::pow(2, cell_shift) *
+                lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale /
+                lstm_params.input_intermediate_scale();
+            quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                         &gemmlowp_info.gemmlowp_shift);
+            _cell_to_input_outstage_res.allocator()->init(
+                TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+                           QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
             _memory_group.manage(&_cell_to_input_outstage_res);
-            _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
+            _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr,
+                                              &_cell_to_input_outstage_res, gemmlowp_info);
             _mul_cell_to_input_res.allocator()->allocate();
-            _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+            _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res,
+                                             &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
             _cell_to_input_outstage_res.allocator()->allocate();
         }
 
         CLTensor *input_activation_input = &_recurrent_to_input_outstage_res;
 
-        if(_has_layer_norm)
+        if (_has_layer_norm)
         {
             configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res);
             _recurrent_to_input_outstage_res.allocator()->allocate();
             input_activation_input = &get_layer_norm_output(LayerNormGate::Input);
         }
 
-        _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate,
+                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
         input_activation_input->allocator()->allocate();
     }
     // Cell.
     // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
-    _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f,
+                                         ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     const float      cell_gate_scale      = _cell_gate.info()->quantization_info().uniform().scale;
     const float      mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
-    const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
+    const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                         QuantizationInfo(mul_input_cell_scale, 0));
     _memory_group.manage(&_mul_input_cell_res);
     _mul_input_cell_res.allocator()->init(mul_input_cell_info);
-    _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f,
+                                        ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _cell_gate.allocator()->allocate();
-    _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
+    _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out,
+                               ConvertPolicy::SATURATE);
     _mul_input_cell_res.allocator()->allocate();
     _forget_gate.allocator()->allocate();
-    if(_has_cell_clipping)
+    if (_has_cell_clipping)
     {
-        _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
+        _cell_clip.configure(compile_context, cell_state_out, nullptr,
+                             ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                 -quantized_cell_clip, quantized_cell_clip));
     }
     // Output gate.
-    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
-    const float      input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
-    configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
-                 input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
-                 &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
-                 mm_out_info, output_outstage_info);
+    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+    const float      input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale *
+                                        qinput.scale / lstm_params.output_intermediate_scale();
+    configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input,
+                 &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res,
+                 &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info);
 
-    const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+    const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.output_intermediate_scale();
     configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
                  output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
                  &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
                  mm_out_info, output_outstage_info);
 
-    _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+    _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res,
+                                                 &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
                                                  ConvertPolicy::SATURATE);
     _input_to_output_outstage_res.allocator()->allocate();
 
-    if(_has_peephole)
+    if (_has_peephole)
     {
         // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
         // Here we are not using the output stage because all operations are done in float
         _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
         _memory_group.manage(&_mul_cell_to_output_res);
-        _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(),
+                                                &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE,
+                                                RoundingPolicy::TO_ZERO);
 
-        const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
-        quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-        _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
+        const float cell_to_output_scale =
+            std::pow(2, cell_shift) *
+            lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale /
+            lstm_params.output_intermediate_scale();
+        quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                     &gemmlowp_info.gemmlowp_shift);
+        _cell_to_output_outstage_res.allocator()->init(
+            TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+                       QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
         _memory_group.manage(&_cell_to_output_outstage_res);
-        _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
+        _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr,
+                                           &_cell_to_output_outstage_res, gemmlowp_info);
         _mul_cell_to_output_res.allocator()->allocate();
 
-        _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+        _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res,
+                                             &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
                                              ConvertPolicy::SATURATE);
         _cell_to_output_outstage_res.allocator()->allocate();
     }
 
     CLTensor *output_activation_input = &_recurrent_to_output_outstage_res;
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res);
         _recurrent_to_output_outstage_res.allocator()->allocate();
@@ -503,20 +608,24 @@
     const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _memory_group.manage(&_output_gate);
     _output_gate.allocator()->init(output_gate_info);
-    _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     output_activation_input->allocator()->allocate();
 
     // Hidden.
-    _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+    _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate,
+                           ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
     // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
     _memory_group.manage(&_hidden_mul_res);
     const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
     _hidden_mul_res.allocator()->init(hidden_mul_res);
-    _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f,
+                                    ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _output_gate.allocator()->allocate();
     _input_gate.allocator()->allocate();
     const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
-    quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
+    quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                 &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
     gemmlowp_info.gemmlowp_offset  = lstm_params.hidden_state_zero();
     gemmlowp_info.output_data_type = output_state_in->info()->data_type();
 
@@ -525,7 +634,7 @@
 
     _memory_group.manage(&_hidden_gate);
 
-    if(_projection_tensor_copy_required)
+    if (_projection_tensor_copy_required)
     {
         _hidden_gate.allocator()->init(*output_state_out->info());
         _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape());
@@ -536,27 +645,26 @@
     _hidden_mul_res.allocator()->allocate();
 
     // Projection.
-    if(_has_projection)
+    if (_has_projection)
     {
         const TensorInfo              projection_outstage_info(*output_state_out->info());
-        const UniformQuantizationInfo qprojection      = _projection_weights->info()->quantization_info().uniform();
-        const float                   projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
-        gemmlowp_info.gemmlowp_offset                  = qoutput_state_in.offset;
-        gemmlowp_info.gemmlowp_min_bound               = std::numeric_limits<int8_t>::lowest();
-        gemmlowp_info.gemmlowp_max_bound               = std::numeric_limits<int8_t>::max();
-        gemmlowp_info.output_data_type                 = DataType::QASYMM8_SIGNED;
+        const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
+        const float projection_scale  = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+        gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
+        gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+        gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+        gemmlowp_info.output_data_type   = DataType::QASYMM8_SIGNED;
 
-        TensorInfo projection_mm_out_info{ mm_out_info };
+        TensorInfo projection_mm_out_info{mm_out_info};
         projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
 
-        configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info,
-                     hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
-                     &_mm_projection_res, &_projection_outstage_res, projection_scale,
-                     projection_mm_out_info, projection_outstage_info);
+        configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result,
+                     &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res,
+                     &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info);
 
         ICLTensor *accumulate_destination = output_state_out;
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_gate.allocator()->allocate();
             _projection_accumulate_res.allocator()->init(*output_state_in->info());
@@ -565,31 +673,34 @@
             accumulate_destination = &_projection_accumulate_res;
         }
 
-        _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+        _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination,
+                                         accumulate_destination, ConvertPolicy::SATURATE);
         _projection_outstage_res.allocator()->allocate();
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
             _projection_accumulate_res.allocator()->allocate();
         }
 
-        int8_t quantized_projection_clip{ 0 };
-        if(lstm_params.projection_clip() > 0.0f)
+        int8_t quantized_projection_clip{0};
+        if (lstm_params.projection_clip() > 0.0f)
         {
-            quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
+            quantized_projection_clip =
+                utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
         }
 
-        if(quantized_projection_clip > 0)
+        if (quantized_projection_clip > 0)
         {
-            _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
-                                                                                                       quantized_projection_clip));
+            _projection_clip.configure(compile_context, output_state_out, nullptr,
+                                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                           -quantized_projection_clip, quantized_projection_clip));
             _has_projection_clipping = true;
         }
     }
     else
     {
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
             _hidden_gate.allocator()->allocate();
@@ -600,17 +711,27 @@
     _copy_output.configure(compile_context, output_state_out, output);
 }
 
-Status CLQLSTMLayer::validate(const ITensorInfo *input,
-                              const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                              const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                              const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                              const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
-                              const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+Status CLQLSTMLayer::validate(const ITensorInfo             *input,
+                              const ITensorInfo             *input_to_forget_weights,
+                              const ITensorInfo             *input_to_cell_weights,
+                              const ITensorInfo             *input_to_output_weights,
+                              const ITensorInfo             *recurrent_to_forget_weights,
+                              const ITensorInfo             *recurrent_to_cell_weights,
+                              const ITensorInfo             *recurrent_to_output_weights,
+                              const ITensorInfo             *forget_gate_bias,
+                              const ITensorInfo             *cell_bias,
+                              const ITensorInfo             *output_gate_bias,
+                              const ITensorInfo             *cell_state_in,
+                              const ITensorInfo             *output_state_in,
+                              const ITensorInfo             *cell_state_out,
+                              const ITensorInfo             *output_state_out,
+                              const ITensorInfo             *output,
                               const LSTMParams<ITensorInfo> &lstm_params)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
-                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
-                                        cell_state_out, output_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                        recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+                                        cell_state_in, output_state_in, cell_state_out, output_state_out, output);
 
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
@@ -622,13 +743,16 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights,
+                                                   input_to_cell_weights);
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights,
+                                                   recurrent_to_cell_weights);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights,
+                                                       input_to_output_weights, recurrent_to_forget_weights,
+                                                       recurrent_to_cell_weights, recurrent_to_output_weights);
 
     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
@@ -647,20 +771,25 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);
 
     // Check whether peephole weights are all there or none
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+                                                             DataType::QSYMM16);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+                                                           lstm_params.cell_to_output_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+                                                       lstm_params.cell_to_output_weights());
 
-        if(!lstm_params.has_cifg_opt())
+        if (!lstm_params.has_cifg_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+                                                               lstm_params.cell_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+                                                           lstm_params.cell_to_input_weights());
         }
     }
 
@@ -674,7 +803,7 @@
 
     // Calculate quantized parameters for clipping.
     int16_t quantized_cell_clip = 0;
-    if(lstm_params.cell_clip() > 0.0f)
+    if (lstm_params.cell_clip() > 0.0f)
     {
         quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
     }
@@ -682,33 +811,50 @@
     // Precompute effective bias for optimizing the matmul computations.
     const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
     const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
-                                                                               true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.input_to_input_weights(), &eff_bias_info,
+            GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.recurrent_to_input_weights(), &eff_bias_info,
+            GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    if(lstm_params.has_projection())
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_forget_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_cell_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_output_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    if (lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
-                                                                               lstm_params.hidden_state_zero(),
-                                                                               true)));
-        if(lstm_params.projection_bias() != nullptr)
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.projection_weights(), &projection_eff_bias_info,
+            GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)));
+        if (lstm_params.projection_bias() != nullptr)
         {
             ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
-            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
-                                                                       &projection_eff_bias_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
+                                               &projection_eff_bias_info, ConvertPolicy::SATURATE));
         }
     }
 
-    const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
-    const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
+    const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1,
+                                              input_to_forget_weights->data_type(),
+                                              input_to_forget_weights->quantization_info());
+    const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1,
+                                                  recurrent_to_forget_weights->data_type(),
+                                                  recurrent_to_forget_weights->quantization_info());
 
     // Validate weights transpose
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed));
@@ -717,15 +863,20 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
     }
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
-        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
-        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
+        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+                                                       lstm_params.projection_weights()->data_type(),
+                                                       lstm_params.projection_weights()->quantization_info());
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
     }
 
     GEMMLowpOutputStageInfo gemmlowp_info;
@@ -738,28 +889,42 @@
 
     // Forget gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
-    const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+    const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
     const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
-    const float      input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
+    const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale /
+                                        lstm_params.forget_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_forget_scale, &mm_out_info, &forget_outstage_info));
 
-    const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
+    const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_forget_scale, &mm_out_info,
+                                            &forget_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+                                                               &forget_outstage_info, ConvertPolicy::SATURATE));
 
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+                                                             DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        const float cell_to_forget_scale = std::pow(2, cell_shift) *
+                                           lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale /
+                                           lstm_params.forget_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+            cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+                                                                   &forget_outstage_info, ConvertPolicy::SATURATE));
     }
 
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
         const ITensorInfo *b_info = forget_gate_bias;
@@ -770,20 +935,29 @@
     const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
 
     const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Modulation gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
-    const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
-    const float      input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
+    const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                        QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+    const float      input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale /
+                                      lstm_params.cell_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_cell_scale, &mm_out_info, &cell_outstage_info));
 
-    const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
+    const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale *
+                                          qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_cell_scale, &mm_out_info,
+                                            &cell_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info,
+                                                               &cell_outstage_info, ConvertPolicy::SATURATE));
 
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
         const ITensorInfo *b_info = cell_bias;
@@ -791,85 +965,123 @@
     }
 
     const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
 
     // Input gate.
     const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    if(lstm_params.has_cifg_opt())
+    if (lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr,
+                                        "Input gate bias must not be present when CIFG is used");
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info,
+                                                                      &forget_gate_info, ConvertPolicy::SATURATE));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+                                            lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+            input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights,
+                                                       lstm_params.recurrent_to_input_weights());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
 
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
-        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
-        const float      input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
+        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                             QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+        const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale *
+                                           qinput.scale / lstm_params.input_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                                input_to_input_scale, &mm_out_info, &input_outstage_info));
 
-        const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
+        const float recurrent_to_input_scale =
+            lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale /
+            lstm_params.input_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                                &eff_bias_info, recurrent_to_input_scale, &mm_out_info,
+                                                &input_outstage_info));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+                                                                   &input_outstage_info, ConvertPolicy::SATURATE));
 
-        if(lstm_params.has_peephole_opt())
+        if (lstm_params.has_peephole_opt())
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                            RoundingPolicy::TO_ZERO));
-            const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
-            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info,
+                                                    1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+            const float cell_to_input_scale = std::pow(2, cell_shift) *
+                                              lstm_params.cell_to_input_weights()->quantization_info().uniform().scale /
+                                              lstm_params.input_intermediate_scale();
+            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+                cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+                                                                       &input_outstage_info, ConvertPolicy::SATURATE));
         }
 
-        if(has_layer_norm)
+        if (has_layer_norm)
         {
             const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
             const ITensorInfo *b_info = lstm_params.input_gate_bias();
             ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info));
         }
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+            &input_outstage_info, &input_gate_info,
+            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
     }
     // Cell.
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
-    if(quantized_cell_clip > 0)
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+    if (quantized_cell_clip > 0)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
-                                                                                                             quantized_cell_clip)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLActivationLayer::validate(cell_state_out, nullptr,
+                                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                            -quantized_cell_clip, quantized_cell_clip)));
     }
     // Output gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
-    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
-    const float      input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
+    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+    const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale /
+                                        lstm_params.output_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_output_scale, &mm_out_info, &output_outstage_info));
 
-    const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
+    const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_output_scale, &mm_out_info,
+                                            &output_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
-    if(lstm_params.has_peephole_opt())
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+                                                               &output_outstage_info, ConvertPolicy::SATURATE));
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1,
+                                                             DataType::QSYMM16);
         // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
         // Here we are not using the output stage because all operations are done in float
         // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
         // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+            cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+            RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+                                                                   &output_outstage_info, ConvertPolicy::SATURATE));
     }
 
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
         const ITensorInfo *b_info = output_gate_bias;
@@ -877,85 +1089,103 @@
     }
 
     const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&output_outstage_info, &output_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Hidden.
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(cell_state_out, &input_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
     const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
     const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
 
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
     const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                     &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
     gemmlowp_info.gemmlowp_offset  = lstm_params.hidden_state_zero();
     gemmlowp_info.output_data_type = hidden_out_info.data_type();
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
 
     const bool projection_tensor_copy_required = num_units != output_size;
 
     // Projection.
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights,
+                                                           lstm_params.projection_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
 
-        const UniformQuantizationInfo qprojection      = lstm_params.projection_weights()->quantization_info().uniform();
-        const float                   projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+        const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
+        const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+            projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
         gemmlowp_info.gemmlowp_offset    = qoutput_state_in.offset;
         gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
         gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
         gemmlowp_info.output_data_type   = DataType::QASYMM8_SIGNED;
 
         const TensorInfo projection_outstage_info(*output_state_out);
-        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
+        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+                                                       lstm_params.projection_weights()->data_type(),
+                                                       lstm_params.projection_weights()->quantization_info());
 
-        TensorInfo projection_mm_out_info{ mm_out_info };
+        TensorInfo projection_mm_out_info{mm_out_info};
         projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed,
+                                                &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
                                                 &projection_outstage_info));
 
-        if(projection_tensor_copy_required)
+        if (projection_tensor_copy_required)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
         }
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out,
+                                                                   ConvertPolicy::SATURATE));
 
-        if(projection_tensor_copy_required)
+        if (projection_tensor_copy_required)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
         }
 
-        int8_t quantized_projection_clip{ 0 };
-        if(lstm_params.projection_clip() > 0.0f)
+        int8_t quantized_projection_clip{0};
+        if (lstm_params.projection_clip() > 0.0f)
         {
             quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
         }
 
-        if(quantized_projection_clip > 0)
+        if (quantized_projection_clip > 0)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
-                                                                                                                   quantized_projection_clip)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+                output_state_out, nullptr,
+                ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                    -quantized_projection_clip, quantized_projection_clip)));
         }
     }
     else
     {
-        if(projection_tensor_copy_required)
+        if (projection_tensor_copy_required)
         {
             ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out));
         }
     }
 
-    if(cell_state_out->total_size() > 0)
+    if (cell_state_out->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
     }
 
-    if(output_state_out->total_size() > 0)
+    if (output_state_out->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
@@ -980,14 +1210,14 @@
     _recurrent_to_forget_outstage.run();
     _accumulate_input_recurrent_forget.run();
 
-    if(_has_peephole)
+    if (_has_peephole)
     {
         _pixelwise_mul_cell_to_forget.run();
         _cell_to_forget_outstage.run();
         _accumulate_cell_forget.run();
     }
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget));
     }
@@ -1002,7 +1232,7 @@
     _recurrent_to_cell_outstage.run();
     _accumulate_input_recurrent_modulation.run();
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell));
     }
@@ -1010,7 +1240,7 @@
     _cell_gate_tanh.run();
 
     // Input gate
-    if(_has_cifg)
+    if (_has_cifg)
     {
         _input_gate_sub.run();
     }
@@ -1022,14 +1252,14 @@
         _recurrent_to_input_outstage.run();
         _accumulate_input_recurrent_input.run();
 
-        if(_has_peephole)
+        if (_has_peephole)
         {
             _pixelwise_mul_cell_to_input.run();
             _cell_to_input_outstage.run();
             _accumulate_cell_input.run();
         }
 
-        if(_has_layer_norm)
+        if (_has_layer_norm)
         {
             CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input));
         }
@@ -1041,7 +1271,7 @@
     _pixelwise_mul_forget_cell.run();
     _pixelwise_mul_input_cell.run();
     _add_forget_cell.run();
-    if(_has_cell_clipping)
+    if (_has_cell_clipping)
     {
         _cell_clip.run();
     }
@@ -1052,14 +1282,14 @@
     _mm_recurrent_to_output.run();
     _recurrent_to_output_outstage.run();
     _accumulate_input_recurrent_output.run();
-    if(_has_peephole)
+    if (_has_peephole)
     {
         _pixelwise_mul_cell_to_output.run();
         _cell_to_output_outstage.run();
         _accumulate_cell_to_output.run();
     }
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output));
     }
@@ -1072,31 +1302,31 @@
     _hidden_outstage.run();
 
     // Projection.
-    if(_has_projection)
+    if (_has_projection)
     {
         _mm_projection.run();
         _projection_outstage.run();
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_output_to_accumulate_copy.run();
         }
 
         _accumulate_projection.run();
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_accumulate_to_output_copy.run();
         }
 
-        if(_has_projection_clipping)
+        if (_has_projection_clipping)
         {
             _projection_clip.run();
         }
     }
     else
     {
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_to_output_copy.run();
         }
@@ -1108,7 +1338,7 @@
 
 void CLQLSTMLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         // Pre-transpose weights to be used in GEMM.
         _input_to_forget_weights_transposed.allocator()->allocate();
@@ -1125,10 +1355,11 @@
         _transpose_recurrent_to_output_weights.run();
 
         // Precompute effective biases
-        if(_has_cifg)
+        if (_has_cifg)
         {
             _ones.map(true);
-            std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);
+            std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()),
+                        _ones.info()->total_size() / _ones.info()->element_size(), 32767);
             _ones.unmap();
         }
         else
@@ -1136,10 +1367,12 @@
             _input_to_input_eff_bias.allocator()->allocate();
             _recurrent_to_input_eff_bias.allocator()->allocate();
 
-            ITensorPack input_to_input_red_pack = { { ACL_SRC, _input_to_input_weights }, { ACL_DST, &_input_to_input_eff_bias } };
+            ITensorPack input_to_input_red_pack = {{ACL_SRC, _input_to_input_weights},
+                                                   {ACL_DST, &_input_to_input_eff_bias}};
             CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false);
 
-            ITensorPack rec_to_input_red_pack = { { ACL_SRC, _recurrent_to_input_weights }, { ACL_DST, &_recurrent_to_input_eff_bias } };
+            ITensorPack rec_to_input_red_pack = {{ACL_SRC, _recurrent_to_input_weights},
+                                                 {ACL_DST, &_recurrent_to_input_eff_bias}};
             CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false);
 
             _input_to_input_weights_transposed.allocator()->allocate();
@@ -1156,30 +1389,35 @@
         _input_to_output_eff_bias.allocator()->allocate();
         _recurrent_to_output_eff_bias.allocator()->allocate();
 
-        ITensorPack input_to_forget_red_pack = { { ACL_SRC, _input_to_forget_weights }, { ACL_DST, &_input_to_forget_eff_bias } };
+        ITensorPack input_to_forget_red_pack = {{ACL_SRC, _input_to_forget_weights},
+                                                {ACL_DST, &_input_to_forget_eff_bias}};
         CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false);
 
-        ITensorPack rec_to_forget_red_pack = { { ACL_SRC, _recurrent_to_forget_weights }, { ACL_DST, &_recurrent_to_forget_eff_bias } };
+        ITensorPack rec_to_forget_red_pack = {{ACL_SRC, _recurrent_to_forget_weights},
+                                              {ACL_DST, &_recurrent_to_forget_eff_bias}};
         CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false);
 
-        ITensorPack input_to_cell_red_pack = { { ACL_SRC, _input_to_cell_weights }, { ACL_DST, &_input_to_cell_eff_bias } };
+        ITensorPack input_to_cell_red_pack = {{ACL_SRC, _input_to_cell_weights}, {ACL_DST, &_input_to_cell_eff_bias}};
         CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false);
 
-        ITensorPack rec_to_cell_red_pack = { { ACL_SRC, _recurrent_to_cell_weights }, { ACL_DST, &_recurrent_to_cell_eff_bias } };
+        ITensorPack rec_to_cell_red_pack = {{ACL_SRC, _recurrent_to_cell_weights},
+                                            {ACL_DST, &_recurrent_to_cell_eff_bias}};
         CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false);
 
-        ITensorPack input_to_output_red_pack = { { ACL_SRC, _input_to_output_weights }, { ACL_DST, &_input_to_output_eff_bias } };
+        ITensorPack input_to_output_red_pack = {{ACL_SRC, _input_to_output_weights},
+                                                {ACL_DST, &_input_to_output_eff_bias}};
         CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false);
 
-        ITensorPack rec_to_output_red_pack = { { ACL_SRC, _recurrent_to_output_weights }, { ACL_DST, &_recurrent_to_output_eff_bias } };
+        ITensorPack rec_to_output_red_pack = {{ACL_SRC, _recurrent_to_output_weights},
+                                              {ACL_DST, &_recurrent_to_output_eff_bias}};
         CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false);
 
-        if(_has_projection)
+        if (_has_projection)
         {
             _projection_eff_bias.allocator()->allocate();
-            ITensorPack proj_red_pack{ { ACL_SRC, _projection_weights }, { ACL_DST, &_projection_eff_bias } };
+            ITensorPack proj_red_pack{{ACL_SRC, _projection_weights}, {ACL_DST, &_projection_eff_bias}};
             CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false);
-            if(_projection_bias != nullptr)
+            if (_projection_bias != nullptr)
             {
                 _projection_bias_add.run();
                 _projection_bias->mark_as_unused();
@@ -1189,7 +1427,7 @@
             _transpose_projection_weights.run();
             _projection_weights->mark_as_unused();
 
-            if(!_projection_tensor_copy_required)
+            if (!_projection_tensor_copy_required)
             {
                 _hidden_gate.mark_as_unused();
                 _projection_accumulate_res.mark_as_unused();
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index b249bdd..6edef29 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClQuantize.h"
 
@@ -32,13 +33,12 @@
 {
 struct CLQuantizationLayer::Impl
 {
-    const ICLTensor                    *src{ nullptr };
-    ICLTensor                          *dst{ nullptr };
-    std::unique_ptr<opencl::ClQuantize> op{ nullptr };
+    const ICLTensor                    *src{nullptr};
+    ICLTensor                          *dst{nullptr};
+    std::unique_ptr<opencl::ClQuantize> op{nullptr};
 };
 
-CLQuantizationLayer::CLQuantizationLayer()
-    : _impl(std::make_unique<Impl>())
+CLQuantizationLayer::CLQuantizationLayer() : _impl(std::make_unique<Impl>())
 {
 }
 CLQuantizationLayer::~CLQuantizationLayer() = default;
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 6f12286..34b78ee 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -28,24 +28,37 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 namespace arm_compute
 {
 using namespace arm_compute::misc::shape_calculator;
 
 CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy(), _fully_connected_out(), _gemm_output(), _add_output(),
+    : _memory_group(std::move(memory_manager)),
+      _gemm_state_f(),
+      _add_kernel(),
+      _activation(),
+      _fully_connected_kernel(),
+      _copy(),
+      _fully_connected_out(),
+      _gemm_output(),
+      _add_output(),
       _is_prepared(false)
 {
 }
 
 CLRNNLayer::~CLRNNLayer() = default;
 
-Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
-                            const ITensorInfo *output, const ActivationLayerInfo &info)
+Status CLRNNLayer::validate(const ITensorInfo         *input,
+                            const ITensorInfo         *weights,
+                            const ITensorInfo         *recurrent_weights,
+                            const ITensorInfo         *bias,
+                            const ITensorInfo         *hidden_state,
+                            const ITensorInfo         *output,
+                            const ActivationLayerInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
@@ -63,28 +76,42 @@
     ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
 
-    auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+    auto shape_info =
+        TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info));
 
     return Status{};
 }
 
-void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output,
+void CLRNNLayer::configure(const ICLTensor     *input,
+                           const ICLTensor     *weights,
+                           const ICLTensor     *recurrent_weights,
+                           const ICLTensor     *bias,
+                           ICLTensor           *hidden_state,
+                           ICLTensor           *output,
                            ActivationLayerInfo &info)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state,
+              output, info);
 }
 
-void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias,
-                           ICLTensor *hidden_state,
-                           ICLTensor *output, ActivationLayerInfo &info)
+void CLRNNLayer::configure(const CLCompileContext &compile_context,
+                           const ICLTensor        *input,
+                           const ICLTensor        *weights,
+                           const ICLTensor        *recurrent_weights,
+                           const ICLTensor        *bias,
+                           ICLTensor              *hidden_state,
+                           ICLTensor              *output,
+                           ActivationLayerInfo    &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(),
+                                                    bias->info(), hidden_state->info(), output->info(), info));
     ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info);
 
     const int   idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
@@ -133,7 +160,7 @@
 
 void CLRNNLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         _fully_connected_kernel.prepare();
         _gemm_state_f.prepare();
diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp
index 867ef7c..1939d1d 100644
--- a/src/runtime/CL/functions/CLROIAlignLayer.cpp
+++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp
@@ -24,26 +24,36 @@
 #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"
 
 #include "arm_compute/core/CL/ICLArray.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
 #include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
-Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayer::validate(const ITensorInfo         *input,
+                                 const ITensorInfo         *rois,
+                                 ITensorInfo               *output,
+                                 const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info));
 
     return Status{};
 }
 
-void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const ICLTensor           *input,
+                                const ICLTensor           *rois,
+                                ICLTensor                 *output,
+                                const ROIPoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
 }
 
-void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const CLCompileContext    &compile_context,
+                                const ICLTensor           *input,
+                                const ICLTensor           *rois,
+                                ICLTensor                 *output,
+                                const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
 
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index 239a1c6..0d2eab0 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -22,24 +22,35 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
+
 #include "arm_compute/core/CL/ICLArray.h"
-#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
 
 using namespace arm_compute;
 
-Status CLROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIPoolingLayer::validate(const ITensorInfo         *input,
+                                   const ITensorInfo         *rois,
+                                   ITensorInfo               *output,
+                                   const ROIPoolingLayerInfo &pool_info)
 {
     return CLROIPoolingLayerKernel::validate(input, rois, output, pool_info);
 }
 
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor           *input,
+                                  const ICLTensor           *rois,
+                                  ICLTensor                 *output,
+                                  const ROIPoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
 }
 
-void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const CLCompileContext    &compile_context,
+                                  const ICLTensor           *input,
+                                  const ICLTensor           *rois,
+                                  const ICLTensor           *output,
+                                  const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
 
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
index 3fbbd5f..5c3f7f9 100644
--- a/src/runtime/CL/functions/CLRange.cpp
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -27,9 +27,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLRangeKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLRangeKernel.h"
 
 using namespace arm_compute;
 
@@ -38,7 +38,8 @@
     configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
 }
 
-void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRange::configure(
+    const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
 {
     ARM_COMPUTE_LOG_PARAMS(output, start, end, step);
     auto k = std::make_unique<CLRangeKernel>();
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index cddbf77..6c6daff 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -27,23 +27,25 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLReductionOperationKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace
 {
-Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status
+validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
     ARM_COMPUTE_UNUSED(keep_dims);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
@@ -51,29 +53,29 @@
     const int          input_dims    = input->num_dimensions();
     Coordinates        axis_local    = reduction_axis;
 
-    for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
     {
         //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
         ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
         ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
     }
 
-    if(output->tensor_shape().total_size() != 0)
+    if (output->tensor_shape().total_size() != 0)
     {
         // Only validate if not using auto_init for the output tensor
         TensorShape out_shape = input->tensor_shape();
         // Validate output_shape only if not using auto_init
         convert_negative_axis(axis_local, input_dims);
         std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
-        for(unsigned int i = 0; i < reduction_ops; ++i)
+        for (unsigned int i = 0; i < reduction_ops; ++i)
         {
             ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
             ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
-            if(output->total_size() > 0 && keep_dims)
+            if (output->total_size() > 0 && keep_dims)
             {
                 ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
             }
-            if(keep_dims)
+            if (keep_dims)
             {
                 out_shape.set(axis_local[i], 1);
             }
@@ -87,8 +89,9 @@
         }
         const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-        const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
-        if(requant)
+        const bool requant =
+            is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
+        if (requant)
         {
             TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32));
             CLDequantizationLayer::validate(input, &input_no_quant);
@@ -98,10 +101,19 @@
     }
     return Status{};
 }
-}
+} // namespace
 
 CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(),
+    : _memory_group(std::move(memory_manager)),
+      _reduction_kernels(),
+      _reduced_outs(),
+      _reshape(),
+      _dequant(),
+      _requant(),
+      _reduction_ops(),
+      _keep_dims(),
+      _do_requant(),
+      _input_no_quant(),
       _output_no_quant()
 {
 }
@@ -111,17 +123,23 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output);
 }
 
-void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
+void CLReduceMean::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input,
+                             const Coordinates      &reduction_axis,
+                             bool                    keep_dims,
+                             ICLTensor              *output)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
     ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output);
 
     // Output auto inizialitation if not yet initialized
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
+    const TensorShape output_shape =
+        arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
-    _do_requant    = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info();
+    _do_requant = is_data_type_quantized(input->info()->data_type()) &&
+                  input->info()->quantization_info() != output->info()->quantization_info();
     _reduction_ops = reduction_axis.num_dimensions();
     _reduction_kernels.resize(_reduction_ops);
     _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
@@ -129,7 +147,7 @@
 
     ICLTensor *tmp_input  = input;
     ICLTensor *tmp_output = output;
-    if(_do_requant)
+    if (_do_requant)
     {
         _memory_group.manage(&_input_no_quant);
         _memory_group.manage(&_output_no_quant);
@@ -148,46 +166,51 @@
     convert_negative_axis(axis_local, input_dims);
 
     // Perform reduction for every axis
-    for(int i = 0; i < _reduction_ops; ++i)
+    for (int i = 0; i < _reduction_ops; ++i)
     {
-        TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+        TensorShape out_shape =
+            i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
         out_shape.set(axis_local[i], 1);
         auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]);
 
-        if(i == _reduction_ops - 1 && keep_dims)
+        if (i == _reduction_ops - 1 && keep_dims)
         {
-            _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i],
+                                            ReductionOperation::MEAN_SUM);
         }
         else
         {
-            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info()));
+            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(),
+                                                          tmp_input->info()->data_type(),
+                                                          tmp_input->info()->quantization_info()));
             _memory_group.manage(&_reduced_outs[i]);
-            _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i],
+                                            ReductionOperation::MEAN_SUM);
         }
     }
 
     // Allocate intermediate tensors
-    for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+    for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
     {
         _reduced_outs[i].allocator()->allocate();
     }
 
     // Configure reshape layer if we want to drop the dimensions
-    if(!_keep_dims)
+    if (!_keep_dims)
     {
         TensorShape out_shape = tmp_input->info()->tensor_shape();
 
         // We have to sort the reduction axis vectors in order for remove_dimension
         // to work properly
         std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
-        for(int i = 0; i < _reduction_ops; ++i)
+        for (int i = 0; i < _reduction_ops; ++i)
         {
             out_shape.remove_dimension(axis_local[i] - i, false);
         }
         auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape));
         _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output);
     }
-    if(_do_requant)
+    if (_do_requant)
     {
         _requant.configure(compile_context, &_output_no_quant, output);
         _input_no_quant.allocator()->allocate();
@@ -195,7 +218,10 @@
     }
 }
 
-Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status CLReduceMean::validate(const ITensorInfo *input,
+                              const Coordinates &reduction_axis,
+                              bool               keep_dims,
+                              const ITensorInfo *output)
 {
     return validate_config(input, reduction_axis, keep_dims, output);
 }
@@ -204,19 +230,19 @@
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(_do_requant)
+    if (_do_requant)
     {
         _dequant.run();
     }
-    for(auto &kernel : _reduction_kernels)
+    for (auto &kernel : _reduction_kernels)
     {
         kernel.run();
     }
-    if(!_keep_dims)
+    if (!_keep_dims)
     {
         _reshape.run();
     }
-    if(_do_requant)
+    if (_do_requant)
     {
         _requant.run();
     }
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index cdc7fec..ba54890 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -27,35 +27,43 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLReductionOperationKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/runtime/Utils.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _unreshaped_output(), _reduction_kernel(), _reshape(), _reduction_axis(), _is_reshape_required(false)
+    : _memory_group(std::move(memory_manager)),
+      _unreshaped_output(),
+      _reduction_kernel(),
+      _reshape(),
+      _reduction_axis(),
+      _is_reshape_required(false)
 {
 }
 
 CLReductionOperation::~CLReductionOperation() = default;
 
-Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+Status CLReductionOperation::validate(
+    const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+                                    "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
 
     const bool is_reshape_required = !keep_dims;
 
-    if(is_reshape_required && output->total_size() != 0)
+    if (is_reshape_required && output->total_size() != 0)
     {
-        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+            arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
     }
 
@@ -67,22 +75,23 @@
     const auto input_qinfo        = input->quantization_info();
     const auto output_data_type   = output->data_type();
 
-    auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
-    {
+    auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels,
+                                    QuantizationInfo qinfo) {
         ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
     };
 
-    if(is_reshape_required)
+    if (is_reshape_required)
     {
         auto shape_before_reshape = input_shape;
         shape_before_reshape.set(axis, 1);
-        initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo);
+        initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles,
+                              input_qinfo);
         output_internal = &output_before_reshape;
     }
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op));
 
-    if(is_reshape_required)
+    if (is_reshape_required)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output));
     }
@@ -92,7 +101,7 @@
 
 ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output)
 {
-    if(!_is_reshape_required)
+    if (!_is_reshape_required)
     {
         return output;
     }
@@ -103,12 +112,18 @@
     return &_unreshaped_output;
 }
 
-void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void CLReductionOperation::configure(
+    ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims);
 }
 
-void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void CLReductionOperation::configure(const CLCompileContext &compile_context,
+                                     ICLTensor              *input,
+                                     ICLTensor              *output,
+                                     unsigned int            axis,
+                                     ReductionOperation      op,
+                                     bool                    keep_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims);
@@ -117,11 +132,17 @@
 
     auto *output_internal = configure_intermediate_result_vector(input, output);
 
-    if(_is_reshape_required)
+    if (_is_reshape_required)
     {
-        const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
-        const auto        output_data_type = input->info()->data_type();
-        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+        const TensorShape output_shape =
+            arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+        const auto output_data_type = input->info()->data_type();
+        auto_init_if_empty(*output->info(), input->info()
+                                                ->clone()
+                                                ->set_tensor_shape(output_shape)
+                                                .set_data_type(output_data_type)
+                                                .reset_padding()
+                                                .set_is_resizable(true));
 
         _memory_group.manage(&_unreshaped_output);
     }
@@ -129,7 +150,7 @@
     _reduction_kernel = std::make_unique<CLReductionOperationKernel>();
     _reduction_kernel->configure(compile_context, input, output_internal, axis, op);
 
-    if(_is_reshape_required)
+    if (_is_reshape_required)
     {
         _reshape.configure(compile_context, &_unreshaped_output, output);
         _unreshaped_output.allocator()->allocate();
@@ -142,7 +163,7 @@
 
     CLScheduler::get().enqueue(*_reduction_kernel, false);
 
-    if(_is_reshape_required)
+    if (_is_reshape_required)
     {
         _reshape.run();
     }
diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp
index 15de959..156e9b9 100644
--- a/src/runtime/CL/functions/CLReorgLayer.cpp
+++ b/src/runtime/CL/functions/CLReorgLayer.cpp
@@ -27,9 +27,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/kernels/CLReorgLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLReorgLayerKernel.h"
 
 #include <utility>
 
@@ -40,7 +40,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
 }
 
-void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride)
+void CLReorgLayer::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input,
+                             ICLTensor              *output,
+                             int32_t                 stride)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, stride);
     auto k = std::make_unique<CLReorgLayerKernel>();
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
index c51a329..3d6349f 100644
--- a/src/runtime/CL/functions/CLReshapeLayer.cpp
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClReshape.h"
 
@@ -35,17 +36,16 @@
 {
 struct CLReshapeLayer::Impl
 {
-    const ICLTensor                   *src{ nullptr };
-    ICLTensor                         *dst{ nullptr };
-    std::unique_ptr<opencl::ClReshape> op{ nullptr };
+    const ICLTensor                   *src{nullptr};
+    ICLTensor                         *dst{nullptr};
+    std::unique_ptr<opencl::ClReshape> op{nullptr};
 };
 
-CLReshapeLayer::CLReshapeLayer()
-    : _impl(std::make_unique<Impl>())
+CLReshapeLayer::CLReshapeLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default;
+CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&)            = default;
 CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default;
 CLReshapeLayer::~CLReshapeLayer()                            = default;
 
@@ -78,4 +78,4 @@
     _impl->op->run(pack);
 }
 } // namespace arm_compute
-/** [CLReshapeLayer snippet] **/
\ No newline at end of file
+  /** [CLReshapeLayer snippet] **/
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
index 1fc9357..415de52 100644
--- a/src/runtime/CL/functions/CLReverse.cpp
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -24,9 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLReverse.h"
 
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLReverseKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLReverseKernel.h"
 
 namespace arm_compute
 {
@@ -35,7 +35,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
 }
 
-void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverse::configure(const CLCompileContext &compile_context,
+                          const ICLTensor        *input,
+                          ICLTensor              *output,
+                          const ICLTensor        *axis)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, axis);
     auto k = std::make_unique<CLReverseKernel>();
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 5b78989..abff072 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClScale.h"
 
@@ -33,13 +34,12 @@
 {
 struct CLScale::Impl
 {
-    const ICLTensor                 *src{ nullptr };
-    ICLTensor                       *dst{ nullptr };
-    std::unique_ptr<opencl::ClScale> op{ nullptr };
+    const ICLTensor                 *src{nullptr};
+    ICLTensor                       *dst{nullptr};
+    std::unique_ptr<opencl::ClScale> op{nullptr};
 };
 
-CLScale::CLScale()
-    : _impl(std::make_unique<Impl>())
+CLScale::CLScale() : _impl(std::make_unique<Impl>())
 {
 }
 CLScale::~CLScale() = default;
@@ -49,7 +49,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
 }
 
-void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
+void CLScale::configure(const CLCompileContext &compile_context,
+                        ICLTensor              *input,
+                        ICLTensor              *output,
+                        const ScaleKernelInfo  &info)
 {
     _impl->src = input;
     _impl->dst = output;
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
index c4ab3dc..b4897d9 100644
--- a/src/runtime/CL/functions/CLSelect.cpp
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -25,9 +25,9 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLSelectKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLSelectKernel.h"
 
 using namespace arm_compute;
 
@@ -38,7 +38,11 @@
     configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output);
 }
 
-void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+void CLSelect::configure(const CLCompileContext &compile_context,
+                         const ICLTensor        *c,
+                         const ICLTensor        *x,
+                         const ICLTensor        *y,
+                         ICLTensor              *output)
 {
     ARM_COMPUTE_LOG_PARAMS(c, x, y, output);
     auto k = std::make_unique<CLSelectKernel>();
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index 7e3ac7d..f79c6a1 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -26,15 +26,19 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
-#include "src/core/CL/kernels/CLStridedSliceKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLStridedSliceKernel.h"
 
 namespace arm_compute
 {
 namespace experimental
 {
-void CLSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+void CLSlice::configure(const CLCompileContext &compile_context,
+                        const ITensorInfo      *input,
+                        ITensorInfo            *output,
+                        const Coordinates      &starts,
+                        const Coordinates      &ends)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends);
@@ -47,15 +51,16 @@
     _kernel = std::move(k);
 }
 
-Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status CLSlice::validate(const ITensorInfo *input,
+                         const ITensorInfo *output,
+                         const Coordinates &starts,
+                         const Coordinates &ends)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
 
     // Check start dimensions for being non-negative
-    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
-    {
-        return i < 0;
-    }));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; }));
 
     // Get absolute end coordinates
     const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
@@ -66,20 +71,22 @@
 
 struct CLSlice::Impl
 {
-    const ICLTensor                       *src{ nullptr };
-    ICLTensor                             *dst{ nullptr };
-    std::unique_ptr<experimental::CLSlice> op{ nullptr };
+    const ICLTensor                       *src{nullptr};
+    ICLTensor                             *dst{nullptr};
+    std::unique_ptr<experimental::CLSlice> op{nullptr};
 };
 
-CLSlice::CLSlice()
-    : _impl(std::make_unique<Impl>())
+CLSlice::CLSlice() : _impl(std::make_unique<Impl>())
 {
 }
-CLSlice::CLSlice(CLSlice &&) = default;
+CLSlice::CLSlice(CLSlice &&)            = default;
 CLSlice &CLSlice::operator=(CLSlice &&) = default;
 CLSlice::~CLSlice()                     = default;
 
-Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status CLSlice::validate(const ITensorInfo *input,
+                         const ITensorInfo *output,
+                         const Coordinates &starts,
+                         const Coordinates &ends)
 {
     return experimental::CLSlice::validate(input, output, starts, ends);
 }
@@ -89,7 +96,11 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends);
 }
 
-void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+void CLSlice::configure(const CLCompileContext &compile_context,
+                        const ICLTensor        *input,
+                        ICLTensor              *output,
+                        const Coordinates      &starts,
+                        const Coordinates      &ends)
 {
     _impl->src = input;
     _impl->dst = output;
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index d52352f..2e70e2a 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -22,12 +22,14 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
 #include "src/gpu/cl/operators/ClPermute.h"
@@ -40,9 +42,9 @@
 template <bool IS_LOG>
 struct CLSoftmaxLayerGeneric<IS_LOG>::Impl
 {
-    const ICLTensor              *src{ nullptr };
-    ICLTensor                    *dst{ nullptr };
-    std::unique_ptr<OperatorType> op{ nullptr };
+    const ICLTensor              *src{nullptr};
+    ICLTensor                    *dst{nullptr};
+    std::unique_ptr<OperatorType> op{nullptr};
     MemoryGroup                   memory_group{};
     ITensorPack                   run_pack{};
     WorkspaceData<CLTensor>       workspace_tensors{};
@@ -65,28 +67,30 @@
 }
 
 template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis)
+void CLSoftmaxLayerGeneric<IS_LOG>::configure(
+    const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis)
 {
     _impl->src = input;
     _impl->dst = output;
     _impl->op  = std::make_unique<OperatorType>();
 
-    SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->info()->data_type(), axis };
+    SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->info()->data_type(), axis};
     _impl->op->configure(compile_context, *input->info(), *output->info(), softmax_info);
 
-    _impl->run_pack          = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } };
+    _impl->run_pack          = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}};
     _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
 }
 
 template <bool IS_LOG>
-Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
+Status
+CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
 {
-    SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->data_type(), axis };
+    SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->data_type(), axis};
     return OperatorType::validate(*input, *output, softmax_info);
 }
 
 template <bool IS_LOG>
-void           CLSoftmaxLayerGeneric<IS_LOG>::run()
+void CLSoftmaxLayerGeneric<IS_LOG>::run()
 {
     // Acquire all the temporaries
     MemoryGroupResourceScope scope_mg(_impl->memory_group);
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 3b70834..37f7288 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -29,71 +29,100 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
 
 namespace arm_compute
 {
 CLSpaceToBatchLayer::CLSpaceToBatchLayer()
-    : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()),
-      _fill(),
-      _has_padding(false)
+    : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()), _fill(), _has_padding(false)
 {
 }
 
 CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default;
 
-void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const ICLTensor *input,
+                                    const ICLTensor *block_shape,
+                                    const ICLTensor *paddings,
+                                    ICLTensor       *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
 }
 
-void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    const ICLTensor        *block_shape,
+                                    const ICLTensor        *paddings,
+                                    ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
     ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output);
 
-    if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+    if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
-        _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _fill.configure(compile_context, output,
+                        PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
     _space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output);
 }
 
-void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const ICLTensor *input,
+                                    const int        block_shape_x,
+                                    const int        block_shape_y,
+                                    const Size2D    &padding_left,
+                                    const Size2D    &padding_right,
+                                    ICLTensor       *output)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left,
+              padding_right, output);
 }
 
-void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
-                                    const Size2D &padding_right, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    const int               block_shape_x,
+                                    const int               block_shape_y,
+                                    const Size2D           &padding_left,
+                                    const Size2D           &padding_right,
+                                    ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
 
-    if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+    if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
-        _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _fill.configure(compile_context, output,
+                        PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
-    _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right,
+                                      output);
 }
 
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+                                     const ITensorInfo *block_shape,
+                                     const ITensorInfo *paddings,
+                                     const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
     ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
 
     return Status{};
 }
 
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+                                     const int          block_shape_x,
+                                     const int          block_shape_y,
+                                     const Size2D      &padding_left,
+                                     const Size2D      &padding_right,
                                      const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
 
     return Status{};
 }
@@ -101,7 +130,7 @@
 void CLSpaceToBatchLayer::run()
 {
     // Zero out output only if we have paddings
-    if(_has_padding)
+    if (_has_padding)
     {
         //CLScheduler::get().enqueue(*_fill, true);
         _fill.run();
diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
index 67dafff..22695c9 100644
--- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
@@ -29,14 +29,13 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
 
 namespace arm_compute
 {
-CLSpaceToDepthLayer::CLSpaceToDepthLayer()
-    : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>())
+CLSpaceToDepthLayer::CLSpaceToDepthLayer() : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>())
 {
 }
 
@@ -47,7 +46,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
 }
 
-void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    ICLTensor              *output,
+                                    int32_t                 block_shape)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
     _space_to_depth_kernel->configure(compile_context, input, output, block_shape);
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index 0b27371..6be43cc 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
@@ -38,7 +39,7 @@
 {
     cl::CommandQueue q = CLScheduler::get().queue();
 
-    for(unsigned i = 0; i < _num_outputs; ++i)
+    for (unsigned i = 0; i < _num_outputs; ++i)
     {
         _slice_functions[i].run();
     }
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
index 6a335da..c15496f 100644
--- a/src/runtime/CL/functions/CLStackLayer.cpp
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -21,8 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include <complex>
-
 #include "arm_compute/runtime/CL/functions/CLStackLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
@@ -32,16 +30,16 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLStackLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLStackLayerKernel.h"
+
+#include <complex>
 
 namespace arm_compute
 {
 CLStackLayer::CLStackLayer() // NOLINT
-    : _input(),
-      _stack_kernels(),
-      _num_inputs(0)
+    : _input(), _stack_kernels(), _num_inputs(0)
 {
 }
 
@@ -52,7 +50,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, axis, output);
 }
 
-void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
+void CLStackLayer::configure(const CLCompileContext         &compile_context,
+                             const std::vector<ICLTensor *> &input,
+                             int                             axis,
+                             ICLTensor                      *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input, axis, output);
     _num_inputs = input.size();
@@ -61,7 +62,7 @@
     // Wrap around negative values
     const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
 
-    for(unsigned int i = 0; i < _num_inputs; i++)
+    for (unsigned int i = 0; i < _num_inputs; i++)
     {
         _stack_kernels.emplace_back(std::make_unique<CLStackLayerKernel>());
         _stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output);
@@ -79,7 +80,7 @@
 
     const unsigned int num_inputs = input.size();
 
-    for(unsigned int i = 0; i < num_inputs; i++)
+    for (unsigned int i = 0; i < num_inputs; i++)
     {
         // All the tensors must have the same rank
         ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
@@ -92,7 +93,7 @@
 
 void CLStackLayer::run()
 {
-    for(unsigned i = 0; i < _num_inputs; i++)
+    for (unsigned i = 0; i < _num_inputs; i++)
     {
         CLScheduler::get().enqueue(*_stack_kernels[i], false);
     }
diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp
index 261bdc1..c1953cc 100644
--- a/src/runtime/CL/functions/CLStridedSlice.cpp
+++ b/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -25,17 +25,23 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLStridedSliceKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLStridedSliceKernel.h"
 
 namespace arm_compute
 {
 namespace experimental
 {
-void CLStridedSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
-                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const CLCompileContext &compile_context,
+                               const ITensorInfo      *input,
+                               ITensorInfo            *output,
+                               const Coordinates      &starts,
+                               const Coordinates      &ends,
+                               const BiStrides        &strides,
+                               int32_t                 begin_mask,
+                               int32_t                 end_mask,
+                               int32_t                 shrink_axis_mask)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
     auto k = std::make_unique<CLStridedSliceKernel>();
@@ -43,9 +49,14 @@
     _kernel = std::move(k);
 }
 
-Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSlice::validate(const ITensorInfo *input,
+                                const ITensorInfo *output,
+                                const Coordinates &starts,
+                                const Coordinates &ends,
+                                const BiStrides   &strides,
+                                int32_t            begin_mask,
+                                int32_t            end_mask,
+                                int32_t            shrink_axis_mask)
 {
     return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
 }
@@ -53,32 +64,43 @@
 
 struct CLStridedSlice::Impl
 {
-    const ICLTensor                              *src{ nullptr };
-    ICLTensor                                    *dst{ nullptr };
-    CLRuntimeContext                             *ctx{ nullptr };
-    std::unique_ptr<experimental::CLStridedSlice> op{ nullptr };
+    const ICLTensor                              *src{nullptr};
+    ICLTensor                                    *dst{nullptr};
+    CLRuntimeContext                             *ctx{nullptr};
+    std::unique_ptr<experimental::CLStridedSlice> op{nullptr};
 };
 
-CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx)
-    : _impl(std::make_unique<Impl>())
+CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
 {
     _impl->ctx = ctx;
 }
 
-CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default;
+CLStridedSlice::CLStridedSlice(CLStridedSlice &&)            = default;
 CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default;
 CLStridedSlice::~CLStridedSlice()                            = default;
 
-void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output,
-                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const ICLTensor   *input,
+                               ICLTensor         *output,
+                               const Coordinates &starts,
+                               const Coordinates &ends,
+                               const BiStrides   &strides,
+                               int32_t            begin_mask,
+                               int32_t            end_mask,
+                               int32_t            shrink_axis_mask)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask,
+              shrink_axis_mask);
 }
 
-void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
-                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const CLCompileContext &compile_context,
+                               const ICLTensor        *input,
+                               ICLTensor              *output,
+                               const Coordinates      &starts,
+                               const Coordinates      &ends,
+                               const BiStrides        &strides,
+                               int32_t                 begin_mask,
+                               int32_t                 end_mask,
+                               int32_t                 shrink_axis_mask)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
@@ -86,14 +108,21 @@
     _impl->dst = output;
 
     _impl->op = std::make_unique<experimental::CLStridedSlice>();
-    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask,
+                         end_mask, shrink_axis_mask);
 }
 
-Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSlice::validate(const ITensorInfo *input,
+                                const ITensorInfo *output,
+                                const Coordinates &starts,
+                                const Coordinates &ends,
+                                const BiStrides   &strides,
+                                int32_t            begin_mask,
+                                int32_t            end_mask,
+                                int32_t            shrink_axis_mask)
 {
-    return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask,
+                                                  shrink_axis_mask);
 }
 
 void CLStridedSlice::run()
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
index ef79099..4f86c4a 100644
--- a/src/runtime/CL/functions/CLTile.cpp
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLTile.h"
 
-#include "src/core/CL/kernels/CLTileKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLTileKernel.h"
 
 namespace arm_compute
 {
@@ -34,7 +33,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
 }
 
-void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTile::configure(const CLCompileContext &compile_context,
+                       const ICLTensor        *input,
+                       ICLTensor              *output,
+                       const Multiples        &multiples)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, multiples);
     auto k = std::make_unique<CLTileKernel>();
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index e63c92e..5a738f4 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClTranspose.h"
 
@@ -34,12 +35,11 @@
 {
 struct CLTranspose::Impl
 {
-    const ICLTensor                     *src{ nullptr };
-    ICLTensor                           *dst{ nullptr };
-    std::unique_ptr<opencl::ClTranspose> op{ nullptr };
+    const ICLTensor                     *src{nullptr};
+    ICLTensor                           *dst{nullptr};
+    std::unique_ptr<opencl::ClTranspose> op{nullptr};
 };
-CLTranspose::CLTranspose()
-    : _impl(std::make_unique<Impl>())
+CLTranspose::CLTranspose() : _impl(std::make_unique<Impl>())
 {
 }
 CLTranspose::~CLTranspose() = default;
@@ -70,4 +70,4 @@
     pack.add_tensor(TensorType::ACL_DST, _impl->dst);
     _impl->op->run(pack);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
index 98d4781..ddd83e7 100644
--- a/src/runtime/CL/functions/CLUnstack.cpp
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -40,13 +40,15 @@
     return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
 }
 
-inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+inline void setup_slice_coordinates_and_mask(Coordinates       &slice_start,
+                                             int32_t           &slice_end_mask,
+                                             const unsigned int input_num_dimensions)
 {
     // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time.
     Coordinates slice_end;
     slice_start.set_num_dimensions(input_num_dimensions);
     slice_end.set_num_dimensions(input_num_dimensions);
-    for(size_t k = 0; k < input_num_dimensions; ++k)
+    for (size_t k = 0; k < input_num_dimensions; ++k)
     {
         slice_start.set(k, 0);
         slice_end.set(k, -1);
@@ -56,8 +58,7 @@
 } // namespace
 
 CLUnstack::CLUnstack() // NOLINT
-    : _num_slices(0),
-      _strided_slice_vector()
+    : _num_slices(0), _strided_slice_vector()
 {
 }
 
@@ -66,15 +67,19 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis);
 }
 
-void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
+void CLUnstack::configure(const CLCompileContext         &compile_context,
+                          const ICLTensor                *input,
+                          const std::vector<ICLTensor *> &output_vector,
+                          int                             axis)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis);
     std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
-    std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
-        return t->info();
-    });
+    std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(),
+                   [](ICLTensor *t)
+                   {
+                       ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+                       return t->info();
+                   });
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis));
@@ -87,11 +92,12 @@
     Coordinates slice_start;
     int32_t     slice_end_mask;
     setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
-    for(unsigned int slice = 0; slice < _num_slices; ++slice)
+    for (unsigned int slice = 0; slice < _num_slices; ++slice)
     {
         // Adjusts start and end coordinates to take a 2D slice at a time
         slice_start.set(axis_u, slice);
-        _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+        _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(),
+                                               BiStrides(), 0, slice_end_mask, (1 << axis_u));
     }
 }
 
@@ -106,18 +112,20 @@
     ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
     Coordinates slice_start;
     int32_t     slice_end_mask;
-    for(size_t k = 0; k < num_slices; ++k)
+    for (size_t k = 0; k < num_slices; ++k)
     {
         slice_start.set(wrap_axis(axis, input), k);
         setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
-        ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(),
+                                                             BiStrides(), 0, slice_end_mask,
+                                                             (1 << wrap_axis(axis, input))));
     }
     return Status{};
 }
 
 void CLUnstack::run()
 {
-    for(unsigned i = 0; i < _num_slices; ++i)
+    for (unsigned i = 0; i < _num_slices; ++i)
     {
         _strided_slice_vector[i].run();
     }
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index b416d0f..645f817 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClWinogradConv2d.h"
@@ -35,15 +36,15 @@
 {
 struct CLWinogradConvolutionLayer::Impl
 {
-    const ICLTensor                          *src{ nullptr };
-    const ICLTensor                          *weights{ nullptr };
-    const ICLTensor                          *biases{ nullptr };
-    ICLTensor                                *dst{ nullptr };
-    std::unique_ptr<opencl::ClWinogradConv2d> op{ nullptr };
+    const ICLTensor                          *src{nullptr};
+    const ICLTensor                          *weights{nullptr};
+    const ICLTensor                          *biases{nullptr};
+    ICLTensor                                *dst{nullptr};
+    std::unique_ptr<opencl::ClWinogradConv2d> op{nullptr};
     ITensorPack                               run_pack{};
     MemoryGroup                               memory_group{};
     WorkspaceData<CLTensor>                   workspace_tensors{};
-    bool                                      is_prepared{ false };
+    bool                                      is_prepared{false};
 };
 
 CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -54,15 +55,26 @@
 
 CLWinogradConvolutionLayer::~CLWinogradConvolutionLayer() = default;
 
-void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
-                                           bool enable_fast_math)
+void CLWinogradConvolutionLayer::configure(ICLTensor                 *input,
+                                           const ICLTensor           *weights,
+                                           const ICLTensor           *biases,
+                                           ICLTensor                 *output,
+                                           const PadStrideInfo       &conv_info,
+                                           const ActivationLayerInfo &act_info,
+                                           bool                       enable_fast_math)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info,
+              enable_fast_math);
 }
 
-void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLWinogradConvolutionLayer::configure(const CLCompileContext    &compile_context,
+                                           ICLTensor                 *input,
+                                           const ICLTensor           *weights,
+                                           const ICLTensor           *biases,
+                                           ICLTensor                 *output,
                                            const PadStrideInfo       &conv_info,
-                                           const ActivationLayerInfo &act_info, bool enable_fast_math)
+                                           const ActivationLayerInfo &act_info,
+                                           bool                       enable_fast_math)
 {
     _impl->src     = input;
     _impl->weights = weights;
@@ -70,20 +82,25 @@
     _impl->dst     = output;
 
     _impl->op = std::make_unique<opencl::ClWinogradConv2d>();
-    _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, enable_fast_math);
+    _impl->op->configure(compile_context, input->info(), weights->info(),
+                         (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info,
+                         enable_fast_math);
 
-    _impl->run_pack =
-    {
-        { TensorType::ACL_SRC_0, _impl->src },
-        { TensorType::ACL_SRC_1, _impl->weights },
-        { TensorType::ACL_SRC_2, _impl->biases },
-        { TensorType::ACL_DST, _impl->dst }
-    };
-    _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
+    _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src},
+                       {TensorType::ACL_SRC_1, _impl->weights},
+                       {TensorType::ACL_SRC_2, _impl->biases},
+                       {TensorType::ACL_DST, _impl->dst}};
+    _impl->workspace_tensors =
+        manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
 }
 
-Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CLWinogradConvolutionLayer::validate(const ITensorInfo         *input,
+                                            const ITensorInfo         *weights,
+                                            const ITensorInfo         *biases,
+                                            const ITensorInfo         *output,
+                                            const PadStrideInfo       &conv_info,
+                                            const ActivationLayerInfo &act_info,
+                                            bool                       enable_fast_math)
 {
     return opencl::ClWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math);
 }
@@ -97,7 +114,7 @@
 
 void CLWinogradConvolutionLayer::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->run_pack);
 
@@ -107,4 +124,4 @@
         _impl->is_prepared = true;
     }
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute