Provide logging for the configure functions of all GPU operators

Partially Resolves: COMPMID-4718
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: I3d80e732fc957114ec84ef8350dbf12eeae23054
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6301
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Freddie Liardet <frederick.liardet@arm.com>
Reviewed-by: Jakub Jan Sujak <jakub.sujak@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
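
The pattern applied in every operator below: include the common logging header and call ARM_COMPUTE_LOG_PARAMS with the configure arguments before the kernel is set up. A minimal sketch of that pattern follows; ClExample and ClExampleKernel are placeholder names used only for illustration, and the macro is assumed to behave as provided by src/common/utils/Log.h in this change.

    #include "src/common/utils/Log.h"

    namespace arm_compute
    {
    namespace opencl
    {
    // Placeholder operator showing where the logging call is inserted.
    void ClExample::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
    {
        // Log the configure-time parameters through the common logging utility.
        ARM_COMPUTE_LOG_PARAMS(src, dst);
        auto k = std::make_unique<kernels::ClExampleKernel>();
        k->configure(compile_context, src, dst);
        _kernel = std::move(k);
    }
    } // namespace opencl
    } // namespace arm_compute
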
diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp
index 6b36cc3..74a818d 100644
--- a/src/gpu/cl/operators/ClActivation.cpp
+++ b/src/gpu/cl/operators/ClActivation.cpp
@@ -28,6 +28,7 @@
 
 #include "src/common/IOperator.h"
 #include "src/common/utils/LegacySupport.h"
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClContext.h"
 
 namespace arm_compute
@@ -36,6 +37,7 @@
 {
 void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst, act_info);
     auto k = std::make_unique<kernels::ClActivationKernel>();
     k->configure(compile_context, src, dst, act_info);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp
index e1a013a..b9bf505 100644
--- a/src/gpu/cl/operators/ClAdd.cpp
+++ b/src/gpu/cl/operators/ClAdd.cpp
@@ -26,6 +26,8 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
@@ -33,6 +35,7 @@
 void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
                       ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info);
     auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
     k->configure(compile_context, ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp
index 8911d20..05ea21b 100644
--- a/src/gpu/cl/operators/ClCast.cpp
+++ b/src/gpu/cl/operators/ClCast.cpp
@@ -26,12 +26,15 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClCastKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst, policy);
     auto k = std::make_unique<kernels::ClCastKernel>();
     k->configure(compile_context, src, dst, policy);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp
index 731d9b5..a27fc37 100644
--- a/src/gpu/cl/operators/ClConcatenate.cpp
+++ b/src/gpu/cl/operators/ClConcatenate.cpp
@@ -36,6 +36,8 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
@@ -45,6 +47,7 @@
 void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis)
 {
     ARM_COMPUTE_ERROR_ON(dst == nullptr);
+    ARM_COMPUTE_LOG_PARAMS(src_vector, dst, axis);
     _axis       = axis;
     _num_inputs = src_vector.size();
 
diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp
index c91a483..7fe0de7 100644
--- a/src/gpu/cl/operators/ClConv2d.cpp
+++ b/src/gpu/cl/operators/ClConv2d.cpp
@@ -34,6 +34,8 @@
 #include "src/gpu/cl/operators/ClGemmConv2d.h"
 #include "src/gpu/cl/operators/ClWinogradConv2d.h"
 
+#include "src/common/utils/Log.h"
+
 #include <memory>
 
 namespace
@@ -83,6 +85,7 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
     ARM_COMPUTE_ERROR_THROW_ON(ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info);
 
     switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target()))
     {
diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
index 61e33f2..08122b6 100644
--- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
+++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
@@ -26,12 +26,15 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout);
     auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>();
     k->configure(compile_context, src, dst, original_src_shape, data_layout);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp
index c1a9f26..d3b8304 100644
--- a/src/gpu/cl/operators/ClCopy.cpp
+++ b/src/gpu/cl/operators/ClCopy.cpp
@@ -26,12 +26,15 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClCopyKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClCopy::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst, dst_window);
     auto k = std::make_unique<kernels::ClCopyKernel>();
     k->configure(compile_context, src, dst, dst_window);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp
index a6a1c8b..cef9f14 100644
--- a/src/gpu/cl/operators/ClCrop.cpp
+++ b/src/gpu/cl/operators/ClCrop.cpp
@@ -26,6 +26,8 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClCropKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
@@ -33,6 +35,7 @@
 void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
                        Window *dst_window)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window);
     auto k = std::make_unique<kernels::ClCropKernel>();
     k->configure(compile_context, src, dst, start, end, batch_index, extrapolation_value, dst_window);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp
index dbaa5f6..0fccab6 100644
--- a/src/gpu/cl/operators/ClDequantize.cpp
+++ b/src/gpu/cl/operators/ClDequantize.cpp
@@ -28,12 +28,15 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClDequantizeKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClDequantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
     auto k = std::make_unique<kernels::ClDequantizeKernel>();
     k->configure(compile_context, src, dst);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp
index 50e63be..066959f 100644
--- a/src/gpu/cl/operators/ClDirectConv2d.cpp
+++ b/src/gpu/cl/operators/ClDirectConv2d.cpp
@@ -29,6 +29,8 @@
 #include "src/gpu/cl/kernels/ClActivationKernel.h"
 #include "src/gpu/cl/kernels/ClDirectConv2dKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
@@ -48,6 +50,7 @@
                                const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info);
 
     // Configure direct convolution kernel
     const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo();
diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp
index 4e4cd5a..2525041 100644
--- a/src/gpu/cl/operators/ClElementwiseOperations.cpp
+++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp
@@ -25,12 +25,15 @@
 
 #include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
     auto k = std::make_unique<kernels::ClArithmeticKernel>();
     k->configure(compile_context, ArithmeticOperation::DIV, src1, src2, dst, act_info);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp
index 24a603e..270769b 100644
--- a/src/gpu/cl/operators/ClElementwiseUnary.cpp
+++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp
@@ -25,12 +25,15 @@
 
 #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClRsqrt::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
     auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
     k->configure(compile_context, src, dst, ElementWiseUnary::RSQRT);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp
index 9e006c1..ad22b15 100644
--- a/src/gpu/cl/operators/ClFill.cpp
+++ b/src/gpu/cl/operators/ClFill.cpp
@@ -26,12 +26,15 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClFillKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
 {
+    ARM_COMPUTE_LOG_PARAMS(tensor, constant_value, dst_window);
     auto k = std::make_unique<kernels::ClFillKernel>();
     k->configure(compile_context, tensor, constant_value, dst_window);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp
index 3283454..e277c0d 100644
--- a/src/gpu/cl/operators/ClFlatten.cpp
+++ b/src/gpu/cl/operators/ClFlatten.cpp
@@ -26,12 +26,15 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClReshapeKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClFlatten::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
     auto k = std::make_unique<kernels::ClReshapeKernel>();
     k->configure(compile_context, src, dst);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp
index 866bff2..84f685e 100644
--- a/src/gpu/cl/operators/ClFloor.cpp
+++ b/src/gpu/cl/operators/ClFloor.cpp
@@ -26,12 +26,15 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClFloorKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClFloor::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
     auto k = std::make_unique<kernels::ClFloorKernel>();
     k->configure(compile_context, src, dst);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp
index 8b7e336..165ffe9 100644
--- a/src/gpu/cl/operators/ClFullyConnected.cpp
+++ b/src/gpu/cl/operators/ClFullyConnected.cpp
@@ -38,6 +38,7 @@
 #include "src/gpu/cl/operators/ClTranspose.h"
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
 
+#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 
 #include <algorithm>
@@ -231,6 +232,7 @@
 
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(ClFullyConnected::validate(src, weights, biases, dst, fc_info));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info);
 
     _are_weights_converted = true;
     _are_weights_reshaped  = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp
index 625c057..e955ae3 100644
--- a/src/gpu/cl/operators/ClGemm.cpp
+++ b/src/gpu/cl/operators/ClGemm.cpp
@@ -38,7 +38,6 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 
-#include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/core/utils/helpers/float_ops.h"
@@ -47,6 +46,7 @@
 #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
 #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
 
+#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 #include "utils/TypePrinter.h"
 
@@ -561,6 +561,7 @@
 
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, alpha, beta, gemm_info));
+    ARM_COMPUTE_LOG_PARAMS(a, b, c, output, alpha, beta, gemm_info);
 
     // Check if we need to reshape the matrix B only on the first run
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp
index 0f625bc..785f1f1 100644
--- a/src/gpu/cl/operators/ClGemmConv2d.cpp
+++ b/src/gpu/cl/operators/ClGemmConv2d.cpp
@@ -41,6 +41,8 @@
 #include "src/gpu/cl/operators/ClGemm.h"
 #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+
+#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 
 namespace arm_compute
@@ -159,6 +161,7 @@
     ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst,
                                                       conv2d_info,
                                                       weights_info));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info);
 
     const DataType   data_type   = src->data_type();
     const DataLayout data_layout = src->data_layout();
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
index f3c0ee1..6fd7e52 100644
--- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
@@ -47,6 +47,7 @@
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
 #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
 
+#include "src/common/utils/Log.h"
 #include "utils/TypePrinter.h"
 
 namespace arm_compute
@@ -218,6 +219,7 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
     ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c != nullptr ? c : nullptr, output, gemm_info));
+    ARM_COMPUTE_LOG_PARAMS(a, b, c, output, gemm_info);
 
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
     _a_offset                    = a->quantization_info().uniform().offset;
diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
index 27fb892..a61b11a 100644
--- a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
+++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
@@ -31,6 +31,8 @@
 #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
@@ -38,6 +40,7 @@
 void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info);
 
     switch(info.type)
     {
diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp
index b909066..b2eb89b 100644
--- a/src/gpu/cl/operators/ClLogicalNot.cpp
+++ b/src/gpu/cl/operators/ClLogicalNot.cpp
@@ -26,12 +26,15 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClLogicalNot::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
     auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
     k->configure(compile_context, src, dst, ElementWiseUnary::LOGICAL_NOT);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp
index 59d2b96..2066f0c 100644
--- a/src/gpu/cl/operators/ClMul.cpp
+++ b/src/gpu/cl/operators/ClMul.cpp
@@ -27,6 +27,8 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClMulKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
@@ -34,6 +36,7 @@
 void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
                       ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
     auto k = std::make_unique<kernels::ClMulKernel>();
     k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp
index 05717d5..cf4ebe6 100644
--- a/src/gpu/cl/operators/ClPRelu.cpp
+++ b/src/gpu/cl/operators/ClPRelu.cpp
@@ -22,8 +22,11 @@
  * SOFTWARE.
  */
 #include "src/gpu/cl/operators/ClPRelu.h"
+
 #include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
@@ -31,6 +34,7 @@
 using KernelType = kernels::ClArithmeticKernel;
 void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output)
 {
+    ARM_COMPUTE_LOG_PARAMS(input, alpha, output);
     auto k = std::make_unique<KernelType>();
     k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output));
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp
index ed74e22..ed56f97 100644
--- a/src/gpu/cl/operators/ClPermute.cpp
+++ b/src/gpu/cl/operators/ClPermute.cpp
@@ -26,12 +26,15 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClPermuteKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst, perm);
     auto k = std::make_unique<kernels::ClPermuteKernel>();
     k->configure(compile_context, src, dst, perm);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp
index a5b18a2..3da90b8 100644
--- a/src/gpu/cl/operators/ClPool2d.cpp
+++ b/src/gpu/cl/operators/ClPool2d.cpp
@@ -28,6 +28,8 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClPool2dKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
@@ -35,6 +37,8 @@
 void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+    ARM_COMPUTE_LOG_PARAMS(src, dst, info, indices);
+
     // Configure pooling kernel
     auto k = std::make_unique<kernels::ClPool2dKernel>();
     k->set_target(CLScheduler::get().target());
diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp
index 915e0fd..47ae5ce 100644
--- a/src/gpu/cl/operators/ClQuantize.cpp
+++ b/src/gpu/cl/operators/ClQuantize.cpp
@@ -28,12 +28,15 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClQuantizeKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClQuantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
     auto k = std::make_unique<kernels::ClQuantizeKernel>();
     k->configure(compile_context, src, dst);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp
index 2c1d181..560966f 100644
--- a/src/gpu/cl/operators/ClReshape.cpp
+++ b/src/gpu/cl/operators/ClReshape.cpp
@@ -26,12 +26,15 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClReshapeKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClReshape::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
     auto k = std::make_unique<kernels::ClReshapeKernel>();
     k->configure(compile_context, src, dst);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp
index 6dab667..0798b19 100644
--- a/src/gpu/cl/operators/ClScale.cpp
+++ b/src/gpu/cl/operators/ClScale.cpp
@@ -29,6 +29,8 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClScaleKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
@@ -36,6 +38,8 @@
 void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+    ARM_COMPUTE_LOG_PARAMS(src, dst, info);
+
     // Configure Scale kernel
     auto k = std::make_unique<kernels::ClScaleKernel>();
     k->set_target(CLScheduler::get().target());
diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp
index 6b728f5..0380955 100644
--- a/src/gpu/cl/operators/ClSoftmax.cpp
+++ b/src/gpu/cl/operators/ClSoftmax.cpp
@@ -30,6 +30,8 @@
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
 #include "support/Cast.h"
 
+#include "src/common/utils/Log.h"
+
 using namespace arm_compute::experimental;
 
 namespace arm_compute
@@ -53,6 +55,7 @@
 void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info));
+    ARM_COMPUTE_LOG_PARAMS(src, dst, info);
 
     const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
 
diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp
index b94fef3..53be04a 100644
--- a/src/gpu/cl/operators/ClSub.cpp
+++ b/src/gpu/cl/operators/ClSub.cpp
@@ -26,6 +26,8 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
@@ -33,6 +35,7 @@
 void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
                       ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info);
     auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
     k->configure(compile_context, ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp
index 6429451..26feffe 100644
--- a/src/gpu/cl/operators/ClTranspose.cpp
+++ b/src/gpu/cl/operators/ClTranspose.cpp
@@ -26,12 +26,15 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClTransposeKernel.h"
 
+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
 namespace opencl
 {
 void ClTranspose::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
 {
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
     auto k = std::make_unique<kernels::ClTransposeKernel>();
     k->configure(compile_context, src, dst);
     _kernel = std::move(k);
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp
index fbf6442..ffa1eff 100644
--- a/src/gpu/cl/operators/ClWinogradConv2d.cpp
+++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp
@@ -36,6 +36,8 @@
 #include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
 #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+
+#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 
 using namespace arm_compute::experimental;
@@ -173,6 +175,8 @@
                                  const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math);
+
     // Get indices for the width and height
     const size_t idx_width  = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);