IVGCVSW-6440 Add profiling around CL workload configure calls

Signed-off-by: Kevin May <kevin.may@arm.com>
Change-Id: I7626d5bd82e832d5be6913719a34d76fbd1dbed8
diff --git a/src/backends/cl/workloads/ClAbsWorkload.cpp b/src/backends/cl/workloads/ClAbsWorkload.cpp
index fa8e4f7..eeaec54 100644
--- a/src/backends/cl/workloads/ClAbsWorkload.cpp
+++ b/src/backends/cl/workloads/ClAbsWorkload.cpp
@@ -33,8 +33,10 @@
 
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
-
-    m_AbsLayer.configure(clCompileContext, &input, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClAbsWorkload_configure");
+        m_AbsLayer.configure(clCompileContext, &input, &output);
+    }
 }
 
 void ClAbsWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClActivationWorkload.cpp b/src/backends/cl/workloads/ClActivationWorkload.cpp
index 20a65b6..229a291 100644
--- a/src/backends/cl/workloads/ClActivationWorkload.cpp
+++ b/src/backends/cl/workloads/ClActivationWorkload.cpp
@@ -47,7 +47,10 @@
 
     arm_compute::ICLTensor& input = static_cast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
-    m_ActivationLayer.configure(clCompileContext, &input, &output, activationLayerInfo);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClActivationWorkload_configure");
+        m_ActivationLayer.configure(clCompileContext, &input, &output, activationLayerInfo);
+    }
 }
 
 void ClActivationWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClAdditionWorkload.cpp b/src/backends/cl/workloads/ClAdditionWorkload.cpp
index 9bef060..55957d7 100644
--- a/src/backends/cl/workloads/ClAdditionWorkload.cpp
+++ b/src/backends/cl/workloads/ClAdditionWorkload.cpp
@@ -30,8 +30,10 @@
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
 
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
-
-    m_Layer.configure(clCompileContext, &input0, &input1, &output, g_AclConvertPolicy, activationInfo);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClAdditionWorkload_configure");
+        m_Layer.configure(clCompileContext, &input0, &input1, &output, g_AclConvertPolicy, activationInfo);
+    }
 }
 
 void ClAdditionWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClArgMinMaxWorkload.cpp b/src/backends/cl/workloads/ClArgMinMaxWorkload.cpp
index 78646a7..0bfb4e2 100644
--- a/src/backends/cl/workloads/ClArgMinMaxWorkload.cpp
+++ b/src/backends/cl/workloads/ClArgMinMaxWorkload.cpp
@@ -70,17 +70,26 @@
     auto unsignedAxis = armnnUtils::GetUnsignedAxis(numDims, m_Data.m_Parameters.m_Axis);
     int aclAxis = armnn::numeric_cast<int>(CalcAclAxis(numDims, unsignedAxis));
 
-    if (m_Data.m_Parameters.m_Function == ArgMinMaxFunction::Max)
     {
-        m_ArgMinMaxLayer.configure(&input, aclAxis, &output, arm_compute::ReductionOperation::ARG_IDX_MAX);
-    }
-    else
-    {
-        m_ArgMinMaxLayer.configure(clCompileContext,
-                                   &input,
-                                   aclAxis,
-                                   &output,
-                                   arm_compute::ReductionOperation::ARG_IDX_MIN);
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClArgMinMaxWorkload_configure");
+        // Pass clCompileContext in both branches so that Max and Min are configured
+        // with the same compile context; only the reduction operation differs.
+        if (m_Data.m_Parameters.m_Function == ArgMinMaxFunction::Max)
+        {
+            m_ArgMinMaxLayer.configure(clCompileContext,
+                                       &input,
+                                       aclAxis,
+                                       &output,
+                                       arm_compute::ReductionOperation::ARG_IDX_MAX);
+        }
+        else
+        {
+            m_ArgMinMaxLayer.configure(clCompileContext,
+                                       &input,
+                                       aclAxis,
+                                       &output,
+                                       arm_compute::ReductionOperation::ARG_IDX_MIN);
+        }
     }
 }
 
diff --git a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
index 8367d7e..fba1679 100644
--- a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
@@ -86,15 +86,18 @@
 
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
 
-    m_Layer.configure(clCompileContext,
-                      &input,
-                      &output,
-                      m_Mean.get(),
-                      m_Variance.get(),
-                      m_Beta.get(),
-                      m_Gamma.get(),
-                      m_Data.m_Parameters.m_Eps,
-                      activationInfo);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClBatchNormalizationFloatWorkload_configure");
+        m_Layer.configure(clCompileContext,
+                          &input,
+                          &output,
+                          m_Mean.get(),
+                          m_Variance.get(),
+                          m_Beta.get(),
+                          m_Gamma.get(),
+                          m_Data.m_Parameters.m_Eps,
+                          activationInfo);
+    }
 
     InitializeArmComputeClTensorData(*m_Mean, m_Data.m_Mean);
     InitializeArmComputeClTensorData(*m_Variance, m_Data.m_Variance);
diff --git a/src/backends/cl/workloads/ClBatchToSpaceNdWorkload.cpp b/src/backends/cl/workloads/ClBatchToSpaceNdWorkload.cpp
index 8eef587..28b408d 100644
--- a/src/backends/cl/workloads/ClBatchToSpaceNdWorkload.cpp
+++ b/src/backends/cl/workloads/ClBatchToSpaceNdWorkload.cpp
@@ -42,7 +42,10 @@
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
     output.info()->set_data_layout(aclDataLayout);
 
-    m_Layer.configure(clCompileContext, &input, blockWidth, blockHeight, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClBatchToSpaceNdWorkload_configure");
+        m_Layer.configure(clCompileContext, &input, blockWidth, blockHeight, &output);
+    }
 }
 
 void ClBatchToSpaceNdWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClCastWorkload.cpp b/src/backends/cl/workloads/ClCastWorkload.cpp
index 07b76dc..9606385 100644
--- a/src/backends/cl/workloads/ClCastWorkload.cpp
+++ b/src/backends/cl/workloads/ClCastWorkload.cpp
@@ -35,7 +35,10 @@
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_CastLayer.configure(clCompileContext, &input, &output, g_AclConvertPolicy);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClCastWorkload_configure");
+        m_CastLayer.configure(clCompileContext, &input, &output, g_AclConvertPolicy);
+    }
 }
 
 void ClCastWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClChannelShuffleWorkload.cpp b/src/backends/cl/workloads/ClChannelShuffleWorkload.cpp
index 751056a..5d3e66c 100644
--- a/src/backends/cl/workloads/ClChannelShuffleWorkload.cpp
+++ b/src/backends/cl/workloads/ClChannelShuffleWorkload.cpp
@@ -86,7 +86,10 @@
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
 
-    m_ChannelShuffleLayer.configure(clCompileContext, &input, &output, descriptor.m_Parameters.m_NumGroups);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClChannelShuffleWorkload_configure");
+        m_ChannelShuffleLayer.configure(clCompileContext, &input, &output, descriptor.m_Parameters.m_NumGroups);
+    }
 }
 
 void ClChannelShuffleWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClComparisonWorkload.cpp b/src/backends/cl/workloads/ClComparisonWorkload.cpp
index d83682d..3d59e08 100644
--- a/src/backends/cl/workloads/ClComparisonWorkload.cpp
+++ b/src/backends/cl/workloads/ClComparisonWorkload.cpp
@@ -58,7 +58,10 @@
 
     const arm_compute::ComparisonOperation comparisonOperation = ConvertComparisonOperationToAcl(m_Data.m_Parameters);
 
-    m_ComparisonLayer.configure(clCompileContext, &input0, &input1, &output, comparisonOperation);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClComparisonWorkload_configure");
+        m_ComparisonLayer.configure(clCompileContext, &input0, &input1, &output, comparisonOperation);
+    }
 }
 
 void ClComparisonWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClConcatWorkload.cpp b/src/backends/cl/workloads/ClConcatWorkload.cpp
index 233fd19..58983c8 100644
--- a/src/backends/cl/workloads/ClConcatWorkload.cpp
+++ b/src/backends/cl/workloads/ClConcatWorkload.cpp
@@ -88,9 +88,12 @@
     // Create the layer function
     auto layer = std::make_unique<arm_compute::CLConcatenateLayer>();
 
-    // Configure input and output tensors
-    size_t aclAxis = CalcAxis(descriptor.m_Parameters);
-    layer->configure(clCompileContext, aclInputs, &output, aclAxis);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConcatWorkload_configure");
+        // Configure input and output tensors
+        size_t aclAxis = CalcAxis(descriptor.m_Parameters);
+        layer->configure(clCompileContext, aclInputs, &output, aclAxis);
+    }
 
     // Prepare
     layer->prepare();
diff --git a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
index 455ec1a..ccea7c8 100644
--- a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
+++ b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
@@ -25,7 +25,10 @@
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(clCompileContext, &input, &output, g_AclConvertPolicy, 0);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvertFp16ToFp32Workload_configure");
+        m_Layer.configure(clCompileContext, &input, &output, g_AclConvertPolicy, 0);
+    }
 }
 
 void ClConvertFp16ToFp32Workload::Execute() const
diff --git a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
index 8e6b0ce..9b38b22 100644
--- a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
+++ b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
@@ -25,7 +25,10 @@
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(clCompileContext, &input, &output, g_AclConvertPolicy, 0);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvertFp32ToFp16Workload_configure");
+        m_Layer.configure(clCompileContext, &input, &output, g_AclConvertPolicy, 0);
+    }
 }
 
 void ClConvertFp32ToFp16Workload::Execute() const
diff --git a/src/backends/cl/workloads/ClConvolution3dWorkload.cpp b/src/backends/cl/workloads/ClConvolution3dWorkload.cpp
index 18a2c31..baa2f05 100644
--- a/src/backends/cl/workloads/ClConvolution3dWorkload.cpp
+++ b/src/backends/cl/workloads/ClConvolution3dWorkload.cpp
@@ -83,13 +83,15 @@
     const arm_compute::Conv3dInfo aclConv3DInfo = ComputeConv3DInfo(descriptor,
                                                                     isFastMathEnabled);
 
-    m_ConvolutionLayer.configure(clCompileContext,
-                                 &input,
-                                 &weights,
-                                 biasesPtr,
-                                 &output,
-                                 aclConv3DInfo);
-
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvolution3dWorkload_configure");
+        m_ConvolutionLayer.configure(clCompileContext,
+                                     &input,
+                                     &weights,
+                                     biasesPtr,
+                                     &output,
+                                     aclConv3DInfo);
+    }
      // Add details for profiling output
     WorkloadInfo detailsInfo;
 
diff --git a/src/backends/cl/workloads/ClDepthToSpaceWorkload.cpp b/src/backends/cl/workloads/ClDepthToSpaceWorkload.cpp
index aeab029..75a87c7 100644
--- a/src/backends/cl/workloads/ClDepthToSpaceWorkload.cpp
+++ b/src/backends/cl/workloads/ClDepthToSpaceWorkload.cpp
@@ -61,7 +61,10 @@
         PolymorphicPointerDowncast<IClTensorHandle>(m_Data.m_Outputs[0])->GetTensor();
     output.info()->set_data_layout(aclDataLayout);
 
-    m_Layer.configure(clCompileContext, &input, &output, blockSize);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClDepthToSpaceWorkload_configure");
+        m_Layer.configure(clCompileContext, &input, &output, blockSize);
+    }
 }
 
 void ClDepthToSpaceWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
index 9592b37..91c0018 100644
--- a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
+++ b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
@@ -135,17 +135,20 @@
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
 
     m_DepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
-    static_cast<arm_compute::CLDepthwiseConvolutionLayer*>(m_DepthwiseConvolutionLayer.get())->configure(
-        clCompileContext,
-        &input,
-        m_KernelTensor.get(),
-        m_BiasTensor.get(),
-        &output,
-        padStrideInfo,
-        depthMultiplier,
-        activationInfo,
-        aclDilationInfo);
 
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClDepthwiseConvolutionWorkload_configure");
+        static_cast<arm_compute::CLDepthwiseConvolutionLayer*>(m_DepthwiseConvolutionLayer.get())->configure(
+                clCompileContext,
+                &input,
+                m_KernelTensor.get(),
+                m_BiasTensor.get(),
+                &output,
+                padStrideInfo,
+                depthMultiplier,
+                activationInfo,
+                aclDilationInfo);
+    }
     ARMNN_ASSERT(m_DepthwiseConvolutionLayer);
 
     ScopedTensorHandle weightsPermutedHandle(weightPermuted);
diff --git a/src/backends/cl/workloads/ClDequantizeWorkload.cpp b/src/backends/cl/workloads/ClDequantizeWorkload.cpp
index 6bdeaa8..00d849c 100644
--- a/src/backends/cl/workloads/ClDequantizeWorkload.cpp
+++ b/src/backends/cl/workloads/ClDequantizeWorkload.cpp
@@ -41,7 +41,10 @@
             m_Data.m_Outputs[0])->GetTensor();
 
     m_Layer.reset(new arm_compute::CLDequantizationLayer());
-    m_Layer->configure(clCompileContext, &input, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClDequantizeWorkload_configure");
+        m_Layer->configure(clCompileContext, &input, &output);
+    }
     m_Layer->prepare();
 }
 
diff --git a/src/backends/cl/workloads/ClDivisionWorkload.cpp b/src/backends/cl/workloads/ClDivisionWorkload.cpp
index d444a19..5df4c61 100644
--- a/src/backends/cl/workloads/ClDivisionWorkload.cpp
+++ b/src/backends/cl/workloads/ClDivisionWorkload.cpp
@@ -44,7 +44,10 @@
 
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
 
-    m_ArithmeticDivision.configure(clCompileContext, &input0, &input1, &output, activationInfo);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClDivisionWorkload_configure");
+        m_ArithmeticDivision.configure(clCompileContext, &input0, &input1, &output, activationInfo);
+    }
 }
 
 void ClDivisionWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClExpWorkload.cpp b/src/backends/cl/workloads/ClExpWorkload.cpp
index 9c1f036..eeb6637 100644
--- a/src/backends/cl/workloads/ClExpWorkload.cpp
+++ b/src/backends/cl/workloads/ClExpWorkload.cpp
@@ -39,7 +39,10 @@
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_ExpLayer.configure(clCompileContext, &input, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClExpWorkload_configure");
+        m_ExpLayer.configure(clCompileContext, &input, &output);
+    }
 }
 
 void ClExpWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClFillWorkload.cpp b/src/backends/cl/workloads/ClFillWorkload.cpp
index ea42dcf..2f95bc5 100644
--- a/src/backends/cl/workloads/ClFillWorkload.cpp
+++ b/src/backends/cl/workloads/ClFillWorkload.cpp
@@ -31,7 +31,10 @@
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
     arm_compute::PixelValue pixelValue = GetPixelValue(output.info(), descriptor.m_Parameters.m_Value);
 
-    m_Layer.configure(clCompileContext, &output, pixelValue);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClFillWorkload_configure");
+        m_Layer.configure(clCompileContext, &output, pixelValue);
+    }
 }
 
 void ClFillWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClFloorFloatWorkload.cpp b/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
index d2b4871..5db8cc6 100644
--- a/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
@@ -30,7 +30,10 @@
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(clCompileContext, &input, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClFloorFloatWorkload_configure");
+        m_Layer.configure(clCompileContext, &input, &output);
+    }
 }
 
 void ClFloorFloatWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp b/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp
index a0889e1..cc4ce90 100644
--- a/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp
+++ b/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp
@@ -88,12 +88,15 @@
     arm_compute::FullyConnectedLayerInfo fc_info =
         ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor.m_Parameters, activationInfo);
 
-    m_FullyConnectedLayer.configure(clCompileContext,
-                                    &input,
-                                    m_WeightsTensor.get(),
-                                    m_BiasesTensor.get(),
-                                    &output,
-                                    fc_info);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClFullyConnectedWorkload_configure");
+        m_FullyConnectedLayer.configure(clCompileContext,
+                                        &input,
+                                        m_WeightsTensor.get(),
+                                        m_BiasesTensor.get(),
+                                        &output,
+                                        fc_info);
+    }
 
     InitializeArmComputeClTensorData(*m_WeightsTensor, m_Data.m_Weight);
 
diff --git a/src/backends/cl/workloads/ClGatherWorkload.cpp b/src/backends/cl/workloads/ClGatherWorkload.cpp
index 7c8d1ab..b2341b8 100644
--- a/src/backends/cl/workloads/ClGatherWorkload.cpp
+++ b/src/backends/cl/workloads/ClGatherWorkload.cpp
@@ -45,7 +45,10 @@
 
     int aclAxis = ComputeAclAxis(descriptor.m_Parameters.m_Axis, info.m_InputTensorInfos[0]);
 
-    m_Layer.configure(clCompileContext, &input, &indices, &output, aclAxis);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClGatherWorkload_configure");
+        m_Layer.configure(clCompileContext, &input, &indices, &output, aclAxis);
+    }
 };
 
 void ClGatherWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClInstanceNormalizationWorkload.cpp b/src/backends/cl/workloads/ClInstanceNormalizationWorkload.cpp
index a4f20c5..58e65dd 100644
--- a/src/backends/cl/workloads/ClInstanceNormalizationWorkload.cpp
+++ b/src/backends/cl/workloads/ClInstanceNormalizationWorkload.cpp
@@ -50,12 +50,15 @@
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
 
-    m_Layer.configure(clCompileContext,
-                      &input,
-                      &output,
-                      descriptor.m_Parameters.m_Gamma,
-                      descriptor.m_Parameters.m_Beta,
-                      descriptor.m_Parameters.m_Eps);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClInstanceNormalizationWorkload_configure");
+        m_Layer.configure(clCompileContext,
+                          &input,
+                          &output,
+                          descriptor.m_Parameters.m_Gamma,
+                          descriptor.m_Parameters.m_Beta,
+                          descriptor.m_Parameters.m_Eps);
+    }
 };
 
 void ClInstanceNormalizationWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
index 953ff4a..3b20ace 100644
--- a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
@@ -48,7 +48,10 @@
 
     int axis = (m_Data.m_Parameters.m_DataLayout == DataLayout::NCHW) ? 2 : 0;
 
-    m_Layer.configure(clCompileContext, &input, &output, axis, m_Data.m_Parameters.m_Eps);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClL2NormalizationFloatWorkload_configure");
+        m_Layer.configure(clCompileContext, &input, &output, axis, m_Data.m_Parameters.m_Eps);
+    }
 }
 
 void ClL2NormalizationFloatWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClLogSoftmaxWorkload.cpp b/src/backends/cl/workloads/ClLogSoftmaxWorkload.cpp
index 6c03211..b75c6b0 100644
--- a/src/backends/cl/workloads/ClLogSoftmaxWorkload.cpp
+++ b/src/backends/cl/workloads/ClLogSoftmaxWorkload.cpp
@@ -44,7 +44,11 @@
     arm_compute::ICLTensor& output = static_cast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
     int aclAxis = ComputeAclAxis(m_Data.m_Parameters.m_Axis, info.m_InputTensorInfos[0]);
-    m_LogSoftmaxLayer.configure(clCompileContext, &input, &output, m_Data.m_Parameters.m_Beta, aclAxis);
+
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClLogSoftmaxWorkload_configure");
+        m_LogSoftmaxLayer.configure(clCompileContext, &input, &output, m_Data.m_Parameters.m_Beta, aclAxis);
+    }
 }
 
 void ClLogSoftmaxWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClLogWorkload.cpp b/src/backends/cl/workloads/ClLogWorkload.cpp
index 180c0af..d13a0ea 100644
--- a/src/backends/cl/workloads/ClLogWorkload.cpp
+++ b/src/backends/cl/workloads/ClLogWorkload.cpp
@@ -33,7 +33,10 @@
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_LogLayer.configure(clCompileContext, &input, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClLogWorkload_configure");
+        m_LogLayer.configure(clCompileContext, &input, &output);
+    }
 }
 
 void ClLogWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClLogicalAndWorkload.cpp b/src/backends/cl/workloads/ClLogicalAndWorkload.cpp
index 30a187b..481d87c 100644
--- a/src/backends/cl/workloads/ClLogicalAndWorkload.cpp
+++ b/src/backends/cl/workloads/ClLogicalAndWorkload.cpp
@@ -48,7 +48,10 @@
     arm_compute::ICLTensor& input1 = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_LogicalAndLayer.configure(clCompileContext, &input0, &input1, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClLogicalAndWorkload_configure");
+        m_LogicalAndLayer.configure(clCompileContext, &input0, &input1, &output);
+    }
 }
 
 void ClLogicalAndWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClLogicalNotWorkload.cpp b/src/backends/cl/workloads/ClLogicalNotWorkload.cpp
index 4e95fcd..c61f844 100644
--- a/src/backends/cl/workloads/ClLogicalNotWorkload.cpp
+++ b/src/backends/cl/workloads/ClLogicalNotWorkload.cpp
@@ -44,7 +44,10 @@
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_LogicalNotLayer.configure(clCompileContext, &input, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClLogicalNotWorkload_configure");
+        m_LogicalNotLayer.configure(clCompileContext, &input, &output);
+    }
 }
 
 void ClLogicalNotWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClLogicalOrWorkload.cpp b/src/backends/cl/workloads/ClLogicalOrWorkload.cpp
index b4eb11c..307af20 100644
--- a/src/backends/cl/workloads/ClLogicalOrWorkload.cpp
+++ b/src/backends/cl/workloads/ClLogicalOrWorkload.cpp
@@ -48,7 +48,10 @@
     arm_compute::ICLTensor& input1 = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_LogicalOrLayer.configure(clCompileContext, &input0, &input1, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClLogicalOrWorkload_configure");
+        m_LogicalOrLayer.configure(clCompileContext, &input0, &input1, &output);
+    }
 }
 
 void ClLogicalOrWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
index 709b145..9cbbff3 100644
--- a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
@@ -193,14 +193,17 @@
         throw armnn::Exception("Wrong Type of Activation Function!");
     }
 
-    m_LstmLayer.configure(clCompileContext, &input, m_InputToForgetWeightsTensor.get(),
-                          m_InputToCellWeightsTensor.get(), m_InputToOutputWeightsTensor.get(),
-                          m_RecurrentToForgetWeightsTensor.get(), m_RecurrentToCellWeightsTensor.get(),
-                          m_RecurrentToOutputWeightsTensor.get(), m_ForgetGateBiasTensor.get(),
-                          m_CellBiasTensor.get(), m_OutputGateBiasTensor.get(), &output_state_in,
-                          &cell_state_in, m_ScratchBuffer.get(), &output_state_out,
-                          &cell_state_out, &output, lstm_param, activationLayerInfo,
-                          cell_threshold, projection_threshold);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClLstmFloatWorkload_configure");
+        m_LstmLayer.configure(clCompileContext, &input, m_InputToForgetWeightsTensor.get(),
+                              m_InputToCellWeightsTensor.get(), m_InputToOutputWeightsTensor.get(),
+                              m_RecurrentToForgetWeightsTensor.get(), m_RecurrentToCellWeightsTensor.get(),
+                              m_RecurrentToOutputWeightsTensor.get(), m_ForgetGateBiasTensor.get(),
+                              m_CellBiasTensor.get(), m_OutputGateBiasTensor.get(), &output_state_in,
+                              &cell_state_in, m_ScratchBuffer.get(), &output_state_out,
+                              &cell_state_out, &output, lstm_param, activationLayerInfo,
+                              cell_threshold, projection_threshold);
+    }
 
     armcomputetensorutils::InitialiseArmComputeTensorEmpty(*m_ScratchBuffer);
 
diff --git a/src/backends/cl/workloads/ClMaximumWorkload.cpp b/src/backends/cl/workloads/ClMaximumWorkload.cpp
index 5a19c69..f10c609 100644
--- a/src/backends/cl/workloads/ClMaximumWorkload.cpp
+++ b/src/backends/cl/workloads/ClMaximumWorkload.cpp
@@ -47,7 +47,10 @@
     arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_MaximumLayer.configure(clCompileContext, &input0, &input1, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClMaximumWorkload_configure");
+        m_MaximumLayer.configure(clCompileContext, &input0, &input1, &output);
+    }
 }
 
 void ClMaximumWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClMeanWorkload.cpp b/src/backends/cl/workloads/ClMeanWorkload.cpp
index cd79d04..074b4b2 100644
--- a/src/backends/cl/workloads/ClMeanWorkload.cpp
+++ b/src/backends/cl/workloads/ClMeanWorkload.cpp
@@ -47,7 +47,10 @@
                                                                           info.m_InputTensorInfos[0].GetNumDimensions(),
                                                                           m_Data.m_Parameters.m_Axis);
 
-    m_Layer.configure(clCompileContext, &input, coords, m_Data.m_Parameters.m_KeepDims, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClMeanWorkload_configure");
+        m_Layer.configure(clCompileContext, &input, coords, m_Data.m_Parameters.m_KeepDims, &output);
+    }
 }
 
 void ClMeanWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClMinimumWorkload.cpp b/src/backends/cl/workloads/ClMinimumWorkload.cpp
index 22e9287..d29dcc2 100644
--- a/src/backends/cl/workloads/ClMinimumWorkload.cpp
+++ b/src/backends/cl/workloads/ClMinimumWorkload.cpp
@@ -47,7 +47,10 @@
     arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_MinimumLayer.configure(clCompileContext, &input0, &input1, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClMinimumWorkload_configure");
+        m_MinimumLayer.configure(clCompileContext, &input0, &input1, &output);
+    }
 }
 
 void ClMinimumWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClMultiplicationWorkload.cpp b/src/backends/cl/workloads/ClMultiplicationWorkload.cpp
index b0b71ce..e19a7a2 100644
--- a/src/backends/cl/workloads/ClMultiplicationWorkload.cpp
+++ b/src/backends/cl/workloads/ClMultiplicationWorkload.cpp
@@ -62,15 +62,18 @@
 
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
 
-    // Construct
-    m_PixelWiseMultiplication.configure(clCompileContext,
-                                        &input0,
-                                        &input1,
-                                        &output,
-                                        1.0f,
-                                        convertPolicy,
-                                        arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
-                                        activationInfo);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClMultiplicationWorkload_configure");
+        // Construct
+        m_PixelWiseMultiplication.configure(clCompileContext,
+                                            &input0,
+                                            &input1,
+                                            &output,
+                                            1.0f,
+                                            convertPolicy,
+                                            arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
+                                            activationInfo);
+    }
 }
 
 void ClMultiplicationWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClNegWorkload.cpp b/src/backends/cl/workloads/ClNegWorkload.cpp
index fb5b040..c606189 100644
--- a/src/backends/cl/workloads/ClNegWorkload.cpp
+++ b/src/backends/cl/workloads/ClNegWorkload.cpp
@@ -33,7 +33,10 @@
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_NegLayer.configure(clCompileContext, &input, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClNegWorkload_configure");
+        m_NegLayer.configure(clCompileContext, &input, &output);
+    }
 }
 
 void ClNegWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
index 9c6e0a1..9234a8a 100644
--- a/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
@@ -50,7 +50,10 @@
 
     arm_compute::NormalizationLayerInfo normalizationInfo = BuildArmComputeNormalizationLayerInfo(m_Data.m_Parameters);
 
-    m_NormalizationLayer.configure(clCompileContext, &input, &output, normalizationInfo);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClNormalizationFloatWorkload_configure");
+        m_NormalizationLayer.configure(clCompileContext, &input, &output, normalizationInfo);
+    }
 };
 
 void ClNormalizationFloatWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClPadWorkload.cpp b/src/backends/cl/workloads/ClPadWorkload.cpp
index 4697510..48d61b0 100644
--- a/src/backends/cl/workloads/ClPadWorkload.cpp
+++ b/src/backends/cl/workloads/ClPadWorkload.cpp
@@ -41,7 +41,10 @@
 
     arm_compute::PixelValue pixelValue = GetPixelValue(input.info(), descriptor.m_Parameters.m_PadValue);
 
-    m_Layer.configure(clCompileContext, &input, &output, padList, pixelValue);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClPadWorkload_configure");
+        m_Layer.configure(clCompileContext, &input, &output, padList, pixelValue);
+    }
 }
 
 void ClPadWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClPermuteWorkload.cpp b/src/backends/cl/workloads/ClPermuteWorkload.cpp
index c7efe7a..641e871 100644
--- a/src/backends/cl/workloads/ClPermuteWorkload.cpp
+++ b/src/backends/cl/workloads/ClPermuteWorkload.cpp
@@ -45,8 +45,11 @@
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
     const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
 
-    // Run the layer.
-    m_PermuteFunction.configure(clCompileContext, &input, &output, BuildArmComputePermutationVector(mappings));
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClPermuteWorkload_configure");
+        // Configure the layer.
+        m_PermuteFunction.configure(clCompileContext, &input, &output, BuildArmComputePermutationVector(mappings));
+    }
 }
 
 void ClPermuteWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClPooling2dWorkload.cpp b/src/backends/cl/workloads/ClPooling2dWorkload.cpp
index ff441ef..f967c6d 100644
--- a/src/backends/cl/workloads/ClPooling2dWorkload.cpp
+++ b/src/backends/cl/workloads/ClPooling2dWorkload.cpp
@@ -55,8 +55,11 @@
 
     arm_compute::PoolingLayerInfo layerInfo = BuildArmComputePoolingLayerInfo(m_Data.m_Parameters, fpMixedPrecision);
 
-    // Run the layer.
-    m_PoolingLayer.configure(clCompileContext, &input, &output, layerInfo);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClPooling2dWorkload_configure");
+        // Configure the layer.
+        m_PoolingLayer.configure(clCompileContext, &input, &output, layerInfo);
+    }
 }
 
 void ClPooling2dWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClPreluWorkload.cpp b/src/backends/cl/workloads/ClPreluWorkload.cpp
index beb9e43..449e4de 100644
--- a/src/backends/cl/workloads/ClPreluWorkload.cpp
+++ b/src/backends/cl/workloads/ClPreluWorkload.cpp
@@ -37,7 +37,10 @@
     arm_compute::ICLTensor& alpha = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_PreluLayer.configure(clCompileContext, &input, &alpha, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClPreluWorkload_configure");
+        m_PreluLayer.configure(clCompileContext, &input, &alpha, &output);
+    }
 }
 
 void ClPreluWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClQLstmWorkload.cpp b/src/backends/cl/workloads/ClQLstmWorkload.cpp
index d7c7af7..b2c1d6d 100644
--- a/src/backends/cl/workloads/ClQLstmWorkload.cpp
+++ b/src/backends/cl/workloads/ClQLstmWorkload.cpp
@@ -158,24 +158,27 @@
                                         m_Data.m_Parameters.m_CellIntermediateScale,
                                         m_Data.m_Parameters.m_OutputIntermediateScale);
 
-    // QLSTM CL configure
-    m_QLstmLayer.configure(clCompileContext,
-                           &input,
-                           m_InputToForgetWeightsTensor.get(),
-                           m_InputToCellWeightsTensor.get(),
-                           m_InputToOutputWeightsTensor.get(),
-                           m_RecurrentToForgetWeightsTensor.get(),
-                           m_RecurrentToCellWeightsTensor.get(),
-                           m_RecurrentToOutputWeightsTensor.get(),
-                           m_ForgetGateBiasTensor.get(),
-                           m_CellBiasTensor.get(),
-                           m_OutputGateBiasTensor.get(),
-                           &cellStateIn,
-                           &outputStateIn,
-                           &cellStateOut,
-                           &outputStateOut,
-                           &output,
-                           qLstmParams);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClQLstmWorkload_configure");
+        // QLSTM CL configure
+        m_QLstmLayer.configure(clCompileContext,
+                               &input,
+                               m_InputToForgetWeightsTensor.get(),
+                               m_InputToCellWeightsTensor.get(),
+                               m_InputToOutputWeightsTensor.get(),
+                               m_RecurrentToForgetWeightsTensor.get(),
+                               m_RecurrentToCellWeightsTensor.get(),
+                               m_RecurrentToOutputWeightsTensor.get(),
+                               m_ForgetGateBiasTensor.get(),
+                               m_CellBiasTensor.get(),
+                               m_OutputGateBiasTensor.get(),
+                               &cellStateIn,
+                               &outputStateIn,
+                               &cellStateOut,
+                               &outputStateOut,
+                               &output,
+                               qLstmParams);
+    }
 
     // Initialise ACL tensor data for mandatory params
     InitializeArmComputeClTensorData(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights);
diff --git a/src/backends/cl/workloads/ClQuantizeWorkload.cpp b/src/backends/cl/workloads/ClQuantizeWorkload.cpp
index dc668fd..5321e62 100644
--- a/src/backends/cl/workloads/ClQuantizeWorkload.cpp
+++ b/src/backends/cl/workloads/ClQuantizeWorkload.cpp
@@ -39,7 +39,10 @@
     arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(clCompileContext, &input, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClQuantizeWorkload_configure");
+        m_Layer.configure(clCompileContext, &input, &output);
+    }
 }
 
 void ClQuantizeWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClQuantizedLstmWorkload.cpp b/src/backends/cl/workloads/ClQuantizedLstmWorkload.cpp
index 7bacf70..05ae89c 100644
--- a/src/backends/cl/workloads/ClQuantizedLstmWorkload.cpp
+++ b/src/backends/cl/workloads/ClQuantizedLstmWorkload.cpp
@@ -109,14 +109,18 @@
     arm_compute::ICLTensor& cellStateOutTensor        = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
     arm_compute::ICLTensor& outputStateOutTensor      = static_cast<IClTensorHandle*>(m_Data.m_Outputs[1])->GetTensor();
 
-    m_QuantizedLstmLayer.configure(clCompileContext, &inputTensor, m_InputToInputWeightsTensor.get(),
-                                   m_InputToForgetWeightsTensor.get(),
-                                   m_InputToCellWeightsTensor.get(), m_InputToOutputWeightsTensor.get(),
-                                   m_RecurrentToInputWeightsTensor.get(), m_RecurrentToForgetWeightsTensor.get(),
-                                   m_RecurrentToCellWeightsTensor.get(), m_RecurrentToOutputWeightsTensor.get(),
-                                   m_InputGateBiasTensor.get(), m_ForgetGateBiasTensor.get(), m_CellBiasTensor.get(),
-                                   m_OutputGateBiasTensor.get(), &cellStateInTensor, &outputStateInTensor,
-                                   &cellStateOutTensor, &outputStateOutTensor);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClQuantizedLstmWorkload_configure");
+        m_QuantizedLstmLayer.configure(clCompileContext, &inputTensor, m_InputToInputWeightsTensor.get(),
+                                       m_InputToForgetWeightsTensor.get(),
+                                       m_InputToCellWeightsTensor.get(), m_InputToOutputWeightsTensor.get(),
+                                       m_RecurrentToInputWeightsTensor.get(), m_RecurrentToForgetWeightsTensor.get(),
+                                       m_RecurrentToCellWeightsTensor.get(), m_RecurrentToOutputWeightsTensor.get(),
+                                       m_InputGateBiasTensor.get(), m_ForgetGateBiasTensor.get(),
+                                       m_CellBiasTensor.get(),
+                                       m_OutputGateBiasTensor.get(), &cellStateInTensor, &outputStateInTensor,
+                                       &cellStateOutTensor, &outputStateOutTensor);
+    }
 
     InitializeArmComputeClTensorData(*m_InputToInputWeightsTensor,      m_Data.m_InputToInputWeights);
     InitializeArmComputeClTensorData(*m_InputToForgetWeightsTensor,     m_Data.m_InputToForgetWeights);
diff --git a/src/backends/cl/workloads/ClReduceWorkload.cpp b/src/backends/cl/workloads/ClReduceWorkload.cpp
index 1a7bc64..b5f1029 100644
--- a/src/backends/cl/workloads/ClReduceWorkload.cpp
+++ b/src/backends/cl/workloads/ClReduceWorkload.cpp
@@ -60,11 +60,14 @@
     arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(input.info()->num_dimensions(),
                                                                           info.m_InputTensorInfos[0].GetNumDimensions(),
                                                                           m_Data.m_Parameters.m_vAxis);
-    m_Layer.configure(&input,
-                      &output,
-                      static_cast<unsigned int>(coords[0]),
-                      ConvertReductionOperationToAcl(m_Data.m_Parameters),
-                      m_Data.m_Parameters.m_KeepDims);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClReduceWorkload_configure");
+        m_Layer.configure(&input,
+                          &output,
+                          static_cast<unsigned int>(coords[0]),
+                          ConvertReductionOperationToAcl(m_Data.m_Parameters),
+                          m_Data.m_Parameters.m_KeepDims);
+    }
 }
 
 void ClReduceWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClReshapeWorkload.cpp b/src/backends/cl/workloads/ClReshapeWorkload.cpp
index b9b92a8..ece3166 100644
--- a/src/backends/cl/workloads/ClReshapeWorkload.cpp
+++ b/src/backends/cl/workloads/ClReshapeWorkload.cpp
@@ -31,7 +31,10 @@
     arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(clCompileContext, &input, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClReshapeWorkload_configure");
+        m_Layer.configure(clCompileContext, &input, &output);
+    }
 }
 
 void ClReshapeWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClResizeWorkload.cpp b/src/backends/cl/workloads/ClResizeWorkload.cpp
index 0c2b930..8121429 100644
--- a/src/backends/cl/workloads/ClResizeWorkload.cpp
+++ b/src/backends/cl/workloads/ClResizeWorkload.cpp
@@ -73,15 +73,18 @@
                                                  ? arm_compute::SamplingPolicy::CENTER
                                                  : arm_compute::SamplingPolicy::TOP_LEFT;
 
-    m_ResizeLayer.configure(clCompileContext,
-                            &input,
-                            &output,
-                            arm_compute::ScaleKernelInfo(aclInterpolationPolicy,
-                                                         arm_compute::BorderMode::REPLICATE,
-                                                         arm_compute::PixelValue(0.f),
-                                                         samplingPolicy,
-                                                         true,
-                                                         descriptor.m_Parameters.m_AlignCorners));
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClResizeWorkload_configure");
+        m_ResizeLayer.configure(clCompileContext,
+                                &input,
+                                &output,
+                                arm_compute::ScaleKernelInfo(aclInterpolationPolicy,
+                                                             arm_compute::BorderMode::REPLICATE,
+                                                             arm_compute::PixelValue(0.f),
+                                                             samplingPolicy,
+                                                             true,
+                                                             descriptor.m_Parameters.m_AlignCorners));
+    }
 
 };
 
diff --git a/src/backends/cl/workloads/ClRsqrtWorkload.cpp b/src/backends/cl/workloads/ClRsqrtWorkload.cpp
index 8d48bfa..b8ae2f6 100644
--- a/src/backends/cl/workloads/ClRsqrtWorkload.cpp
+++ b/src/backends/cl/workloads/ClRsqrtWorkload.cpp
@@ -33,7 +33,10 @@
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_RsqrtLayer.configure(clCompileContext, &input, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClRsqrtWorkload_configure");
+        m_RsqrtLayer.configure(clCompileContext, &input, &output);
+    }
 }
 
 void ClRsqrtWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClSinWorkload.cpp b/src/backends/cl/workloads/ClSinWorkload.cpp
index dcde349..2989ac9 100644
--- a/src/backends/cl/workloads/ClSinWorkload.cpp
+++ b/src/backends/cl/workloads/ClSinWorkload.cpp
@@ -33,7 +33,10 @@
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_SinLayer.configure(clCompileContext, &input, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClSinWorkload_configure");
+        m_SinLayer.configure(clCompileContext, &input, &output);
+    }
 }
 
 void ClSinWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClSliceWorkload.cpp b/src/backends/cl/workloads/ClSliceWorkload.cpp
index 6f3c1a9..f92bb37 100644
--- a/src/backends/cl/workloads/ClSliceWorkload.cpp
+++ b/src/backends/cl/workloads/ClSliceWorkload.cpp
@@ -51,7 +51,10 @@
 
     std::tie(starts, ends) = SetClSliceData(m_Data.m_Parameters.m_Begin, m_Data.m_Parameters.m_Size);
 
-    m_SliceFunction.configure(clCompileContext, &input, &output, starts, ends);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClSliceWorkload_configure");
+        m_SliceFunction.configure(clCompileContext, &input, &output, starts, ends);
+    }
 }
 
 void ClSliceWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClSoftmaxWorkload.cpp b/src/backends/cl/workloads/ClSoftmaxWorkload.cpp
index 0b7b10d..39684d8 100644
--- a/src/backends/cl/workloads/ClSoftmaxWorkload.cpp
+++ b/src/backends/cl/workloads/ClSoftmaxWorkload.cpp
@@ -44,7 +44,10 @@
     arm_compute::ICLTensor& output = static_cast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
     int aclAxis = ComputeAclAxis(m_Data.m_Parameters.m_Axis, info.m_InputTensorInfos[0]);
-    m_SoftmaxLayer.configure(clCompileContext, &input, &output, m_Data.m_Parameters.m_Beta, aclAxis);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClSoftmaxWorkload_configure");
+        m_SoftmaxLayer.configure(clCompileContext, &input, &output, m_Data.m_Parameters.m_Beta, aclAxis);
+    }
 }
 
 void ClSoftmaxWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClSpaceToBatchNdWorkload.cpp b/src/backends/cl/workloads/ClSpaceToBatchNdWorkload.cpp
index 7016619..2bdfb38 100644
--- a/src/backends/cl/workloads/ClSpaceToBatchNdWorkload.cpp
+++ b/src/backends/cl/workloads/ClSpaceToBatchNdWorkload.cpp
@@ -76,13 +76,16 @@
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
 
-    m_SpaceToBatchLayer.configure(clCompileContext,
-                                  &input,
-                                  blockWidth,
-                                  blockHeight,
-                                  paddingLeftTop,
-                                  paddingRightBottom,
-                                  &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClSpaceToBatchNdWorkload_configure");
+        m_SpaceToBatchLayer.configure(clCompileContext,
+                                      &input,
+                                      blockWidth,
+                                      blockHeight,
+                                      paddingLeftTop,
+                                      paddingRightBottom,
+                                      &output);
+    }
 }
 
 void ClSpaceToBatchNdWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClSpaceToDepthWorkload.cpp b/src/backends/cl/workloads/ClSpaceToDepthWorkload.cpp
index 119605a..a2c9026 100644
--- a/src/backends/cl/workloads/ClSpaceToDepthWorkload.cpp
+++ b/src/backends/cl/workloads/ClSpaceToDepthWorkload.cpp
@@ -39,7 +39,10 @@
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
     output.info()->set_data_layout(aclDataLayout);
 
-    m_Layer.configure(clCompileContext, &input, &output, blockSize);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClSpaceToDepthWorkload_configure");
+        m_Layer.configure(clCompileContext, &input, &output, blockSize);
+    }
 }
 
 void ClSpaceToDepthWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClSplitterWorkload.cpp b/src/backends/cl/workloads/ClSplitterWorkload.cpp
index b1ab17d..a7d8a1a 100644
--- a/src/backends/cl/workloads/ClSplitterWorkload.cpp
+++ b/src/backends/cl/workloads/ClSplitterWorkload.cpp
@@ -102,7 +102,10 @@
 
     unsigned int aclAxis = CalcAclAxis(descriptor.m_Parameters.GetNumDimensions(), *splitAxis.begin());
     auto layer = std::make_unique<arm_compute::CLSplit>();
-    layer->configure(&input, aclOutputs, aclAxis);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClSplitterWorkload_configure");
+        layer->configure(&input, aclOutputs, aclAxis);
+    }
 
     // Prepare
     layer->prepare();
diff --git a/src/backends/cl/workloads/ClStackWorkload.cpp b/src/backends/cl/workloads/ClStackWorkload.cpp
index 5070356..75842a2 100644
--- a/src/backends/cl/workloads/ClStackWorkload.cpp
+++ b/src/backends/cl/workloads/ClStackWorkload.cpp
@@ -66,7 +66,10 @@
 
     m_Layer.reset(new arm_compute::CLStackLayer());
     int aclAxis = CalcAxis(descriptor.m_Parameters.m_Axis, descriptor.m_Parameters.m_InputShape.GetNumDimensions());
-    m_Layer->configure(clCompileContext, aclInputs, aclAxis, &output);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClStackWorkload_configure");
+        m_Layer->configure(clCompileContext, aclInputs, aclAxis, &output);
+    }
 }
 
 void ClStackWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClStridedSliceWorkload.cpp b/src/backends/cl/workloads/ClStridedSliceWorkload.cpp
index 51a77c5..b2e73cb 100644
--- a/src/backends/cl/workloads/ClStridedSliceWorkload.cpp
+++ b/src/backends/cl/workloads/ClStridedSliceWorkload.cpp
@@ -85,15 +85,18 @@
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
 
-    m_StridedSliceLayer.configure(clCompileContext,
-                                  &input,
-                                  &output,
-                                  starts,
-                                  ends,
-                                  strides,
-                                  begin_mask,
-                                  end_mask,
-                                  shrink_axis_mask);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClStridedSliceWorkload_configure");
+        m_StridedSliceLayer.configure(clCompileContext,
+                                      &input,
+                                      &output,
+                                      starts,
+                                      ends,
+                                      strides,
+                                      begin_mask,
+                                      end_mask,
+                                      shrink_axis_mask);
+    }
 }
 
 void ClStridedSliceWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClSubtractionWorkload.cpp b/src/backends/cl/workloads/ClSubtractionWorkload.cpp
index 6465e3e..797763d 100644
--- a/src/backends/cl/workloads/ClSubtractionWorkload.cpp
+++ b/src/backends/cl/workloads/ClSubtractionWorkload.cpp
@@ -31,7 +31,10 @@
 
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
 
-    m_Layer.configure(clCompileContext, &input0, &input1, &output, g_AclConvertPolicy, activationInfo);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClSubtractionWorkload_configure");
+        m_Layer.configure(clCompileContext, &input0, &input1, &output, g_AclConvertPolicy, activationInfo);
+    }
 }
 
 void ClSubtractionWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClTransposeConvolution2dWorkload.cpp b/src/backends/cl/workloads/ClTransposeConvolution2dWorkload.cpp
index c37907e..9277bb0 100644
--- a/src/backends/cl/workloads/ClTransposeConvolution2dWorkload.cpp
+++ b/src/backends/cl/workloads/ClTransposeConvolution2dWorkload.cpp
@@ -100,7 +100,11 @@
     output.info()->set_data_layout(aclDataLayout);
 
     arm_compute::PadStrideInfo padStrideInfo = BuildArmComputePadStrideInfo(m_Data.m_Parameters);
-    m_Layer.configure(clCompileContext, &input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, padStrideInfo);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClTransposeConvolution2dWorkload_configure");
+        m_Layer.configure(clCompileContext, &input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output,
+                          padStrideInfo);
+    }
 
     InitializeArmComputeClTensorData(*m_WeightsTensor, m_Data.m_Weight);
     if (m_BiasesTensor)
diff --git a/src/backends/cl/workloads/ClTransposeWorkload.cpp b/src/backends/cl/workloads/ClTransposeWorkload.cpp
index d80eae8..d52806b 100644
--- a/src/backends/cl/workloads/ClTransposeWorkload.cpp
+++ b/src/backends/cl/workloads/ClTransposeWorkload.cpp
@@ -42,11 +42,14 @@
     const arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
     const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
-    // Run the layer.
-    m_PermuteFunction.configure(clCompileContext,
-                                &input,
-                                &output,
-                                armcomputetensorutils::BuildArmComputeTransposeVector(mappings));
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClTransposeWorkload_configure");
+        // Configure the layer.
+        m_PermuteFunction.configure(clCompileContext,
+                                    &input,
+                                    &output,
+                                    armcomputetensorutils::BuildArmComputeTransposeVector(mappings));
+    }
 }
 
 void ClTransposeWorkload::Execute() const