IVGCVSW-6249 Add ProfilingDetails Macros to all workloads in Neon

Signed-off-by: Keith Davis <keith.davis@arm.com>
Change-Id: I7be77712a9f790928219ce91222d46cc766ab9dd
diff --git a/src/backends/neon/workloads/NeonAbsWorkload.cpp b/src/backends/neon/workloads/NeonAbsWorkload.cpp
index ea14ac3..bd476be 100644
--- a/src/backends/neon/workloads/NeonAbsWorkload.cpp
+++ b/src/backends/neon/workloads/NeonAbsWorkload.cpp
@@ -35,7 +35,7 @@
 
 void NeonAbsWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonAbsWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonAbsWorkload_Execute", this->GetGuid());
     m_AbsLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonActivationWorkload.cpp b/src/backends/neon/workloads/NeonActivationWorkload.cpp
index 4b2169a..dd4c97d 100644
--- a/src/backends/neon/workloads/NeonActivationWorkload.cpp
+++ b/src/backends/neon/workloads/NeonActivationWorkload.cpp
@@ -33,6 +33,12 @@
                                                const WorkloadInfo& info)
     : BaseWorkload<ActivationQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonActivationWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonActivationWorkload", 1, 1);
 
     const arm_compute::ActivationLayerInfo activationLayerInfo =
@@ -49,7 +55,7 @@
 
 void NeonActivationWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonActivationWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonActivationWorkload_Execute", this->GetGuid());
     m_ActivationLayer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonAdditionWorkload.cpp b/src/backends/neon/workloads/NeonAdditionWorkload.cpp
index 5891677..dfbb992 100644
--- a/src/backends/neon/workloads/NeonAdditionWorkload.cpp
+++ b/src/backends/neon/workloads/NeonAdditionWorkload.cpp
@@ -56,7 +56,7 @@
 
 void NeonAdditionWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonAdditionWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonAdditionWorkload_Execute", this->GetGuid());
     m_AddLayer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonArgMinMaxWorkload.cpp b/src/backends/neon/workloads/NeonArgMinMaxWorkload.cpp
index cc85791..7e9d2c7 100644
--- a/src/backends/neon/workloads/NeonArgMinMaxWorkload.cpp
+++ b/src/backends/neon/workloads/NeonArgMinMaxWorkload.cpp
@@ -56,6 +56,12 @@
                                              const WorkloadInfo& info)
         : BaseWorkload<ArgMinMaxQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonArgMinMaxWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     arm_compute::ITensor& input = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ITensor& output = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
@@ -79,7 +85,7 @@
 
 void NeonArgMinMaxWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonArgMinMaxWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonArgMinMaxWorkload_Execute", this->GetGuid());
     m_ArgMinMaxLayer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp b/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp
index 5da7cca..3d0a90b 100644
--- a/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp
+++ b/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp
@@ -60,6 +60,12 @@
     const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info)
     : BaseWorkload<BatchNormalizationQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonBatchNormalizationWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonBatchNormalizationWorkload", 1, 1);
 
     arm_compute::ITensor& input = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -107,7 +113,7 @@
 
 void NeonBatchNormalizationWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonBatchNormalizationWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonBatchNormalizationWorkload_Execute", this->GetGuid());
     m_Layer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonBatchToSpaceNdWorkload.cpp b/src/backends/neon/workloads/NeonBatchToSpaceNdWorkload.cpp
index 3d479ff..2a35475 100644
--- a/src/backends/neon/workloads/NeonBatchToSpaceNdWorkload.cpp
+++ b/src/backends/neon/workloads/NeonBatchToSpaceNdWorkload.cpp
@@ -19,14 +19,14 @@
 
 arm_compute::Status NeonBatchToSpaceNdWorkloadValidate(const TensorInfo& input,
                                                        const TensorInfo& output,
-                                                       const BatchToSpaceNdDescriptor& desc)
+                                                       const BatchToSpaceNdDescriptor& descriptor)
 {
-    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, desc.m_DataLayout);
-    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, desc.m_DataLayout);
+    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
 
     // ArmNN blockShape is [H, W] Cl asks for W, H
-    int32_t blockHeight = armnn::numeric_cast<int32_t>(desc.m_BlockShape[0]);
-    int32_t blockWidth = armnn::numeric_cast<int32_t>(desc.m_BlockShape[1]);
+    int32_t blockHeight = armnn::numeric_cast<int32_t>(descriptor.m_BlockShape[0]);
+    int32_t blockWidth = armnn::numeric_cast<int32_t>(descriptor.m_BlockShape[1]);
 
     const arm_compute::Status aclStatus = arm_compute::NEBatchToSpaceLayer::validate(&aclInputInfo,
                                                                                      blockWidth,
@@ -35,10 +35,16 @@
     return aclStatus;
 }
 
-NeonBatchToSpaceNdWorkload::NeonBatchToSpaceNdWorkload(const BatchToSpaceNdQueueDescriptor& desc,
+NeonBatchToSpaceNdWorkload::NeonBatchToSpaceNdWorkload(const BatchToSpaceNdQueueDescriptor& descriptor,
                                                        const WorkloadInfo& info)
-    : BaseWorkload<BatchToSpaceNdQueueDescriptor>(desc, info)
+    : BaseWorkload<BatchToSpaceNdQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonBatchToSpaceNdWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonBatchToSpaceNdWorkload", 1, 1);
 
     arm_compute::ITensor& input  =
@@ -51,8 +57,8 @@
     output.info()->set_data_layout(aclDataLayout);
 
     // ArmNN blockShape is [H, W] Cl asks for W, H
-    int32_t blockHeight = armnn::numeric_cast<int32_t>(desc.m_Parameters.m_BlockShape[0]);
-    int32_t blockWidth = armnn::numeric_cast<int32_t>(desc.m_Parameters.m_BlockShape[1]);
+    int32_t blockHeight = armnn::numeric_cast<int32_t>(descriptor.m_Parameters.m_BlockShape[0]);
+    int32_t blockWidth = armnn::numeric_cast<int32_t>(descriptor.m_Parameters.m_BlockShape[1]);
 
     m_Layer.reset(new arm_compute::NEBatchToSpaceLayer());
     m_Layer->configure(&input, blockWidth, blockHeight, &output);
@@ -63,7 +69,7 @@
 {
     if (m_Layer)
     {
-        ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSpaceToBatchNdWorkload_Execute");
+        ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonBatchToSpaceNdWorkload_Execute", this->GetGuid());
         m_Layer->run();
     }
 }
diff --git a/src/backends/neon/workloads/NeonCastWorkload.cpp b/src/backends/neon/workloads/NeonCastWorkload.cpp
index 4727fe1..50e212e 100644
--- a/src/backends/neon/workloads/NeonCastWorkload.cpp
+++ b/src/backends/neon/workloads/NeonCastWorkload.cpp
@@ -37,7 +37,7 @@
 
 void NeonCastWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonCastWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonCastWorkload_Execute", this->GetGuid());
     m_CastLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonComparisonWorkload.cpp b/src/backends/neon/workloads/NeonComparisonWorkload.cpp
index 01a6a0c..129921a 100644
--- a/src/backends/neon/workloads/NeonComparisonWorkload.cpp
+++ b/src/backends/neon/workloads/NeonComparisonWorkload.cpp
@@ -34,6 +34,12 @@
 NeonComparisonWorkload::NeonComparisonWorkload(const ComparisonQueueDescriptor& descriptor, const WorkloadInfo& info)
     : BaseWorkload<ComparisonQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonComparisonWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonComparisonWorkload", 2, 1);
 
     arm_compute::ITensor& input0 = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -47,7 +53,7 @@
 
 void NeonComparisonWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonComparisonWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonComparisonWorkload_Execute", this->GetGuid());
     m_ComparisonLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonConcatWorkload.cpp b/src/backends/neon/workloads/NeonConcatWorkload.cpp
index 5cd906d..657a940 100644
--- a/src/backends/neon/workloads/NeonConcatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonConcatWorkload.cpp
@@ -18,9 +18,9 @@
 
 namespace
 {
-size_t CalcAxis(const armnn::OriginsDescriptor& desc)
+size_t CalcAxis(const armnn::OriginsDescriptor& descriptor)
 {
-    return (desc.GetNumDimensions() - desc.GetConcatAxis()) - 1;
+    return (descriptor.GetNumDimensions() - descriptor.GetConcatAxis()) - 1;
 }
 } //namespace
 
@@ -50,6 +50,12 @@
 const ConcatQueueDescriptor& descriptor, const WorkloadInfo& info)
         : BaseWorkload<ConcatQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonConcatWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     bool allInputsAreSubtensors = true;
 
     // Check that all inputs are sub-tensors
@@ -93,7 +99,7 @@
 {
     if (m_Layer)
     {
-        ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConcatWorkload_Execute");
+        ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonConcatWorkload_Execute", this->GetGuid());
         m_Layer->run();
     }
 }
diff --git a/src/backends/neon/workloads/NeonConstantWorkload.cpp b/src/backends/neon/workloads/NeonConstantWorkload.cpp
index 77e4420..16bb211 100644
--- a/src/backends/neon/workloads/NeonConstantWorkload.cpp
+++ b/src/backends/neon/workloads/NeonConstantWorkload.cpp
@@ -53,7 +53,7 @@
 
 void NeonConstantWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConstantWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonConstantWorkload_Execute", this->GetGuid());
 
     using namespace armcomputetensorutils;
 
diff --git a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
index 79d1f22..e8cc125 100644
--- a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
@@ -24,7 +24,7 @@
 
 void NeonConvertBf16ToFp32Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertBf16ToFp32Workload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonConvertBf16ToFp32Workload_Execute", this->GetGuid());
 
     auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
         {
diff --git a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
index 01f09a5..0d6bb04 100644
--- a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
@@ -24,7 +24,7 @@
 
 void NeonConvertFp16ToFp32Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp16ToFp32Workload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonConvertFp16ToFp32Workload_Execute", this->GetGuid());
 
     auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
         {
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
index e1aceec..84d3c78 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
@@ -25,7 +25,7 @@
 
 void NeonConvertFp32ToBf16Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp32ToBf16Workload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonConvertFp32ToBf16Workload_Execute", this->GetGuid());
 
     auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
         {
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
index 62f39be..7f6d4d6 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
@@ -25,7 +25,7 @@
 
 void NeonConvertFp32ToFp16Workload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp32ToFp16Workload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonConvertFp32ToFp16Workload_Execute", this->GetGuid());
 
     auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
         {
diff --git a/src/backends/neon/workloads/NeonConvolution2dWorkload.cpp b/src/backends/neon/workloads/NeonConvolution2dWorkload.cpp
index a6ae99b..0b0a72c 100644
--- a/src/backends/neon/workloads/NeonConvolution2dWorkload.cpp
+++ b/src/backends/neon/workloads/NeonConvolution2dWorkload.cpp
@@ -131,7 +131,7 @@
     }
 
     // Report Profiling Details
-    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonConvolution2dWorkload_Execute",
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonConvolution2dWorkload_Construct",
                                          descriptor.m_Parameters,
                                          detailsInfo,
                                          this->GetGuid());
diff --git a/src/backends/neon/workloads/NeonDepthToSpaceWorkload.cpp b/src/backends/neon/workloads/NeonDepthToSpaceWorkload.cpp
index 2c4a651..76829f3 100644
--- a/src/backends/neon/workloads/NeonDepthToSpaceWorkload.cpp
+++ b/src/backends/neon/workloads/NeonDepthToSpaceWorkload.cpp
@@ -29,10 +29,16 @@
     return arm_compute::NEDepthToSpaceLayer::validate(&aclInput, &aclOutput, blockSize);
 }
 
-NeonDepthToSpaceWorkload::NeonDepthToSpaceWorkload(const DepthToSpaceQueueDescriptor& desc,
+NeonDepthToSpaceWorkload::NeonDepthToSpaceWorkload(const DepthToSpaceQueueDescriptor& descriptor,
                                                    const WorkloadInfo& info)
-    : BaseWorkload<DepthToSpaceQueueDescriptor>(desc, info)
+    : BaseWorkload<DepthToSpaceQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonDepthToSpaceWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonDepthToSpaceWorkload", 1, 1);
 
     arm_compute::DataLayout aclDataLayout = ConvertDataLayout(m_Data.m_Parameters.m_DataLayout);
@@ -41,7 +47,7 @@
             PolymorphicPointerDowncast<IAclTensorHandle>(m_Data.m_Inputs[0])->GetTensor();
     input.info()->set_data_layout(aclDataLayout);
 
-    int32_t blockSize = armnn::numeric_cast<int32_t>(desc.m_Parameters.m_BlockSize);
+    int32_t blockSize = armnn::numeric_cast<int32_t>(descriptor.m_Parameters.m_BlockSize);
 
     arm_compute::ITensor& output =
             PolymorphicPointerDowncast<IAclTensorHandle>(m_Data.m_Outputs[0])->GetTensor();
@@ -53,7 +59,7 @@
 
 void NeonDepthToSpaceWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDepthToSpaceWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonDepthToSpaceWorkload_Execute", this->GetGuid());
     m_Layer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
index 589a951..138c237 100644
--- a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
@@ -33,20 +33,20 @@
                                                              const Optional<TensorInfo>& biases,
                                                              const ActivationDescriptor* activationDescriptor)
 {
-    const arm_compute::TensorInfo aclInputInfo  = BuildArmComputeTensorInfo(input,  descriptor.m_DataLayout);
+    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
     const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
 
     // ArmNN's weight format is usually [ M, I, H, W ] but for depthwise its [ 1, H, W, I*M]
     // Permute to [ 1, I * M, H, W ] (if NCHW), as required by the compute library
     unsigned int aclDepthMultiplier;
     TensorInfo weightsPermuted;
-    std::tie(weightsPermuted, aclDepthMultiplier) = Convert1HWOTensorInfoToAcl(weights, input,descriptor.m_DataLayout);
+    std::tie(weightsPermuted, aclDepthMultiplier) = Convert1HWOTensorInfoToAcl(weights, input, descriptor.m_DataLayout);
 
     // Convert the weights into the compute library format
     const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weightsPermuted, descriptor.m_DataLayout);
 
     arm_compute::TensorInfo aclBiasesInfo;
-    arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr;
+    arm_compute::TensorInfo* optionalAclBiasesInfo = nullptr;
 
     if (descriptor.m_BiasEnabled)
     {
@@ -58,10 +58,10 @@
 
     arm_compute::PadStrideInfo aclPadStrideInfo = BuildArmComputePadStrideInfo(descriptor);
     const arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D(
-            descriptor.m_DilationX,descriptor.m_DilationY);
+        descriptor.m_DilationX, descriptor.m_DilationY);
 
     const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo(
-            activationDescriptor);
+        activationDescriptor);
 
     return arm_compute::NEDepthwiseConvolutionLayer::validate(&aclInputInfo,
                                                               &aclWeightsInfo,
@@ -85,9 +85,9 @@
     unsigned int depthMultiplier;
     std::unique_ptr<unsigned char[]> permuteBuffer(new unsigned char[weightInfo.GetNumBytes()]);
     std::tie(weightsPermuted, depthMultiplier) = Convert1HWOTensorToAcl(m_Data.m_Weight,
-                                                                              info.m_InputTensorInfos[0],
-                                                                              m_Data.m_Parameters.m_DataLayout,
-                                                                              permuteBuffer.get());
+                                                                        info.m_InputTensorInfos[0],
+                                                                        m_Data.m_Parameters.m_DataLayout,
+                                                                        permuteBuffer.get());
 
     // Convert the weights into the compute library format
     m_KernelTensor = std::make_unique<arm_compute::Tensor>();
@@ -100,14 +100,14 @@
     }
 
     const arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D(
-                m_Data.m_Parameters.m_DilationX, m_Data.m_Parameters.m_DilationY);
+        m_Data.m_Parameters.m_DilationX, m_Data.m_Parameters.m_DilationY);
 
     m_Data.ValidateInputsOutputs("NeonDepthwiseConvolutionWorkload", 1, 1);
 
-    IAclTensorHandle* inputTensorHandle  = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0]);
+    IAclTensorHandle* inputTensorHandle = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0]);
     IAclTensorHandle* outputTensorHandle = static_cast<IAclTensorHandle*>(m_Data.m_Outputs[0]);
 
-    arm_compute::ITensor& input  = inputTensorHandle->GetTensor();
+    arm_compute::ITensor& input = inputTensorHandle->GetTensor();
     arm_compute::ITensor& output = outputTensorHandle->GetTensor();
 
     arm_compute::DataLayout aclDataLayout = ConvertDataLayout(m_Data.m_Parameters.m_DataLayout);
@@ -129,6 +129,23 @@
                                                        activationInfo,
                                                        aclDilationInfo);
 
+    // Add details for profiling output
+    WorkloadInfo detailsInfo;
+
+    detailsInfo.m_InputTensorInfos = info.m_InputTensorInfos;
+    detailsInfo.m_OutputTensorInfos = info.m_OutputTensorInfos;
+    detailsInfo.m_WeightsTensorInfo = armnn::Optional<armnn::TensorInfo>(descriptor.m_Weight->GetTensorInfo());
+    if (descriptor.m_Parameters.m_BiasEnabled)
+    {
+        detailsInfo.m_BiasTensorInfo = armnn::Optional<armnn::TensorInfo>(descriptor.m_Bias->GetTensorInfo());
+    }
+
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonDepthwiseConvolutionWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         detailsInfo,
+                                         this->GetGuid());
+
     ARMNN_ASSERT(m_pDepthwiseConvolutionLayer);
 
     ScopedTensorHandle weightsPermutedHandle(weightsPermuted);
@@ -145,7 +162,7 @@
 
 void NeonDepthwiseConvolutionWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDepthwiseConvolutionWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonDepthwiseConvolutionWorkload_Execute", this->GetGuid());
     ARMNN_ASSERT(m_pDepthwiseConvolutionLayer);
 
     m_pDepthwiseConvolutionLayer->run();
diff --git a/src/backends/neon/workloads/NeonDequantizeWorkload.cpp b/src/backends/neon/workloads/NeonDequantizeWorkload.cpp
index 07323d1..32c1134 100644
--- a/src/backends/neon/workloads/NeonDequantizeWorkload.cpp
+++ b/src/backends/neon/workloads/NeonDequantizeWorkload.cpp
@@ -44,7 +44,7 @@
 
 void NeonDequantizeWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDequantizeWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonDequantizeWorkload_Execute", this->GetGuid());
     m_Layer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.cpp b/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.cpp
index 36f1cd9..a9cb5c4 100644
--- a/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.cpp
+++ b/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.cpp
@@ -14,19 +14,19 @@
 namespace armnn
 {
 
-arm_compute::DetectionPostProcessLayerInfo MakeInfo(const DetectionPostProcessDescriptor& desc)
+arm_compute::DetectionPostProcessLayerInfo MakeInfo(const DetectionPostProcessDescriptor& descriptor)
 {
-    return arm_compute::DetectionPostProcessLayerInfo(desc.m_MaxDetections,
-                                                      desc.m_MaxClassesPerDetection,
-                                                      desc.m_NmsScoreThreshold,
-                                                      desc.m_NmsIouThreshold,
-                                                      desc.m_NumClasses,
-                                                      { desc.m_ScaleX,
-                                                        desc.m_ScaleY,
-                                                        desc.m_ScaleW,
-                                                        desc.m_ScaleH },
-                                                      desc.m_UseRegularNms,
-                                                      desc.m_DetectionsPerClass);
+    return arm_compute::DetectionPostProcessLayerInfo(descriptor.m_MaxDetections,
+                                                      descriptor.m_MaxClassesPerDetection,
+                                                      descriptor.m_NmsScoreThreshold,
+                                                      descriptor.m_NmsIouThreshold,
+                                                      descriptor.m_NumClasses,
+                                                      { descriptor.m_ScaleX,
+                                                        descriptor.m_ScaleY,
+                                                        descriptor.m_ScaleW,
+                                                        descriptor.m_ScaleH },
+                                                      descriptor.m_UseRegularNms,
+                                                      descriptor.m_DetectionsPerClass);
 }
 
 arm_compute::Status NeonDetectionPostProcessValidate(const TensorInfo& boxEncodings,
@@ -36,9 +36,9 @@
                                                      const TensorInfo& detectionClasses,
                                                      const TensorInfo& detectionScores,
                                                      const TensorInfo& numDetections,
-                                                     const DetectionPostProcessDescriptor &desc)
+                                                     const DetectionPostProcessDescriptor &descriptor)
 {
-    arm_compute::DetectionPostProcessLayerInfo info = MakeInfo(desc);
+    arm_compute::DetectionPostProcessLayerInfo info = MakeInfo(descriptor);
 
     const arm_compute::TensorInfo aclBoxEncodings =
         armcomputetensorutils::BuildArmComputeTensorInfo(boxEncodings);
@@ -77,6 +77,12 @@
     const WorkloadInfo& info)
     : BaseWorkload<DetectionPostProcessQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonDetectionPostProcessWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Anchors = std::make_unique<arm_compute::Tensor>();
     BuildArmComputeTensor(*m_Anchors, descriptor.m_Anchors->GetTensorInfo());
 
@@ -104,7 +110,7 @@
 
 void NeonDetectionPostProcessWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDetectionPostProcessWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonDetectionPostProcessWorkload_Execute", this->GetGuid());
     m_Func.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.hpp b/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.hpp
index 29876ff..82ef1e2 100644
--- a/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.hpp
+++ b/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.hpp
@@ -20,7 +20,7 @@
                                                      const TensorInfo& detectionClasses,
                                                      const TensorInfo& detectionScores,
                                                      const TensorInfo& numDetections,
-                                                     const DetectionPostProcessDescriptor &desc);
+                                                     const DetectionPostProcessDescriptor &descriptor);
 
 class NeonDetectionPostProcessWorkload : public BaseWorkload<DetectionPostProcessQueueDescriptor>
 {
diff --git a/src/backends/neon/workloads/NeonDivisionWorkload.cpp b/src/backends/neon/workloads/NeonDivisionWorkload.cpp
index fa61a10..8c5d2b8 100644
--- a/src/backends/neon/workloads/NeonDivisionWorkload.cpp
+++ b/src/backends/neon/workloads/NeonDivisionWorkload.cpp
@@ -50,7 +50,7 @@
 
 void NeonDivisionWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDivisionWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonDivisionWorkload_Execute", this->GetGuid());
     m_DivLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonExpWorkload.cpp b/src/backends/neon/workloads/NeonExpWorkload.cpp
index 7baaa84..aff8e72 100644
--- a/src/backends/neon/workloads/NeonExpWorkload.cpp
+++ b/src/backends/neon/workloads/NeonExpWorkload.cpp
@@ -25,6 +25,12 @@
 NeonExpWorkload::NeonExpWorkload(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info)
     : BaseWorkload<ElementwiseUnaryQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonExpWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonExpWorkload", 1, 1);
 
     arm_compute::ITensor& input  = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -35,7 +41,7 @@
 
 void NeonExpWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonExpWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonExpWorkload_Execute", this->GetGuid());
     m_ExpLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonFillWorkload.cpp b/src/backends/neon/workloads/NeonFillWorkload.cpp
index 5965d20..0a3c7f0 100644
--- a/src/backends/neon/workloads/NeonFillWorkload.cpp
+++ b/src/backends/neon/workloads/NeonFillWorkload.cpp
@@ -19,6 +19,12 @@
 NeonFillWorkload::NeonFillWorkload(const FillQueueDescriptor& descriptor, const WorkloadInfo& info)
     : BaseWorkload<FillQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonFillWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonFillWorkload", 1, 1);
 
     arm_compute::ITensor& output = static_cast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
@@ -31,7 +37,7 @@
 
 void NeonFillWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFillWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonFillWorkload_Execute", this->GetGuid());
     m_Layer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp b/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
index c49df33..d728e00 100644
--- a/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
@@ -29,7 +29,7 @@
 
 void NeonFloorFloatWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFloorFloatWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonFloorFloatWorkload_Execute", this->GetGuid());
     m_Layer->run();
 }
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp b/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp
index 713771b..94dc077 100644
--- a/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp
+++ b/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp
@@ -19,6 +19,7 @@
 namespace armnn
 {
 using namespace armcomputetensorutils;
+using ACLMemManagerOnDemand = std::shared_ptr<arm_compute::MemoryManagerOnDemand>;
 
 arm_compute::Status NeonFullyConnectedWorkloadValidate(const TensorInfo& input,
                                                        const TensorInfo& output,
@@ -32,10 +33,10 @@
     const arm_compute::TensorInfo aclWeights = BuildArmComputeTensorInfo(weights);
 
     arm_compute::TensorInfo aclBiases;
-    arm_compute::TensorInfo *optionalAclBiases = nullptr;
+    arm_compute::TensorInfo* optionalAclBiases = nullptr;
     if (descriptor.m_BiasEnabled)
     {
-        aclBiases  = BuildArmComputeTensorInfo(biases);
+        aclBiases = BuildArmComputeTensorInfo(biases);
         optionalAclBiases = &aclBiases;
     }
 
@@ -50,7 +51,8 @@
 }
 
 NeonFullyConnectedWorkload::NeonFullyConnectedWorkload(const FullyConnectedQueueDescriptor& descriptor,
-    const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+                                                       const WorkloadInfo& info,
+                                                       ACLMemManagerOnDemand& memoryManager)
     : BaseWorkload<FullyConnectedQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("NeonFullyConnectedWorkload", 1, 1);
@@ -69,8 +71,8 @@
 
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
 
-    arm_compute::FullyConnectedLayerInfo fc_info  =
-            ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor.m_Parameters, activationInfo);
+    arm_compute::FullyConnectedLayerInfo fc_info =
+        ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor.m_Parameters, activationInfo);
 
     auto layer = std::make_unique<arm_compute::NEFullyConnectedLayer>(memoryManager);
     layer->configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info);
@@ -98,6 +100,23 @@
         }
     }
 
+    // Add details for profiling output
+    WorkloadInfo detailsInfo;
+
+    detailsInfo.m_InputTensorInfos = info.m_InputTensorInfos;
+    detailsInfo.m_OutputTensorInfos = info.m_OutputTensorInfos;
+    detailsInfo.m_WeightsTensorInfo = armnn::Optional<armnn::TensorInfo>(descriptor.m_Weight->GetTensorInfo());
+    if (descriptor.m_Parameters.m_BiasEnabled)
+    {
+        detailsInfo.m_BiasTensorInfo = armnn::Optional<armnn::TensorInfo>(descriptor.m_Bias->GetTensorInfo());
+    }
+
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonFullyConnectedWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         detailsInfo,
+                                         this->GetGuid());
+
     // Force Compute Library to perform the necessary copying and reshaping, after which
     // delete all the input tensors that will no longer be needed
     m_FullyConnectedLayer->prepare();
@@ -106,7 +125,7 @@
 
 void NeonFullyConnectedWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFullyConnectedWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonFullyConnectedWorkload_Execute", this->GetGuid());
     m_FullyConnectedLayer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonGatherWorkload.cpp b/src/backends/neon/workloads/NeonGatherWorkload.cpp
index 2c94cb5..f5c8d34 100644
--- a/src/backends/neon/workloads/NeonGatherWorkload.cpp
+++ b/src/backends/neon/workloads/NeonGatherWorkload.cpp
@@ -28,6 +28,12 @@
                                        const WorkloadInfo& info)
         : BaseWorkload<GatherQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonGatherWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonGatherWorkload", 1, 1);
 
     arm_compute::ITensor& input   = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -41,7 +47,7 @@
 
 void NeonGatherWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonGatherWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonGatherWorkload_Execute", this->GetGuid());
     m_Layer.run();
 }
 } //namespace armnn
\ No newline at end of file
diff --git a/src/backends/neon/workloads/NeonInstanceNormalizationWorkload.cpp b/src/backends/neon/workloads/NeonInstanceNormalizationWorkload.cpp
index 1bfd1e4..a68ea65 100644
--- a/src/backends/neon/workloads/NeonInstanceNormalizationWorkload.cpp
+++ b/src/backends/neon/workloads/NeonInstanceNormalizationWorkload.cpp
@@ -35,6 +35,12 @@
     const WorkloadInfo& info)
     : BaseWorkload<InstanceNormalizationQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonInstanceNormalizationWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonInstanceNormalizationWorkload", 1, 1);
 
     arm_compute::ITensor& input  = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -53,7 +59,7 @@
 
 void NeonInstanceNormalizationWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonInstanceNormalizationWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonInstanceNormalizationWorkload_Execute", this->GetGuid());
     m_Layer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
index d54607d..33b4609 100644
--- a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
@@ -32,6 +32,12 @@
     const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
     : FloatWorkload<L2NormalizationQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonL2NormalizationFloatWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonL2NormalizationFloatWorkload", 1, 1);
 
     arm_compute::ITensor& input = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -50,7 +56,7 @@
 
 void NeonL2NormalizationFloatWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonL2NormalizationFloatWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonL2NormalizationFloatWorkload_Execute", this->GetGuid());
     m_Layer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonLogSoftmaxWorkload.cpp b/src/backends/neon/workloads/NeonLogSoftmaxWorkload.cpp
index ba5c900..8a97432 100644
--- a/src/backends/neon/workloads/NeonLogSoftmaxWorkload.cpp
+++ b/src/backends/neon/workloads/NeonLogSoftmaxWorkload.cpp
@@ -35,6 +35,12 @@
                                                std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
     : BaseWorkload<LogSoftmaxQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonLogSoftmaxWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonLogSoftmaxWorkload", 1, 1);
 
     arm_compute::ITensor& input = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -48,7 +54,7 @@
 
 void NeonLogSoftmaxWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonLogSoftmaxWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonLogSoftmaxWorkload_Execute", this->GetGuid());
     m_LogSoftmaxLayer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonLogWorkload.cpp b/src/backends/neon/workloads/NeonLogWorkload.cpp
index 460f5b3..0fb8f8a 100644
--- a/src/backends/neon/workloads/NeonLogWorkload.cpp
+++ b/src/backends/neon/workloads/NeonLogWorkload.cpp
@@ -35,7 +35,7 @@
 
 void NeonLogWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonLogWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonLogWorkload_Execute", this->GetGuid());
     m_LogLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonLogicalAndWorkload.cpp b/src/backends/neon/workloads/NeonLogicalAndWorkload.cpp
index d85e05c..179e495 100644
--- a/src/backends/neon/workloads/NeonLogicalAndWorkload.cpp
+++ b/src/backends/neon/workloads/NeonLogicalAndWorkload.cpp
@@ -33,6 +33,12 @@
                                                const WorkloadInfo& info)
     : BaseWorkload<LogicalBinaryQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonLogicalAndWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonLogicalAndWorkload", 2, 1);
 
     arm_compute::ITensor& input0 = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -44,7 +50,7 @@
 
 void NeonLogicalAndWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonLogicalAndWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonLogicalAndWorkload_Execute", this->GetGuid());
     m_LogicalAndLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonLogicalNotWorkload.cpp b/src/backends/neon/workloads/NeonLogicalNotWorkload.cpp
index cff5eaf..16bf4e8 100644
--- a/src/backends/neon/workloads/NeonLogicalNotWorkload.cpp
+++ b/src/backends/neon/workloads/NeonLogicalNotWorkload.cpp
@@ -31,6 +31,12 @@
                                                const WorkloadInfo& info)
     : BaseWorkload<ElementwiseUnaryQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonLogicalNotWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonLogicalNotWorkload", 1, 1);
 
     arm_compute::ITensor& input  = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -41,7 +47,7 @@
 
 void NeonLogicalNotWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonLogicalNotWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonLogicalNotWorkload_Execute", this->GetGuid());
     m_LogicalNotLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonLogicalOrWorkload.cpp b/src/backends/neon/workloads/NeonLogicalOrWorkload.cpp
index c3f21e1..301f432 100644
--- a/src/backends/neon/workloads/NeonLogicalOrWorkload.cpp
+++ b/src/backends/neon/workloads/NeonLogicalOrWorkload.cpp
@@ -30,9 +30,15 @@
 }
 
 NeonLogicalOrWorkload::NeonLogicalOrWorkload(const LogicalBinaryQueueDescriptor& descriptor,
-                                               const WorkloadInfo& info)
+                                             const WorkloadInfo& info)
     : BaseWorkload<LogicalBinaryQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonLogicalOrWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonLogicalOrWorkload", 2, 1);
 
     arm_compute::ITensor& input0 = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -44,7 +50,7 @@
 
 void NeonLogicalOrWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonLogicalOrWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonLogicalOrWorkload_Execute", this->GetGuid());
     m_LogicalOrLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
index 175e908..f80da03 100644
--- a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
@@ -19,6 +19,12 @@
 NeonLstmFloatWorkload::NeonLstmFloatWorkload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info)
         : FloatWorkload<LstmQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonLstmFloatWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     arm_compute::LSTMParams<arm_compute::ITensor> lstm_param;
 
     // Basic parameters
@@ -267,6 +273,7 @@
 
 void NeonLstmFloatWorkload::Execute() const
 {
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonLstmFloatWorkload_Execute", this->GetGuid());
     m_LstmLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonMaximumWorkload.cpp b/src/backends/neon/workloads/NeonMaximumWorkload.cpp
index c4500d8..0f95af5 100644
--- a/src/backends/neon/workloads/NeonMaximumWorkload.cpp
+++ b/src/backends/neon/workloads/NeonMaximumWorkload.cpp
@@ -39,7 +39,7 @@
 
 void NeonMaximumWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMaximumWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonMaximumWorkload_Execute", this->GetGuid());
     m_MaxLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonMeanWorkload.cpp b/src/backends/neon/workloads/NeonMeanWorkload.cpp
index bb0870d..5d8d1c4 100644
--- a/src/backends/neon/workloads/NeonMeanWorkload.cpp
+++ b/src/backends/neon/workloads/NeonMeanWorkload.cpp
@@ -17,21 +17,27 @@
 
 arm_compute::Status NeonMeanWorkloadValidate(const TensorInfo& input,
                                              const TensorInfo& output,
-                                             const MeanDescriptor& desc)
+                                             const MeanDescriptor& descriptor)
 {
     const arm_compute::TensorInfo aclInputInfo  = armcomputetensorutils::BuildArmComputeTensorInfo(input);
     const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
 
     arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(),
                                                                           input.GetNumDimensions(),
-                                                                          desc.m_Axis);
+                                                                          descriptor.m_Axis);
 
-    return arm_compute::NEReduceMean::validate(&aclInputInfo, coords, desc.m_KeepDims, &aclOutputInfo);
+    return arm_compute::NEReduceMean::validate(&aclInputInfo, coords, descriptor.m_KeepDims, &aclOutputInfo);
 }
 
 NeonMeanWorkload::NeonMeanWorkload(const MeanQueueDescriptor& descriptor, const WorkloadInfo& info)
     : BaseWorkload<MeanQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonMeanWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonMeanWorkload", 1, 1);
 
     arm_compute::ITensor& input  = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -46,7 +52,7 @@
 
 void NeonMeanWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMeanWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonMeanWorkload_Execute", this->GetGuid());
     m_Layer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonMeanWorkload.hpp b/src/backends/neon/workloads/NeonMeanWorkload.hpp
index 055b52a..5d16588 100644
--- a/src/backends/neon/workloads/NeonMeanWorkload.hpp
+++ b/src/backends/neon/workloads/NeonMeanWorkload.hpp
@@ -14,7 +14,7 @@
 
 arm_compute::Status NeonMeanWorkloadValidate(const TensorInfo& input,
                                              const TensorInfo& output,
-                                             const MeanDescriptor& desc);
+                                             const MeanDescriptor& descriptor);
 
 class NeonMeanWorkload : public BaseWorkload<MeanQueueDescriptor>
 {
diff --git a/src/backends/neon/workloads/NeonMinimumWorkload.cpp b/src/backends/neon/workloads/NeonMinimumWorkload.cpp
index 519b3c4..5212947 100644
--- a/src/backends/neon/workloads/NeonMinimumWorkload.cpp
+++ b/src/backends/neon/workloads/NeonMinimumWorkload.cpp
@@ -40,7 +40,7 @@
 
 void NeonMinimumWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMinimumWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonMinimumWorkload_Execute", this->GetGuid());
     m_MinLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonMultiplicationWorkload.cpp b/src/backends/neon/workloads/NeonMultiplicationWorkload.cpp
index e4ed195..0ec5508 100644
--- a/src/backends/neon/workloads/NeonMultiplicationWorkload.cpp
+++ b/src/backends/neon/workloads/NeonMultiplicationWorkload.cpp
@@ -77,7 +77,7 @@
 
 void NeonMultiplicationWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMultiplicationWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonMultiplicationWorkload_Execute", this->GetGuid());
     m_PixelWiseMultiplication->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonNegWorkload.cpp b/src/backends/neon/workloads/NeonNegWorkload.cpp
index 06c1467..e7705e6 100644
--- a/src/backends/neon/workloads/NeonNegWorkload.cpp
+++ b/src/backends/neon/workloads/NeonNegWorkload.cpp
@@ -35,7 +35,7 @@
 
 void NeonNegWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonNegWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonNegWorkload_Execute", this->GetGuid());
     m_NegLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
index 77fc429..92d4997 100644
--- a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
@@ -19,6 +19,7 @@
 
 namespace
 {
+using ACLMemManagerOnDemand = std::shared_ptr<arm_compute::MemoryManagerOnDemand>;
 
 bool IsNeonNormalizationDescriptorSupported(const NormalizationDescriptor& parameters,
                                             Optional<std::string&> reasonIfUnsupported)
@@ -58,10 +59,16 @@
 }
 
 NeonNormalizationFloatWorkload::NeonNormalizationFloatWorkload(const NormalizationQueueDescriptor& descriptor,
-                                                   const WorkloadInfo& info,
-                                                   std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+                                                               const WorkloadInfo& info,
+                                                               ACLMemManagerOnDemand& memoryManager)
     : FloatWorkload<NormalizationQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonNormalizationFloatWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonNormalizationFloatWorkload", 1, 1);
     std::string reasonIfUnsupported;
     if (!IsNeonNormalizationDescriptorSupported(m_Data.m_Parameters, Optional<std::string&>(reasonIfUnsupported)))
@@ -99,7 +106,7 @@
 
 void NeonNormalizationFloatWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonNormalizationFloatWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonNormalizationFloatWorkload_Execute", this->GetGuid());
     m_NormalizationLayer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonPadWorkload.cpp b/src/backends/neon/workloads/NeonPadWorkload.cpp
index 19cdefc..b378d5f 100644
--- a/src/backends/neon/workloads/NeonPadWorkload.cpp
+++ b/src/backends/neon/workloads/NeonPadWorkload.cpp
@@ -19,6 +19,12 @@
 NeonPadWorkload::NeonPadWorkload(const PadQueueDescriptor& descriptor, const WorkloadInfo& info)
     : BaseWorkload<PadQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonPadWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonPadWorkload", 1, 1);
 
     arm_compute::ITensor& input = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -41,7 +47,7 @@
 
 void NeonPadWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonPadWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonPadWorkload_Execute", this->GetGuid());
     m_Layer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonPermuteWorkload.cpp b/src/backends/neon/workloads/NeonPermuteWorkload.cpp
index a5ecbcb..9e18f7f 100644
--- a/src/backends/neon/workloads/NeonPermuteWorkload.cpp
+++ b/src/backends/neon/workloads/NeonPermuteWorkload.cpp
@@ -28,6 +28,12 @@
                                          const WorkloadInfo& info)
         : BaseWorkload<PermuteQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonPermuteWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     using armcomputetensorutils::BuildArmComputePermutationVector;
 
     m_Data.ValidateInputsOutputs(GetName(), 1, 1);
@@ -42,7 +48,7 @@
 
 void NeonPermuteWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON(GetName() + "_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID(GetName() + "_Execute", this->GetGuid());
     m_PermuteFunction.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonPooling2dWorkload.cpp b/src/backends/neon/workloads/NeonPooling2dWorkload.cpp
index 968d5ce..2115e93 100644
--- a/src/backends/neon/workloads/NeonPooling2dWorkload.cpp
+++ b/src/backends/neon/workloads/NeonPooling2dWorkload.cpp
@@ -37,6 +37,12 @@
     const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info)
     : BaseWorkload<Pooling2dQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonPooling2dWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonPooling2dWorkload", 1, 1);
 
     arm_compute::ITensor& input = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -55,7 +61,7 @@
 
 void NeonPooling2dWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonPooling2dWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonPooling2dWorkload_Execute", this->GetGuid());
     m_PoolingLayer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonPreluWorkload.cpp b/src/backends/neon/workloads/NeonPreluWorkload.cpp
index 8e6ea30..af03e79 100644
--- a/src/backends/neon/workloads/NeonPreluWorkload.cpp
+++ b/src/backends/neon/workloads/NeonPreluWorkload.cpp
@@ -45,7 +45,7 @@
 
 void NeonPreluWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonPreluWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonPreluWorkload_Execute", this->GetGuid());
     m_PreluLayer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonQLstmWorkload.cpp b/src/backends/neon/workloads/NeonQLstmWorkload.cpp
index fd979d6..c25262a 100644
--- a/src/backends/neon/workloads/NeonQLstmWorkload.cpp
+++ b/src/backends/neon/workloads/NeonQLstmWorkload.cpp
@@ -17,6 +17,12 @@
 NeonQLstmWorkload::NeonQLstmWorkload(const QLstmQueueDescriptor& descriptor, const WorkloadInfo& info)
         : BaseWorkload<QLstmQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonQLstmWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     arm_compute::LSTMParams<arm_compute::ITensor> qLstmParams;
 
     // Mandatory params
@@ -230,6 +236,7 @@
 
 void NeonQLstmWorkload::Execute() const
 {
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonQLstmWorkload_Execute", this->GetGuid());
     m_QLstmLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonQuantizeWorkload.cpp b/src/backends/neon/workloads/NeonQuantizeWorkload.cpp
index 14fbdf3..f50ca81 100644
--- a/src/backends/neon/workloads/NeonQuantizeWorkload.cpp
+++ b/src/backends/neon/workloads/NeonQuantizeWorkload.cpp
@@ -43,7 +43,7 @@
 {
     if (m_Layer)
     {
-        ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonQuantizeWorkload_Execute");
+        ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonQuantizeWorkload_Execute", this->GetGuid());
         m_Layer->run();
     }
 }
diff --git a/src/backends/neon/workloads/NeonQuantizedLstmWorkload.cpp b/src/backends/neon/workloads/NeonQuantizedLstmWorkload.cpp
index d809017..e36fde4 100644
--- a/src/backends/neon/workloads/NeonQuantizedLstmWorkload.cpp
+++ b/src/backends/neon/workloads/NeonQuantizedLstmWorkload.cpp
@@ -124,6 +124,7 @@
 
 void NeonQuantizedLstmWorkload::Execute() const
 {
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonQuantizedLstmWorkload_Execute", this->GetGuid());
     m_QuantizedLstmLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonReduceWorkload.cpp b/src/backends/neon/workloads/NeonReduceWorkload.cpp
index 1436cd1..bf7ce98 100644
--- a/src/backends/neon/workloads/NeonReduceWorkload.cpp
+++ b/src/backends/neon/workloads/NeonReduceWorkload.cpp
@@ -18,28 +18,28 @@
 
 arm_compute::Status NeonReduceWorkloadValidate(const TensorInfo& input,
                                                const TensorInfo& output,
-                                               const ReduceDescriptor& desc)
+                                               const ReduceDescriptor& descriptor)
 {
-    if ( desc.m_vAxis.size()==1 || desc.m_vAxis.empty())
+    if (descriptor.m_vAxis.size() == 1 || descriptor.m_vAxis.empty())
     {
         const arm_compute::TensorInfo aclInputInfo  = armcomputetensorutils::BuildArmComputeTensorInfo(input);
         const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
 
         arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(),
                                                                               input.GetNumDimensions(),
-                                                                              desc.m_vAxis);
+                                                                              descriptor.m_vAxis);
 
         return arm_compute::NEReductionOperation::validate(&aclInputInfo,
                                                            &aclOutputInfo,
                                                            static_cast<unsigned int>(coords[0]),
-                                                           ConvertReductionOperationToAcl(desc),
-                                                           desc.m_KeepDims);
+                                                           ConvertReductionOperationToAcl(descriptor),
+                                                           descriptor.m_KeepDims);
     }
     else
     {
         // Validate layer if there are multiple axes.
         arm_compute::Status status;
-        IS_MULTI_AXES_REDUCE_SUPPORTED(NeonReduceWorkloadValidate, input, desc, status);
+        IS_MULTI_AXES_REDUCE_SUPPORTED(NeonReduceWorkloadValidate, input, descriptor, status);
         return status;
     }
 }
@@ -47,6 +47,12 @@
 NeonReduceWorkload::NeonReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info)
     : BaseWorkload<ReduceQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonReduceWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonReduceWorkload", 1, 1);
 
     arm_compute::ITensor& input  = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -65,7 +71,7 @@
 
 void NeonReduceWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonReduceWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonReduceWorkload_Execute", this->GetGuid());
     m_Layer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonReduceWorkload.hpp b/src/backends/neon/workloads/NeonReduceWorkload.hpp
index 0472091..ddeac12 100644
--- a/src/backends/neon/workloads/NeonReduceWorkload.hpp
+++ b/src/backends/neon/workloads/NeonReduceWorkload.hpp
@@ -14,7 +14,7 @@
 
 arm_compute::Status NeonReduceWorkloadValidate(const TensorInfo& input,
                                                const TensorInfo& output,
-                                               const ReduceDescriptor& desc);
+                                               const ReduceDescriptor& descriptor);
 
 class NeonReduceWorkload : public BaseWorkload<ReduceQueueDescriptor>
 {
diff --git a/src/backends/neon/workloads/NeonReshapeWorkload.cpp b/src/backends/neon/workloads/NeonReshapeWorkload.cpp
index 8b11da7..7f2f225 100644
--- a/src/backends/neon/workloads/NeonReshapeWorkload.cpp
+++ b/src/backends/neon/workloads/NeonReshapeWorkload.cpp
@@ -39,7 +39,7 @@
 
 void NeonReshapeWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonReshapeWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonReshapeWorkload_Execute", this->GetGuid());
     m_Layer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonResizeWorkload.cpp b/src/backends/neon/workloads/NeonResizeWorkload.cpp
index ab01e30..ecb43ae 100644
--- a/src/backends/neon/workloads/NeonResizeWorkload.cpp
+++ b/src/backends/neon/workloads/NeonResizeWorkload.cpp
@@ -53,6 +53,12 @@
                                                        const WorkloadInfo& info)
     : BaseWorkload<ResizeQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonResizeWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonResizeWorkload", 1, 1);
 
     arm_compute::ITensor& input = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -83,7 +89,7 @@
 
 void NeonResizeWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonResizeWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonResizeWorkload_Execute", this->GetGuid());
     m_ResizeLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonRsqrtWorkload.cpp b/src/backends/neon/workloads/NeonRsqrtWorkload.cpp
index 44980df..13615f9 100644
--- a/src/backends/neon/workloads/NeonRsqrtWorkload.cpp
+++ b/src/backends/neon/workloads/NeonRsqrtWorkload.cpp
@@ -36,7 +36,7 @@
 
 void NeonRsqrtWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonRsqrtWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonRsqrtWorkload_Execute", this->GetGuid());
     m_RsqrtLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonSinWorkload.cpp b/src/backends/neon/workloads/NeonSinWorkload.cpp
index ac2bd49..4602a9f 100644
--- a/src/backends/neon/workloads/NeonSinWorkload.cpp
+++ b/src/backends/neon/workloads/NeonSinWorkload.cpp
@@ -35,7 +35,7 @@
 
 void NeonSinWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSinWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonSinWorkload_Execute", this->GetGuid());
     m_SinLayer.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonSliceWorkload.cpp b/src/backends/neon/workloads/NeonSliceWorkload.cpp
index 32cc042..86ae303 100644
--- a/src/backends/neon/workloads/NeonSliceWorkload.cpp
+++ b/src/backends/neon/workloads/NeonSliceWorkload.cpp
@@ -37,6 +37,12 @@
                                      const WorkloadInfo& info)
         : BaseWorkload<SliceQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonSliceWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonSliceWorkload", 1, 1);
 
     arm_compute::ITensor& input = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -52,7 +58,7 @@
 
 void NeonSliceWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSliceWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonSliceWorkload_Execute", this->GetGuid());
     m_SliceFunction.run();
 }
 
diff --git a/src/backends/neon/workloads/NeonSoftmaxWorkload.cpp b/src/backends/neon/workloads/NeonSoftmaxWorkload.cpp
index 505844e..da20479 100644
--- a/src/backends/neon/workloads/NeonSoftmaxWorkload.cpp
+++ b/src/backends/neon/workloads/NeonSoftmaxWorkload.cpp
@@ -34,6 +34,12 @@
     const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
     : BaseWorkload<SoftmaxQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonSoftmaxWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonSoftmaxWorkload", 1, 1);
 
     // The ArmCompute softmax layer uses 2D input/output tensors, so flatten the first three dimensions.
@@ -48,7 +54,7 @@
 
 void NeonSoftmaxWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSoftmaxWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonSoftmaxWorkload_Execute", this->GetGuid());
     m_SoftmaxLayer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonSpaceToBatchNdWorkload.cpp b/src/backends/neon/workloads/NeonSpaceToBatchNdWorkload.cpp
index 42dd49c..d7880e0 100644
--- a/src/backends/neon/workloads/NeonSpaceToBatchNdWorkload.cpp
+++ b/src/backends/neon/workloads/NeonSpaceToBatchNdWorkload.cpp
@@ -41,10 +41,16 @@
                                                       &aclOutputInfo);
 }
 
-NeonSpaceToBatchNdWorkload::NeonSpaceToBatchNdWorkload(const SpaceToBatchNdQueueDescriptor& desc,
+NeonSpaceToBatchNdWorkload::NeonSpaceToBatchNdWorkload(const SpaceToBatchNdQueueDescriptor& descriptor,
                                                        const WorkloadInfo& info)
-        : BaseWorkload<SpaceToBatchNdQueueDescriptor>(desc, info)
+        : BaseWorkload<SpaceToBatchNdQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonSpaceToBatchNdWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NESpaceToBatchNdWorkload", 1, 1);
 
     arm_compute::ITensor& input  =
@@ -79,7 +85,7 @@
 {
     if (m_Layer)
     {
-        ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSpaceToBatchNdWorkload_Execute");
+        ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonSpaceToBatchNdWorkload_Execute", this->GetGuid());
         m_Layer->run();
     }
 }
diff --git a/src/backends/neon/workloads/NeonSpaceToDepthWorkload.cpp b/src/backends/neon/workloads/NeonSpaceToDepthWorkload.cpp
index 43c991c..b96b7d0 100644
--- a/src/backends/neon/workloads/NeonSpaceToDepthWorkload.cpp
+++ b/src/backends/neon/workloads/NeonSpaceToDepthWorkload.cpp
@@ -29,10 +29,16 @@
     return arm_compute::NESpaceToDepthLayer::validate(&aclInput, &aclOutput, blockSize);
 }
 
-NeonSpaceToDepthWorkload::NeonSpaceToDepthWorkload(const SpaceToDepthQueueDescriptor& desc,
+NeonSpaceToDepthWorkload::NeonSpaceToDepthWorkload(const SpaceToDepthQueueDescriptor& descriptor,
                                                    const WorkloadInfo& info)
-    : BaseWorkload<SpaceToDepthQueueDescriptor>(desc, info)
+    : BaseWorkload<SpaceToDepthQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonSpaceToDepthWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonSpaceToDepthWorkload", 1, 1);
 
     arm_compute::DataLayout aclDataLayout = ConvertDataLayout(m_Data.m_Parameters.m_DataLayout);
@@ -40,7 +46,7 @@
     arm_compute::ITensor& input = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     input.info()->set_data_layout(aclDataLayout);
 
-    int32_t blockSize = armnn::numeric_cast<int32_t>(desc.m_Parameters.m_BlockSize);
+    int32_t blockSize = armnn::numeric_cast<int32_t>(descriptor.m_Parameters.m_BlockSize);
 
     arm_compute::ITensor& output = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
     output.info()->set_data_layout(aclDataLayout);
@@ -54,7 +60,7 @@
 {
     if (m_Layer)
     {
-        ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSpaceToDepthWorkload_Execute");
+        ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonSpaceToDepthWorkload_Execute", this->GetGuid());
         m_Layer->run();
     }
 }
diff --git a/src/backends/neon/workloads/NeonSplitterWorkload.cpp b/src/backends/neon/workloads/NeonSplitterWorkload.cpp
index 4e428a2..ea1def6 100644
--- a/src/backends/neon/workloads/NeonSplitterWorkload.cpp
+++ b/src/backends/neon/workloads/NeonSplitterWorkload.cpp
@@ -56,6 +56,12 @@
 NeonSplitterWorkload::NeonSplitterWorkload(const SplitterQueueDescriptor& descriptor, const WorkloadInfo& info)
     : BaseWorkload<SplitterQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonSplitterWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     bool allOutputsAreSubtensors = true;
 
     // Check that all outputs are sub-tensors
@@ -106,7 +112,7 @@
 {
     if (m_Layer)
     {
-        ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSplitterWorkload_Execute");
+        ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonSplitterWorkload_Execute", this->GetGuid());
         m_Layer->run();
     }
 }
diff --git a/src/backends/neon/workloads/NeonStackWorkload.cpp b/src/backends/neon/workloads/NeonStackWorkload.cpp
index 0b327b8..ad9bea1 100644
--- a/src/backends/neon/workloads/NeonStackWorkload.cpp
+++ b/src/backends/neon/workloads/NeonStackWorkload.cpp
@@ -49,6 +49,12 @@
 NeonStackWorkload::NeonStackWorkload(const StackQueueDescriptor& descriptor, const WorkloadInfo& info)
 : BaseWorkload<StackQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonStackWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     std::vector<arm_compute::ITensor*> aclInputs;
     for (auto input : m_Data.m_Inputs)
     {
@@ -67,7 +73,7 @@
 {
     if (m_Layer)
     {
-        ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonStackWorkload_Execute");
+        ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonStackWorkload_Execute", this->GetGuid());
         m_Layer->run();
     }
 }
diff --git a/src/backends/neon/workloads/NeonStridedSliceWorkload.cpp b/src/backends/neon/workloads/NeonStridedSliceWorkload.cpp
index d0aee07..d9ec727 100644
--- a/src/backends/neon/workloads/NeonStridedSliceWorkload.cpp
+++ b/src/backends/neon/workloads/NeonStridedSliceWorkload.cpp
@@ -50,6 +50,12 @@
                                                    const WorkloadInfo& info)
         : BaseWorkload<StridedSliceQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonStridedSliceWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs("NeonStridedSliceWorkload", 1, 1);
 
     arm_compute::ITensor& input = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -87,7 +93,7 @@
 
 void NeonStridedSliceWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonStridedSliceWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonStridedSliceWorkload_Execute", this->GetGuid());
     m_Layer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonSubtractionWorkload.cpp b/src/backends/neon/workloads/NeonSubtractionWorkload.cpp
index 64f68aa..68bf154 100644
--- a/src/backends/neon/workloads/NeonSubtractionWorkload.cpp
+++ b/src/backends/neon/workloads/NeonSubtractionWorkload.cpp
@@ -57,7 +57,7 @@
 
 void NeonSubtractionWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSubtractionWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonSubtractionWorkload_Execute", this->GetGuid());
     m_SubLayer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonTransposeConvolution2dWorkload.cpp b/src/backends/neon/workloads/NeonTransposeConvolution2dWorkload.cpp
index a1e545c..f9e1b36 100644
--- a/src/backends/neon/workloads/NeonTransposeConvolution2dWorkload.cpp
+++ b/src/backends/neon/workloads/NeonTransposeConvolution2dWorkload.cpp
@@ -77,6 +77,23 @@
 
     arm_compute::PadStrideInfo padStrideInfo = BuildArmComputePadStrideInfo(m_Data.m_Parameters);
 
+    // Add details for profiling output
+    WorkloadInfo detailsInfo;
+
+    detailsInfo.m_InputTensorInfos = info.m_InputTensorInfos;
+    detailsInfo.m_OutputTensorInfos = info.m_OutputTensorInfos;
+    detailsInfo.m_WeightsTensorInfo = armnn::Optional<armnn::TensorInfo>(descriptor.m_Weight->GetTensorInfo());
+    if (descriptor.m_Parameters.m_BiasEnabled)
+    {
+        detailsInfo.m_BiasTensorInfo = armnn::Optional<armnn::TensorInfo>(descriptor.m_Bias->GetTensorInfo());
+    }
+
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonTransposeConvolution2dWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         detailsInfo,
+                                         this->GetGuid());
+
     m_Layer = std::make_unique<arm_compute::NEDeconvolutionLayer>(memoryManager);
     m_Layer->configure(&input, m_KernelTensor.get(), m_BiasTensor.get(), &output, padStrideInfo);
 
@@ -95,7 +112,7 @@
 
 void NeonTransposeConvolution2dWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonTransposeConvolution2dWorkload_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonTransposeConvolution2dWorkload_Execute", this->GetGuid());
     m_Layer->run();
 }
 
diff --git a/src/backends/neon/workloads/NeonTransposeWorkload.cpp b/src/backends/neon/workloads/NeonTransposeWorkload.cpp
index c11f2df..2e4f358 100644
--- a/src/backends/neon/workloads/NeonTransposeWorkload.cpp
+++ b/src/backends/neon/workloads/NeonTransposeWorkload.cpp
@@ -28,6 +28,12 @@
                                              const WorkloadInfo& info)
         : BaseWorkload<TransposeQueueDescriptor>(descriptor, info)
 {
+    // Report Profiling Details
+    ARMNN_REPORT_PROFILING_WORKLOAD_DESC("NeonTransposeWorkload_Construct",
+                                         descriptor.m_Parameters,
+                                         info,
+                                         this->GetGuid());
+
     m_Data.ValidateInputsOutputs(GetName(), 1, 1);
 
     const arm_compute::ITensor& input = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
@@ -41,7 +47,7 @@
 
 void NeonTransposeWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT_NEON(GetName() + "_Execute");
+    ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID(GetName() + "_Execute", this->GetGuid());
     m_PermuteFunction.run();
 }