MLECO-1870: Cherry pick profiling changes from dev to open source repo
* Documentation update

Change-Id: If85e7ebc44498840b291c408f14e66a5a5faa424
Signed-off-by: Isabella Gottardi <isabella.gottardi@arm.com>
diff --git a/source/application/hal/include/timer.h b/source/application/hal/include/timer.h
index 2955b7f..4a09fd4 100644
--- a/source/application/hal/include/timer.h
+++ b/source/application/hal/include/timer.h
@@ -56,11 +56,9 @@
     /* Gets difference in CPU cycle counts. */
     uint32_t (* get_cpu_cycle_diff)(time_counter *start, time_counter *end);
 
-    /* Gets the difference in terms of total NPU cycle counts. */
-    uint64_t (* get_npu_total_cycle_diff)(time_counter *start, time_counter *end);
-
-    /* Gets the difference in terms of active NPU cycle counts. */
-    uint64_t (* get_npu_active_cycle_diff)(time_counter *start, time_counter *end);
+    /* Gets the difference in terms of cycle counts for collected pmu counters. */
+    int (* get_npu_cycles_diff)(time_counter *start, time_counter *end,
+                                uint64_t* pmu_counters_values, size_t size);
 
     /* Wraps get_time_counter function with additional profiling
      * initialisation, if required. */
diff --git a/source/application/hal/platforms/bare-metal/bsp/bsp-core/include/bsp_core_log.h b/source/application/hal/platforms/bare-metal/bsp/bsp-core/include/bsp_core_log.h
index f049209..ffb55e7 100644
--- a/source/application/hal/platforms/bare-metal/bsp/bsp-core/include/bsp_core_log.h
+++ b/source/application/hal/platforms/bare-metal/bsp/bsp-core/include/bsp_core_log.h
@@ -32,31 +32,31 @@
 #endif /*LOG_LEVEL*/
 
 #if (LOG_LEVEL == LOG_LEVEL_TRACE)
-    #define trace(...)        printf("[TRACE] "); printf(__VA_ARGS__)
+    #define trace(...)        printf("TRACE - "); printf(__VA_ARGS__)
 #else
     #define trace(...)
 #endif  /* LOG_LEVEL == LOG_LEVEL_TRACE */
 
 #if (LOG_LEVEL <= LOG_LEVEL_DEBUG)
-    #define debug(...)        printf("[DEBUG] "); printf(__VA_ARGS__)
+    #define debug(...)        printf("DEBUG - "); printf(__VA_ARGS__)
 #else
     #define debug(...)
 #endif  /* LOG_LEVEL > LOG_LEVEL_TRACE */
 
 #if (LOG_LEVEL <= LOG_LEVEL_INFO)
-    #define info(...)         printf("[INFO] "); printf(__VA_ARGS__)
+    #define info(...)         printf("INFO - "); printf(__VA_ARGS__)
 #else
     #define info(...)
 #endif  /* LOG_LEVEL > LOG_LEVEL_DEBUG */
 
 #if (LOG_LEVEL <= LOG_LEVEL_WARN)
-    #define warn(...)         printf("[WARN] "); printf(__VA_ARGS__)
+    #define warn(...)         printf("WARN - "); printf(__VA_ARGS__)
 #else
     #define warn(...)
 #endif  /* LOG_LEVEL > LOG_LEVEL_INFO */
 
 #if (LOG_LEVEL <= LOG_LEVEL_ERROR)
-    #define printf_err(...)   printf("[ERROR] "); printf(__VA_ARGS__)
+    #define printf_err(...)   printf("ERROR - "); printf(__VA_ARGS__)
 #else
     #define printf_err(...)
 #endif  /* LOG_LEVEL > LOG_LEVEL_INFO */
diff --git a/source/application/hal/platforms/bare-metal/timer/baremetal_timer.c b/source/application/hal/platforms/bare-metal/timer/baremetal_timer.c
index 7257c1d..ef31a71 100644
--- a/source/application/hal/platforms/bare-metal/timer/baremetal_timer.c
+++ b/source/application/hal/platforms/bare-metal/timer/baremetal_timer.c
@@ -50,6 +50,53 @@
 static uint64_t bm_get_npu_active_cycle_diff(time_counter *st,
                                              time_counter *end);
 
+/** @brief  Gets the difference in idle NPU cycle counts
+ * @param[in]   st      Pointer to time_counter value at start time.
+ * @param[in]   end     Pointer to time_counter value at end.
+ * @return      Idle NPU cycle counts difference between the arguments expressed
+ *              as unsigned 64 bit integer.
+ **/
+static uint64_t bm_get_npu_idle_cycle_diff(time_counter *st,
+                                           time_counter *end);
+
+/** @brief  Gets the difference in axi0 bus reads cycle counts
+ * @param[in]   st      Pointer to time_counter value at start time.
+ * @param[in]   end     Pointer to time_counter value at end.
+ * @return      NPU AXI0 read cycle counts  difference between the arguments expressed
+ *              as unsigned 64 bit integer.
+ **/
+static uint64_t bm_get_npu_axi0_read_cycle_diff(time_counter *st,
+                                                time_counter *end);
+
+/** @brief  Gets the difference in axi0 bus writes cycle counts
+ * @param[in]   st      Pointer to time_counter value at start time.
+ * @param[in]   end     Pointer to time_counter value at end.
+ * @return      NPU AXI0 write cycle counts difference between the arguments expressed
+ *              as unsigned 64 bit integer.
+ **/
+static uint64_t bm_get_npu_axi0_write_cycle_diff(time_counter *st,
+                                                 time_counter *end);
+
+/** @brief  Gets the difference in axi1 bus reads cycle counts
+ * @param[in]   st      Pointer to time_counter value at start time.
+ * @param[in]   end     Pointer to time_counter value at end.
+ * @return      NPU AXI1 read cycle counts difference between the arguments expressed
+ *              as unsigned 64 bit integer.
+ **/
+static uint64_t bm_get_npu_axi1_read_cycle_diff(time_counter *st,
+                                                time_counter *end);
+
+/** @brief  Gets the difference for 6 collected cycle counts:
+ * 1) total NPU
+ * 2) active NPU
+ * 3) idle NPU
+ * 4) axi0 read
+ * 5) axi0 write
+ * 6) axi1 read
+ * */
+static int bm_get_npu_cycle_diff(time_counter *st, time_counter *end,
+                                  uint64_t* pmu_counters_values, const size_t size);
+
 #endif /* defined (ARM_NPU) */
 
 #if defined(MPS3_PLATFORM)
@@ -126,8 +173,7 @@
 #if defined (ARM_NPU)
     /* We are capable of reporting npu cycle counts. */
     timer->cap.npu_cycles   = 1;
-    timer->get_npu_total_cycle_diff = bm_get_npu_total_cycle_diff;
-    timer->get_npu_active_cycle_diff = bm_get_npu_active_cycle_diff;
+    timer->get_npu_cycles_diff = bm_get_npu_cycle_diff;
     _init_ethosu_cyclecounter();
 #endif /* defined (ARM_NPU) */
 
@@ -149,37 +195,89 @@
     /* Reset overflow status. */
     ETHOSU_PMU_Set_CNTR_OVS(ETHOSU_PMU_CNT1_Msk | ETHOSU_PMU_CCNT_Msk);
 
-    /* Set the counter #0 to count idle cycles. */
+    /* We can retrieve only 4 PMU counters: */
     ETHOSU_PMU_Set_EVTYPER(0, ETHOSU_PMU_NPU_IDLE);
+    ETHOSU_PMU_Set_EVTYPER(1, ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED);
+    ETHOSU_PMU_Set_EVTYPER(2, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN);
+    ETHOSU_PMU_Set_EVTYPER(3, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED);
 
     /* Enable PMU. */
     ETHOSU_PMU_Enable();
 
     /* Enable counters for cycle and counter# 0. */
-    ETHOSU_PMU_CNTR_Enable(ETHOSU_PMU_CNT1_Msk | ETHOSU_PMU_CCNT_Msk);
-
+    ETHOSU_PMU_CNTR_Enable(ETHOSU_PMU_CNT1_Msk | ETHOSU_PMU_CNT2_Msk | ETHOSU_PMU_CNT3_Msk | ETHOSU_PMU_CNT4_Msk| ETHOSU_PMU_CCNT_Msk);
     _reset_ethosu_counters();
 }
 
+static int bm_get_npu_cycle_diff(time_counter *st, time_counter *end,
+                                  uint64_t* pmu_counters_values, const size_t size)
+{
+    if (size == 6) {
+        pmu_counters_values[0] = bm_get_npu_total_cycle_diff(st, end);
+        pmu_counters_values[1] = bm_get_npu_active_cycle_diff(st, end);
+        pmu_counters_values[2] = bm_get_npu_idle_cycle_diff(st, end);
+        pmu_counters_values[3] = bm_get_npu_axi0_read_cycle_diff(st, end);
+        pmu_counters_values[4] = bm_get_npu_axi0_write_cycle_diff(st, end);
+        pmu_counters_values[5] = bm_get_npu_axi1_read_cycle_diff(st, end);
+        return 0;
+    } else {
+        return 1;
+    }
+}
+
 static uint64_t bm_get_npu_total_cycle_diff(time_counter *st, time_counter *end)
 {
     return end->npu_total_ccnt - st->npu_total_ccnt;
 }
 
-static uint64_t bm_get_npu_active_cycle_diff(time_counter *st, time_counter *end)
+static uint32_t counter_overflow(uint32_t pmu_counter_mask)
 {
     /* Check for overflow: The idle counter is 32 bit while the
        total cycle count is 64 bit. */
     const uint32_t overflow_status = ETHOSU_PMU_Get_CNTR_OVS();
+    return pmu_counter_mask & overflow_status;
+}
 
-    if (ETHOSU_PMU_CNT1_Msk & overflow_status) {
+static uint64_t bm_get_npu_idle_cycle_diff(time_counter *st, time_counter *end)
+{
+    if (counter_overflow(ETHOSU_PMU_CNT1_Msk)) {
         printf_err("EthosU PMU idle counter overflow.\n");
         return 0;
     }
+    return (uint64_t)(end->npu_idle_ccnt - st->npu_idle_ccnt);
+}
 
+static uint64_t bm_get_npu_active_cycle_diff(time_counter *st, time_counter *end)
+{
     /* Active NPU time = total time - idle time */
-    return (bm_get_npu_total_cycle_diff(st, end) +
-           (uint64_t)(st->npu_idle_ccnt)) - (uint64_t)(end->npu_idle_ccnt);
+    return bm_get_npu_total_cycle_diff(st, end) - bm_get_npu_idle_cycle_diff(st, end);
+}
+
+static uint64_t bm_get_npu_axi0_read_cycle_diff(time_counter *st, time_counter *end)
+{
+    if (counter_overflow(ETHOSU_PMU_CNT2_Msk)) {
+        printf_err("EthosU PMU axi0 read counter overflow.\n");
+        return 0;
+    }
+    return (uint64_t)(end->npu_axi0_read_ccnt - st->npu_axi0_read_ccnt);
+}
+
+static uint64_t bm_get_npu_axi0_write_cycle_diff(time_counter *st, time_counter *end)
+{
+    if (counter_overflow(ETHOSU_PMU_CNT3_Msk)) {
+        printf_err("EthosU PMU axi0 write counter overflow.\n");
+        return 0;
+    }
+    return (uint64_t)(end->npu_axi0_write_ccnt - st->npu_axi0_write_ccnt);
+}
+
+static uint64_t bm_get_npu_axi1_read_cycle_diff(time_counter *st, time_counter *end)
+{
+    if (counter_overflow(ETHOSU_PMU_CNT4_Msk)) {
+        printf_err("EthosU PMU axi1 read counter overflow.\n");
+        return 0;
+    }
+    return (uint64_t)(end->npu_axi1_read_ccnt - st->npu_axi1_read_ccnt);
 }
 
 #endif /* defined (ARM_NPU) */
@@ -199,15 +297,22 @@
         .counter = get_time_counter(),
 
 #if defined (ARM_NPU)
-        .npu_idle_ccnt = ETHOSU_PMU_Get_EVCNTR(0),
-        .npu_total_ccnt = ETHOSU_PMU_Get_CCNTR()
+            .npu_total_ccnt = ETHOSU_PMU_Get_CCNTR(),
+            .npu_idle_ccnt = ETHOSU_PMU_Get_EVCNTR(0),
+            .npu_axi0_read_ccnt = ETHOSU_PMU_Get_EVCNTR(1),
+            .npu_axi0_write_ccnt = ETHOSU_PMU_Get_EVCNTR(2),
+            .npu_axi1_read_ccnt = ETHOSU_PMU_Get_EVCNTR(3)
 #endif /* defined (ARM_NPU) */
 
     };
 
 #if defined (ARM_NPU)
-    debug("NPU total cc: %llu; NPU idle cc: %u\n",
-        t.npu_total_ccnt, t.npu_idle_ccnt);
+    debug("NPU total cc: %llu; NPU idle cc: %u; NPU axi0 read cc: %u;  NPU axi0 write cc: %u; NPU axi1 read cc: %u\n",
+        t.npu_total_ccnt,
+        t.npu_idle_ccnt,
+        t.npu_axi0_read_ccnt,
+        t.npu_axi0_write_ccnt,
+        t.npu_axi1_read_ccnt);
 #endif /* defined (ARM_NPU) */
 
     return t;
diff --git a/source/application/hal/platforms/bare-metal/timer/include/baremetal_timer.h b/source/application/hal/platforms/bare-metal/timer/include/baremetal_timer.h
index c8fc32c..3020dac 100644
--- a/source/application/hal/platforms/bare-metal/timer/include/baremetal_timer.h
+++ b/source/application/hal/platforms/bare-metal/timer/include/baremetal_timer.h
@@ -34,6 +34,9 @@
 #if defined (ARM_NPU)
     uint64_t                npu_total_ccnt;
     uint32_t                npu_idle_ccnt;
+    uint32_t                npu_axi0_read_ccnt;
+    uint32_t                npu_axi0_write_ccnt;
+    uint32_t                npu_axi1_read_ccnt;
 #endif /* ARM_NPU */
 
 } time_counter;
diff --git a/source/application/hal/platforms/native/data_presentation/log/log.c b/source/application/hal/platforms/native/data_presentation/log/log.c
index 48e8b95..6ce6684 100644
--- a/source/application/hal/platforms/native/data_presentation/log/log.c
+++ b/source/application/hal/platforms/native/data_presentation/log/log.c
@@ -30,11 +30,11 @@
                       const uint32_t pos_x, const uint32_t pos_y,
                       const uint32_t downsample_factor)
 {
-    info("Image details\n");
-    info("Data:                 %p\n", data);
-    info("WxHxC:                %dx%dx%d\n", width, height, channels);
-    info("Pos (x,y):            (%d,%d)\n", pos_x, pos_y);
-    info("Downsampling factor:  %u\n", downsample_factor);
+    debug("Image details\n");
+    debug("Data:                 %p\n", data);
+    debug("WxHxC:                %dx%dx%d\n", width, height, channels);
+    debug("Pos (x,y):            (%d,%d)\n", pos_x, pos_y);
+    debug("Downsampling factor:  %u\n", downsample_factor);
     return 0;
 }
 
@@ -43,8 +43,8 @@
                      const bool allow_multiple_lines)
 {
     UNUSED(allow_multiple_lines);
-    info("%s\n", str);
-    info("Text size: %lu, x: %d, y: %d\n", str_sz, pos_x, pos_y);
+    debug("%s\n", str);
+    debug("Text size: %lu, x: %d, y: %d\n", str_sz, pos_x, pos_y);
     return 0;
 }
 
@@ -53,19 +53,19 @@
                          const uint32_t width, const uint32_t height, 
                          const uint16_t color)
 {
-    info("Showing rectangular, width: %d, height: %d, color: %d, x: %d, y: %d\n", 
+    debug("Showing rectangular, width: %d, height: %d, color: %d, x: %d, y: %d\n", 
             width, height, color, pos_x, pos_y);
     return 0;
 }
 
 int log_clear(const uint16_t color)
 {
-    info("Clearing with color: %d\n", color);
+    debug("Clearing with color: %d\n", color);
     return 0;
 }
 
 int log_set_text_color (const uint16_t color)
 {
-    info("Setting text color: %d\n", color);
+    debug("Setting text color: %d\n", color);
     return 0;
 }
diff --git a/source/application/hal/platforms/native/utils/include/dummy_log.h b/source/application/hal/platforms/native/utils/include/dummy_log.h
index 626436a..3df5c5c 100644
--- a/source/application/hal/platforms/native/utils/include/dummy_log.h
+++ b/source/application/hal/platforms/native/utils/include/dummy_log.h
@@ -32,31 +32,31 @@
 #define UNUSED(x)       ((void)(x))
 
 #if (LOG_LEVEL == LOG_LEVEL_TRACE)
-    #define trace(...)        printf("[TRACE] "); printf(__VA_ARGS__)
+    #define trace(...)        printf("TRACE - "); printf(__VA_ARGS__)
 #else
     #define trace(...)
 #endif  /* LOG_LEVEL == LOG_LEVEL_TRACE */
 
 #if (LOG_LEVEL <= LOG_LEVEL_DEBUG)
-    #define debug(...)        printf("[DEBUG] "); printf(__VA_ARGS__)
+    #define debug(...)        printf("DEBUG - "); printf(__VA_ARGS__)
 #else
     #define debug(...)
 #endif  /* LOG_LEVEL > LOG_LEVEL_TRACE */
 
 #if (LOG_LEVEL <= LOG_LEVEL_INFO)
-    #define info(...)         printf("[INFO] "); printf(__VA_ARGS__)
+    #define info(...)         printf("INFO - "); printf(__VA_ARGS__)
 #else
     #define info(...)
 #endif  /* LOG_LEVEL > LOG_LEVEL_DEBUG */
 
 #if (LOG_LEVEL <= LOG_LEVEL_WARN)
-    #define warn(...)         printf("[WARN] "); printf(__VA_ARGS__)
+    #define warn(...)         printf("WARN - "); printf(__VA_ARGS__)
 #else
     #define warn(...)
 #endif  /* LOG_LEVEL > LOG_LEVEL_INFO */
 
 #if (LOG_LEVEL <= LOG_LEVEL_ERROR)
-    #define printf_err(...)   printf("[ERROR] "); printf(__VA_ARGS__)
+    #define printf_err(...)   printf("ERROR - "); printf(__VA_ARGS__)
 #else
     #define printf_err(...)
 #endif  /* LOG_LEVEL > LOG_LEVEL_INFO */
diff --git a/source/application/main/Profiler.cc b/source/application/main/Profiler.cc
index f364759..ce59d9c 100644
--- a/source/application/main/Profiler.cc
+++ b/source/application/main/Profiler.cc
@@ -17,25 +17,10 @@
 #include "Profiler.hpp"
 
 #include <cstring>
-#include <string>
-#include <sstream>
+#include <iomanip>
 
 namespace arm {
 namespace app {
-
-    template<class T>
-    static void writeStatLine(std::ostringstream& s,
-                              const char* desc,
-                              T total,
-                              uint32_t samples,
-                              T min,
-                              T max)
-    {
-        s << "\t" << desc << total << " / "
-          << ((double)total / samples) << " / "
-          << min << " / " << max << std::endl;
-    }
-
     Profiler::Profiler(hal_platform* platform, const char* name = "Unknown")
     : _m_name(name)
     {
@@ -90,99 +75,162 @@
     void Profiler::Reset()
     {
         this->_m_started = false;
+        this->_m_series.clear();
         memset(&this->_m_tstampSt, 0, sizeof(this->_m_tstampSt));
         memset(&this->_m_tstampEnd, 0, sizeof(this->_m_tstampEnd));
     }
 
-    std::string Profiler::GetResultsAndReset()
+    void calcProfilingStat(uint64_t currentValue,
+                           Statistics& data,
+                           uint32_t samples)
     {
-        std::ostringstream strResults;
+        data.total += currentValue;
+        data.min = std::min(data.min, currentValue);
+        data.max = std::max(data.max, currentValue);
+        data.avrg = ((double)data.total / samples);
+    }
 
+    void Profiler::GetAllResultsAndReset(std::vector<ProfileResult>& results)
+    {
         for (const auto& item: this->_m_series) {
             auto name = item.first;
             ProfilingSeries series = item.second;
+            ProfileResult result{};
+            result.name = item.first;
+            result.samplesNum = series.size();
 
-            uint32_t samplesNum = series.size();
-
-            uint64_t totalNpuCycles = 0;        /* Total NPU cycles (idle + active). */
-            uint64_t totalActiveNpuCycles = 0;  /* Active NPU cycles. */
-            uint64_t totalCpuCycles = 0;        /* Total CPU cycles. */
-            time_t totalTimeMs = 0;
-
-            uint64_t minActiveNpuCycles = series[0].activeNpuCycles;
-            uint64_t minIdleNpuCycles = series[0].npuCycles - minActiveNpuCycles;
-            uint64_t minActiveCpuCycles = series[0].cpuCycles - minActiveNpuCycles;
-            time_t minTimeMs = series[0].time;
-
-            uint64_t maxIdleNpuCycles = 0;
-            uint64_t maxActiveNpuCycles = 0;
-            uint64_t maxActiveCpuCycles = 0;
-            time_t maxTimeMs = 0;
-
+            Statistics AXI0_RD {
+                .name = "NPU AXI0_RD_DATA_BEAT_RECEIVED",
+                .unit = "cycles",
+                .total = 0,
+                .avrg = 0.0,
+                .min = series[0].axi0writes,
+                .max = 0
+            };
+            Statistics AXI0_WR {
+                    .name = "NPU AXI0_WR_DATA_BEAT_WRITTEN",
+                    .unit = "cycles",
+                    .total = 0,
+                    .avrg = 0.0,
+                    .min = series[0].axi0reads,
+                    .max = 0
+            };
+            Statistics AXI1_RD {
+                    .name = "NPU AXI1_RD_DATA_BEAT_RECEIVED",
+                    .unit = "cycles",
+                    .total = 0,
+                    .avrg = 0.0,
+                    .min = series[0].axi1reads,
+                    .max = 0
+            };
+            Statistics NPU_ACTIVE {
+                    .name = "NPU ACTIVE",
+                    .unit = "cycles",
+                    .total = 0,
+                    .avrg = 0.0,
+                    .min = series[0].activeNpuCycles,
+                    .max = 0
+            };
+            Statistics NPU_IDLE {
+                    .name = "NPU IDLE",
+                    .unit = "cycles",
+                    .total = 0,
+                    .avrg = 0.0,
+                    .min = series[0].idleNpuCycles,
+                    .max = 0
+            };
+            Statistics NPU_Total {
+                    .name = "NPU total",
+                    .unit = "cycles",
+                    .total = 0,
+                    .avrg = 0.0,
+                    .min = series[0].npuCycles,
+                    .max = 0,
+            };
+#if defined(CPU_PROFILE_ENABLED)
+            Statistics CPU_ACTIVE {
+                    .name = "CPU ACTIVE",
+                    .unit = "cycles (approx)",
+                    .total = 0,
+                    .avrg = 0.0,
+                    .min = series[0].cpuCycles - NPU_ACTIVE.min,
+                    .max = 0
+            };
+            Statistics TIME {
+                    .name = "Time",
+                    .unit = "ms",
+                    .total = 0,
+                    .avrg = 0.0,
+                    .min = static_cast<uint64_t>(series[0].time),
+                    .max = 0
+            };
+#endif
             for(ProfilingUnit& unit: series){
-                totalNpuCycles += unit.npuCycles;
-                totalActiveNpuCycles += unit.activeNpuCycles;
-                totalCpuCycles += unit.cpuCycles;
-                totalTimeMs += unit.time;
 
-                maxActiveNpuCycles = std::max(maxActiveNpuCycles,
-                                              unit.activeNpuCycles);
-                maxIdleNpuCycles = std::max(maxIdleNpuCycles,
-                                            unit.npuCycles - maxActiveNpuCycles);
-                maxActiveCpuCycles = std::max(maxActiveCpuCycles,
-                                              unit.cpuCycles - maxActiveNpuCycles);
-                maxTimeMs = std::max(maxTimeMs, unit.time);
+                calcProfilingStat(unit.npuCycles,
+                                  NPU_Total, result.samplesNum);
 
-                minActiveNpuCycles = std::min(minActiveNpuCycles,
-                                              unit.activeNpuCycles);
-                minIdleNpuCycles = std::min(minIdleNpuCycles,
-                                            unit.npuCycles - minActiveNpuCycles);
-                minActiveCpuCycles = std::min(minActiveCpuCycles,
-                                              unit.cpuCycles - minActiveNpuCycles);
-                minTimeMs = std::min(minTimeMs, unit.time);
-            }
+                calcProfilingStat(unit.activeNpuCycles,
+                                  NPU_ACTIVE, result.samplesNum);
 
-            strResults << "Profile for " << name << ": " << std::endl;
+                calcProfilingStat(unit.idleNpuCycles,
+                                  NPU_IDLE, result.samplesNum);
 
-            if (samplesNum > 1) {
-                strResults << "\tSamples: " << samplesNum << std::endl;
-                strResults << "\t                            Total / Avg./ Min / Max"
-                           << std::endl;
+                calcProfilingStat(unit.axi0writes,
+                                  AXI0_WR, result.samplesNum);
 
-                writeStatLine<uint64_t>(strResults, "Active NPU cycles:          ",
-                                        totalActiveNpuCycles, samplesNum,
-                                        minActiveNpuCycles, maxActiveNpuCycles);
+                calcProfilingStat(unit.axi0reads,
+                                  AXI0_RD, result.samplesNum);
 
-                writeStatLine<uint64_t>(strResults, "Idle NPU cycles:            ",
-                                        (totalNpuCycles - totalActiveNpuCycles),
-                                        samplesNum, minIdleNpuCycles, maxIdleNpuCycles);
-
+                calcProfilingStat(unit.axi1reads,
+                                  AXI1_RD, result.samplesNum);
 #if defined(CPU_PROFILE_ENABLED)
-                writeStatLine<uint64_t>(strResults, "Active CPU cycles (approx): ",
-                                        (totalCpuCycles - totalActiveNpuCycles),
-                                        samplesNum, minActiveCpuCycles,
-                                        maxActiveCpuCycles);
+                calcProfilingStat(static_cast<uint64_t>(unit.time),
+                                  TIME, result.samplesNum);
 
-                writeStatLine<time_t>(strResults, "Time in ms:                 ",
-                                        totalTimeMs, samplesNum, minTimeMs, maxTimeMs);
-#endif
-            } else {
-                strResults << "\tActive NPU cycles: " << totalActiveNpuCycles
-                           << std::endl;
-                strResults << "\tIdle NPU cycles:   "
-                           << (totalNpuCycles - totalActiveNpuCycles)
-                           << std::endl;
-#if defined(CPU_PROFILE_ENABLED)
-                strResults << "\tActive CPU cycles: "
-                           << (totalCpuCycles - totalActiveNpuCycles)
-                           << " (approx)" << std::endl;
-
-                strResults << "\tTime in ms:        " << totalTimeMs << std::endl;
+                calcProfilingStat(unit.cpuCycles - unit.activeNpuCycles,
+                                  CPU_ACTIVE, result.samplesNum);
 #endif
             }
+            result.data.emplace_back(AXI0_RD);
+            result.data.emplace_back(AXI0_WR);
+            result.data.emplace_back(AXI1_RD);
+            result.data.emplace_back(NPU_ACTIVE);
+            result.data.emplace_back(NPU_IDLE);
+            result.data.emplace_back(NPU_Total);
+#if defined(CPU_PROFILE_ENABLED)
+            result.data.emplace_back(CPU_ACTIVE);
+            result.data.emplace_back(TIME);
+#endif
+        results.emplace_back(result);
         }
         this->Reset();
-        return strResults.str();
+    }
+
+    void printStatisticsHeader(uint32_t samplesNum) {
+        info("Number of samples: %i\n", samplesNum);
+        info("%s\n", "Total / Avg./ Min / Max");
+    }
+
+    void Profiler::PrintProfilingResult(bool printFullStat) {
+        std::vector<ProfileResult> results{};
+        GetAllResultsAndReset(results);
+        for(ProfileResult& result: results) {
+            info("Profile for %s:\n", result.name.c_str());
+
+            if (printFullStat) {
+                printStatisticsHeader(result.samplesNum);
+            }
+
+            for (Statistics &stat: result.data) {
+                if (printFullStat) {
+                    info("%s %s: %llu / %.0f / %llu / %llu \n", stat.name.c_str(), stat.unit.c_str(),
+                         stat.total, stat.avrg, stat.min, stat.max);
+                } else {
+                    info("%s %s: %.0f\n", stat.name.c_str(), stat.unit.c_str(), stat.avrg);
+                }
+            }
+        }
     }
 
     void Profiler::SetName(const char* str)
@@ -197,11 +245,19 @@
 
         struct ProfilingUnit unit;
 
-        if (timer->cap.npu_cycles && timer->get_npu_total_cycle_diff &&
-            timer->get_npu_active_cycle_diff)
+        if (timer->cap.npu_cycles && timer->get_npu_cycles_diff)
         {
-            unit.npuCycles = timer->get_npu_total_cycle_diff(&start, &end);
-            unit.activeNpuCycles = timer->get_npu_active_cycle_diff(&start, &end);
+            const size_t size = 6;
+            uint64_t pmuCounters[size] = {0};
+            /* 6 values: total cc, active cc, idle cc, axi0 read, axi0 write, axi1 read*/
+            if (0 == timer->get_npu_cycles_diff(&start, &end, pmuCounters, size)) {
+                unit.npuCycles = pmuCounters[0];
+                unit.activeNpuCycles = pmuCounters[1];
+                unit.idleNpuCycles = pmuCounters[2];
+                unit.axi0reads = pmuCounters[3];
+                unit.axi0writes = pmuCounters[4];
+                unit.axi1reads = pmuCounters[5];
+            }
         }
 
         if (timer->cap.cpu_cycles && timer->get_cpu_cycle_diff) {
diff --git a/source/application/main/UseCaseCommonUtils.cc b/source/application/main/UseCaseCommonUtils.cc
index 4ea5e4d..3acf53f 100644
--- a/source/application/main/UseCaseCommonUtils.cc
+++ b/source/application/main/UseCaseCommonUtils.cc
@@ -21,16 +21,11 @@
 namespace arm {
 namespace app {
 
-    bool RunInference(hal_platform& platform, arm::app::Model& model)
+    bool RunInference(arm::app::Model& model, Profiler& profiler)
     {
-        Profiler profiler{&platform, "Inference"};
-        profiler.StartProfiling();
-
+        profiler.StartProfiling("Inference");
         bool runInf = model.RunInference();
-
         profiler.StopProfiling();
-        std::string profileResults = profiler.GetResultsAndReset();
-        info("%s\n", profileResults.c_str());
 
         return runInf;
     }
diff --git a/source/application/main/include/Profiler.hpp b/source/application/main/include/Profiler.hpp
index b16a63b..d93b257 100644
--- a/source/application/main/include/Profiler.hpp
+++ b/source/application/main/include/Profiler.hpp
@@ -26,10 +26,31 @@
 namespace arm {
 namespace app {
 
+    /** Statistics for a profiling metric. */
+    struct Statistics {
+        std::string name;
+        std::string unit;
+        std::uint64_t total;
+        double avrg;
+        std::uint64_t min;
+        std::uint64_t max;
+    };
+
+    /** Profiling results with calculated statistics. */
+    struct ProfileResult {
+        std::string name;
+        std::uint32_t samplesNum;
+        std::vector<Statistics> data;
+    };
+
     /** A single profiling unit definition. */
     struct ProfilingUnit {
         uint64_t npuCycles = 0;
         uint64_t activeNpuCycles = 0;
+        uint64_t idleNpuCycles = 0;
+        uint64_t axi0writes = 0;
+        uint64_t axi0reads = 0;
+        uint64_t axi1reads = 0;
         uint64_t cpuCycles = 0;
         time_t time = 0;
     };
@@ -73,18 +94,22 @@
         void Reset();
 
         /**
-         * @brief   Gets the results as string and resets the profiler.
-         * @returns Result string.
+         * @brief   Collects profiling results statistics and resets the profiler.
          **/
-        std::string GetResultsAndReset();
+        void GetAllResultsAndReset(std::vector<ProfileResult>& results);
+
+        /**
+         * @brief   Prints collected profiling results and resets the profiler.
+         **/
+        void PrintProfilingResult(bool printFullStat = false);
 
         /** @brief Set the profiler name. */
         void SetName(const char* str);
 
     private:
         ProfilingMap    _m_series;                /* Profiling series map. */
-        time_counter    _m_tstampSt;              /* Container for a current starting timestamp. */
-        time_counter    _m_tstampEnd;             /* Container for a current ending timestamp. */
+        time_counter    _m_tstampSt{};            /* Container for a current starting timestamp. */
+        time_counter    _m_tstampEnd{};           /* Container for a current ending timestamp. */
         hal_platform *  _m_pPlatform = nullptr;   /* Platform pointer - to get the timer. */
 
         bool            _m_started = false;       /* Indicates profiler has been started. */
diff --git a/source/application/main/include/UseCaseCommonUtils.hpp b/source/application/main/include/UseCaseCommonUtils.hpp
index 02200e8..7887aea 100644
--- a/source/application/main/include/UseCaseCommonUtils.hpp
+++ b/source/application/main/include/UseCaseCommonUtils.hpp
@@ -38,11 +38,11 @@
      * @brief           Run inference using given model
      *                  object. If profiling is enabled, it will log the
      *                  statistics too.
-     * @param[in]       platform   Reference to the hal platform object.
      * @param[in]       model      Reference to the initialised model.
+     * @param[in]       profiler   Reference to the initialised profiler.
      * @return          true if inference succeeds, false otherwise.
      **/
-    bool RunInference(hal_platform& platform, arm::app::Model& model);
+    bool RunInference(arm::app::Model& mode, Profiler& profiler);
 
     /**
      * @brief           Read input and return as an integer.
diff --git a/source/use_case/ad/src/MainLoop.cc b/source/use_case/ad/src/MainLoop.cc
index 5455b43..6a7cbe0 100644
--- a/source/use_case/ad/src/MainLoop.cc
+++ b/source/use_case/ad/src/MainLoop.cc
@@ -56,6 +56,8 @@
     /* Instantiate application context. */
     arm::app::ApplicationContext caseContext;
 
+    arm::app::Profiler profiler{&platform, "ad"};
+    caseContext.Set<arm::app::Profiler&>("profiler", profiler);
     caseContext.Set<hal_platform&>("platform", platform);
     caseContext.Set<arm::app::Model&>("model", model);
     caseContext.Set<uint32_t>("clipIndex", 0);
diff --git a/source/use_case/ad/src/UseCaseHandler.cc b/source/use_case/ad/src/UseCaseHandler.cc
index c18a0a4..1c15595 100644
--- a/source/use_case/ad/src/UseCaseHandler.cc
+++ b/source/use_case/ad/src/UseCaseHandler.cc
@@ -76,6 +76,7 @@
     bool ClassifyVibrationHandler(ApplicationContext& ctx, uint32_t clipIndex, bool runAll)
     {
         auto& platform = ctx.Get<hal_platform&>("platform");
+        auto& profiler = ctx.Get<Profiler&>("profiler");
 
         constexpr uint32_t dataPsnTxtInfStartX = 20;
         constexpr uint32_t dataPsnTxtInfStartY = 40;
@@ -98,7 +99,7 @@
         const auto frameLength = ctx.Get<int>("frameLength");
         const auto frameStride = ctx.Get<int>("frameStride");
         const auto scoreThreshold = ctx.Get<float>("scoreThreshold");
-        const float trainingMean = ctx.Get<float>("trainingMean");
+        const auto trainingMean = ctx.Get<float>("trainingMean");
         auto startClipIdx = ctx.Get<uint32_t>("clipIndex");
 
         TfLiteTensor* outputTensor = model.GetOutputTensor(0);
@@ -193,7 +194,7 @@
                      audioDataSlider.TotalStrides() + 1);
 
                 /* Run inference over this audio clip sliding window */
-                arm::app::RunInference(platform, model);
+                arm::app::RunInference(model, profiler);
 
                 /* Use the negative softmax score of the corresponding index as the outlier score */
                 std::vector<float> dequantOutput = Dequantize<int8_t>(outputTensor);
@@ -219,6 +220,8 @@
                 return false;
             }
 
+            profiler.PrintProfilingResult();
+
             _IncrementAppCtxClipIdx(ctx);
 
         } while (runAll && ctx.Get<uint32_t>("clipIndex") != startClipIdx);
diff --git a/source/use_case/asr/src/MainLoop.cc b/source/use_case/asr/src/MainLoop.cc
index ca777be..c5a26a4 100644
--- a/source/use_case/asr/src/MainLoop.cc
+++ b/source/use_case/asr/src/MainLoop.cc
@@ -96,6 +96,8 @@
     GetLabelsVector(labels);
     arm::app::AsrClassifier classifier;  /* Classifier wrapper object. */
 
+    arm::app::Profiler profiler{&platform, "asr"};
+    caseContext.Set<arm::app::Profiler&>("profiler", profiler);
     caseContext.Set<hal_platform&>("platform", platform);
     caseContext.Set<arm::app::Model&>("model", model);
     caseContext.Set<uint32_t>("clipIndex", 0);
diff --git a/source/use_case/asr/src/UseCaseHandler.cc b/source/use_case/asr/src/UseCaseHandler.cc
index e706eb8..5d3157a 100644
--- a/source/use_case/asr/src/UseCaseHandler.cc
+++ b/source/use_case/asr/src/UseCaseHandler.cc
@@ -67,6 +67,8 @@
         auto& platform = ctx.Get<hal_platform&>("platform");
         platform.data_psn->clear(COLOR_BLACK);
 
+        auto& profiler = ctx.Get<Profiler&>("profiler");
+
         /* If the request has a valid size, set the audio index. */
         if (clipIndex < NUMBER_OF_FILES) {
             if (!_SetAppCtxClipIdx(ctx, clipIndex)) {
@@ -168,18 +170,11 @@
                 info("Inference %zu/%zu\n", audioDataSlider.Index() + 1,
                      static_cast<size_t>(ceilf(audioDataSlider.FractionalTotalStrides() + 1)));
 
-                Profiler prepProfiler{&platform, "pre-processing"};
-                prepProfiler.StartProfiling();
-
                 /* Calculate MFCCs, deltas and populate the input tensor. */
                 prep.Invoke(inferenceWindow, inferenceWindowLen, inputTensor);
 
-                prepProfiler.StopProfiling();
-                std::string prepProfileResults = prepProfiler.GetResultsAndReset();
-                info("%s\n", prepProfileResults.c_str());
-
                 /* Run inference over this audio clip sliding window. */
-                arm::app::RunInference(platform, model);
+                arm::app::RunInference(model, profiler);
 
                 /* Post-process. */
                 postp.Invoke(outputTensor, reductionAxis, !audioDataSlider.HasNext());
@@ -216,6 +211,8 @@
                 return false;
             }
 
+            profiler.PrintProfilingResult();
+
             _IncrementAppCtxClipIdx(ctx);
 
         } while (runAll && ctx.Get<uint32_t>("clipIndex") != startClipIdx);
@@ -256,6 +253,8 @@
 
         platform.data_psn->set_text_color(COLOR_GREEN);
 
+        info("Final results:\n");
+        info("Total number of inferences: %zu\n", results.size());
         /* Results from multiple inferences should be combined before processing. */
         std::vector<arm::app::ClassificationResult> combinedResults;
         for (auto& result : results) {
@@ -268,8 +267,9 @@
         for (const auto & result : results) {
             std::string infResultStr = audio::asr::DecodeOutput(result.m_resultVec);
 
-            info("Result for inf %u: %s\n", result.m_inferenceNumber,
-                                            infResultStr.c_str());
+            info("For timestamp: %f (inference #: %u); label: %s\n",
+                 result.m_timeStamp, result.m_inferenceNumber,
+                 infResultStr.c_str());
         }
 
         /* Get the decoded result for the combined result. */
@@ -280,7 +280,7 @@
                             dataPsnTxtStartX1, dataPsnTxtStartY1,
                             allow_multiple_lines);
 
-        info("Final result: %s\n", finalResultStr.c_str());
+        info("Complete recognition: %s\n", finalResultStr.c_str());
         return true;
     }
 
diff --git a/source/use_case/img_class/src/MainLoop.cc b/source/use_case/img_class/src/MainLoop.cc
index 469907c..66d7064 100644
--- a/source/use_case/img_class/src/MainLoop.cc
+++ b/source/use_case/img_class/src/MainLoop.cc
@@ -58,6 +58,8 @@
     /* Instantiate application context. */
     arm::app::ApplicationContext caseContext;
 
+    arm::app::Profiler profiler{&platform, "img_class"};
+    caseContext.Set<arm::app::Profiler&>("profiler", profiler);
     caseContext.Set<hal_platform&>("platform", platform);
     caseContext.Set<arm::app::Model&>("model", model);
     caseContext.Set<uint32_t>("imgIndex", 0);
diff --git a/source/use_case/img_class/src/UseCaseHandler.cc b/source/use_case/img_class/src/UseCaseHandler.cc
index a412fec..f7e83f5 100644
--- a/source/use_case/img_class/src/UseCaseHandler.cc
+++ b/source/use_case/img_class/src/UseCaseHandler.cc
@@ -74,6 +74,7 @@
     bool ClassifyImageHandler(ApplicationContext& ctx, uint32_t imgIndex, bool runAll)
     {
         auto& platform = ctx.Get<hal_platform&>("platform");
+        auto& profiler = ctx.Get<Profiler&>("profiler");
 
         constexpr uint32_t dataPsnImgDownscaleFactor = 2;
         constexpr uint32_t dataPsnImgStartX = 10;
@@ -144,7 +145,7 @@
             info("Running inference on image %u => %s\n", ctx.Get<uint32_t>("imgIndex"),
                 get_filename(ctx.Get<uint32_t>("imgIndex")));
 
-            RunInference(platform, model);
+            RunInference(model, profiler);
 
             /* Erase. */
             str_inf = std::string(str_inf.size(), ' ');
@@ -167,6 +168,8 @@
                 return false;
             }
 
+            profiler.PrintProfilingResult();
+
             _IncrementAppCtxImageIdx(ctx);
 
         } while (runAll && ctx.Get<uint32_t>("imgIndex") != curImIdx);
@@ -230,6 +233,8 @@
         uint32_t rowIdx1 = dataPsnTxtStartY1 + 2 * dataPsnTxtYIncr;
         uint32_t rowIdx2 = dataPsnTxtStartY2;
 
+        info("Final results:\n");
+        info("Total number of inferences: 1\n");
         for (uint32_t i = 0; i < results.size(); ++i) {
             std::string resultStr =
                 std::to_string(i + 1) + ") " +
diff --git a/source/use_case/inference_runner/src/MainLoop.cc b/source/use_case/inference_runner/src/MainLoop.cc
index b110a24..26a20de 100644
--- a/source/use_case/inference_runner/src/MainLoop.cc
+++ b/source/use_case/inference_runner/src/MainLoop.cc
@@ -38,6 +38,9 @@
     /* Instantiate application context. */
     arm::app::ApplicationContext caseContext;
 
+    arm::app::Profiler profiler{&platform, "inference_runner"};
+    caseContext.Set<arm::app::Profiler&>("profiler", profiler);
+
     caseContext.Set<hal_platform&>("platform", platform);
     caseContext.Set<arm::app::Model&>("model", model);
     caseContext.Set<uint32_t>("imgIndex", 0);
diff --git a/source/use_case/inference_runner/src/UseCaseHandler.cc b/source/use_case/inference_runner/src/UseCaseHandler.cc
index ac4ea47..a2b3fb7 100644
--- a/source/use_case/inference_runner/src/UseCaseHandler.cc
+++ b/source/use_case/inference_runner/src/UseCaseHandler.cc
@@ -28,6 +28,7 @@
     bool RunInferenceHandler(ApplicationContext& ctx)
     {
         auto& platform = ctx.Get<hal_platform&>("platform");
+        auto& profiler = ctx.Get<Profiler&>("profiler");
         auto& model = ctx.Get<Model&>("model");
 
         constexpr uint32_t dataPsnTxtInfStartX = 150;
@@ -67,7 +68,7 @@
                                 str_inf.c_str(), str_inf.size(),
                                 dataPsnTxtInfStartX, dataPsnTxtInfStartY, 0);
 
-        RunInference(platform, model);
+        RunInference(model, profiler);
 
         /* Erase. */
         str_inf = std::string(str_inf.size(), ' ');
@@ -75,6 +76,9 @@
                                 str_inf.c_str(), str_inf.size(),
                                 dataPsnTxtInfStartX, dataPsnTxtInfStartY, 0);
 
+        info("Final results:\n");
+        profiler.PrintProfilingResult();
+
 #if VERIFY_TEST_OUTPUT
         for (size_t outputIndex = 0; outputIndex < model.GetNumOutputs(); outputIndex++) {
             arm::app::DumpTensor(model.GetOutputTensor(outputIndex));
diff --git a/source/use_case/kws/src/MainLoop.cc b/source/use_case/kws/src/MainLoop.cc
index 24cb939..f971c30 100644
--- a/source/use_case/kws/src/MainLoop.cc
+++ b/source/use_case/kws/src/MainLoop.cc
@@ -58,6 +58,9 @@
     /* Instantiate application context. */
     arm::app::ApplicationContext caseContext;
 
+    arm::app::Profiler profiler{&platform, "kws"};
+    caseContext.Set<arm::app::Profiler&>("profiler", profiler);
+
     caseContext.Set<hal_platform&>("platform", platform);
     caseContext.Set<arm::app::Model&>("model", model);
     caseContext.Set<uint32_t>("clipIndex", 0);
diff --git a/source/use_case/kws/src/UseCaseHandler.cc b/source/use_case/kws/src/UseCaseHandler.cc
index 872d323..d2cba55 100644
--- a/source/use_case/kws/src/UseCaseHandler.cc
+++ b/source/use_case/kws/src/UseCaseHandler.cc
@@ -82,6 +82,7 @@
     bool ClassifyAudioHandler(ApplicationContext& ctx, uint32_t clipIndex, bool runAll)
     {
         auto& platform = ctx.Get<hal_platform&>("platform");
+        auto& profiler = ctx.Get<Profiler&>("profiler");
 
         constexpr uint32_t dataPsnTxtInfStartX = 20;
         constexpr uint32_t dataPsnTxtInfStartY = 40;
@@ -215,7 +216,7 @@
                      audioDataSlider.TotalStrides() + 1);
 
                 /* Run inference over this audio clip sliding window. */
-                arm::app::RunInference(platform, model);
+                arm::app::RunInference(model, profiler);
 
                 std::vector<ClassificationResult> classificationResult;
                 auto& classifier = ctx.Get<KwsClassifier&>("classifier");
@@ -243,6 +244,8 @@
                 return false;
             }
 
+            profiler.PrintProfilingResult();
+
             _IncrementAppCtxClipIdx(ctx);
 
         } while (runAll && ctx.Get<uint32_t>("clipIndex") != startClipIdx);
@@ -281,6 +284,8 @@
         constexpr uint32_t dataPsnTxtYIncr   = 16;  /* Row index increment. */
 
         platform.data_psn->set_text_color(COLOR_GREEN);
+        info("Final results:\n");
+        info("Total number of inferences: %zu\n", results.size());
 
         /* Display each result */
         uint32_t rowIdx1 = dataPsnTxtStartY1 + 2 * dataPsnTxtYIncr;
@@ -290,7 +295,7 @@
             std::string topKeyword{"<none>"};
             float score = 0.f;
 
-            if (results[i].m_resultVec.size()) {
+            if (!results[i].m_resultVec.empty()) {
                 topKeyword = results[i].m_resultVec[0].m_label;
                 score = results[i].m_resultVec[0].m_normalisedVal;
             }
@@ -305,13 +310,20 @@
                                     dataPsnTxtStartX1, rowIdx1, false);
             rowIdx1 += dataPsnTxtYIncr;
 
-            info("For timestamp: %f (inference #: %u); threshold: %f\n",
-                    results[i].m_timeStamp, results[i].m_inferenceNumber,
-                    results[i].m_threshold);
-            for (uint32_t j = 0; j < results[i].m_resultVec.size(); ++j) {
-                info("\t\tlabel @ %u: %s, score: %f\n", j,
-                    results[i].m_resultVec[j].m_label.c_str(),
-                    results[i].m_resultVec[j].m_normalisedVal);
+            if (results[i].m_resultVec.empty()) {
+                info("For timestamp: %f (inference #: %u); label: %s; threshold: %f\n",
+                     results[i].m_timeStamp, results[i].m_inferenceNumber,
+                     topKeyword.c_str(),
+                     results[i].m_threshold);
+            } else {
+                for (uint32_t j = 0; j < results[i].m_resultVec.size(); ++j) {
+                    info("For timestamp: %f (inference #: %u); label: %s, score: %f; threshold: %f\n",
+                         results[i].m_timeStamp,
+                         results[i].m_inferenceNumber,
+                         results[i].m_resultVec[j].m_label.c_str(),
+                         results[i].m_resultVec[j].m_normalisedVal,
+                         results[i].m_threshold);
+                }
             }
         }
 
diff --git a/source/use_case/kws_asr/src/MainLoop.cc b/source/use_case/kws_asr/src/MainLoop.cc
index 37146c9..95e5a8f 100644
--- a/source/use_case/kws_asr/src/MainLoop.cc
+++ b/source/use_case/kws_asr/src/MainLoop.cc
@@ -101,6 +101,9 @@
     /* Instantiate application context. */
     arm::app::ApplicationContext caseContext;
 
+    arm::app::Profiler profiler{&platform, "kws_asr"};
+    caseContext.Set<arm::app::Profiler&>("profiler", profiler);
+
     caseContext.Set<hal_platform&>("platform", platform);
     caseContext.Set<arm::app::Model&>("kwsmodel", kwsModel);
     caseContext.Set<arm::app::Model&>("asrmodel", asrModel);
diff --git a/source/use_case/kws_asr/src/UseCaseHandler.cc b/source/use_case/kws_asr/src/UseCaseHandler.cc
index c50796f..a428210 100644
--- a/source/use_case/kws_asr/src/UseCaseHandler.cc
+++ b/source/use_case/kws_asr/src/UseCaseHandler.cc
@@ -127,6 +127,7 @@
 
         KWSOutput output;
 
+        auto& profiler = ctx.Get<Profiler&>("profiler");
         auto& kwsModel = ctx.Get<Model&>("kwsmodel");
         if (!kwsModel.IsInited()) {
             printf_err("KWS model has not been initialised\n");
@@ -243,7 +244,7 @@
                  audioDataSlider.TotalStrides() + 1);
 
             /* Run inference over this audio clip sliding window. */
-            arm::app::RunInference(platform, kwsModel);
+            arm::app::RunInference(kwsModel, profiler);
 
             std::vector<ClassificationResult> kwsClassificationResult;
             auto& kwsClassifier = ctx.Get<KwsClassifier&>("kwsclassifier");
@@ -284,6 +285,8 @@
             return output;
         }
 
+        profiler.PrintProfilingResult();
+
         output.executionSuccess = true;
         return output;
     }
@@ -300,6 +303,7 @@
         constexpr uint32_t dataPsnTxtInfStartX = 20;
         constexpr uint32_t dataPsnTxtInfStartY = 40;
 
+        auto& profiler = ctx.Get<Profiler&>("profiler");
         auto& platform = ctx.Get<hal_platform&>("platform");
         platform.data_psn->clear(COLOR_BLACK);
 
@@ -389,18 +393,11 @@
             info("Inference %zu/%zu\n", audioDataSlider.Index() + 1,
                 static_cast<size_t>(ceilf(audioDataSlider.FractionalTotalStrides() + 1)));
 
-            Profiler prepProfiler{&platform, "pre-processing"};
-            prepProfiler.StartProfiling();
-
             /* Calculate MFCCs, deltas and populate the input tensor. */
             asrPrep.Invoke(asrInferenceWindow, asrInferenceWindowLen, asrInputTensor);
 
-            prepProfiler.StopProfiling();
-            std::string prepProfileResults = prepProfiler.GetResultsAndReset();
-            info("%s\n", prepProfileResults.c_str());
-
             /* Run inference over this audio clip sliding window. */
-            arm::app::RunInference(platform, asrModel);
+            arm::app::RunInference(asrModel, profiler);
 
             /* Post-process. */
             asrPostp.Invoke(asrOutputTensor, reductionAxis, !audioDataSlider.HasNext());
@@ -432,6 +429,8 @@
             return false;
         }
 
+        profiler.PrintProfilingResult();
+
         return true;
     }