MLECO-3659: Improvement for NPU PMU counters

The NPU idle count could have been erroneously high as
the counters were always running. This change utilises
callback functions to start/stop the counters only when
the inferences start/stop executing on the NPU.

Changes have been made to cache maintenance functions
called from within the NPU driver's pipeline to reduce
the overhead caused by these.

Change-Id: I69db0d3b3f3fe5b2847e15b5c3096cb1e0484176
Signed-off-by: Kshitij Sisodia <kshitij.sisodia@arm.com>
diff --git a/source/hal/source/components/npu/ethosu_profiler.c b/source/hal/source/components/npu/ethosu_profiler.c
index b3f93da..dea704c 100644
--- a/source/hal/source/components/npu/ethosu_profiler.c
+++ b/source/hal/source/components/npu/ethosu_profiler.c
@@ -1,6 +1,6 @@
 /*
- * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
- * SPDX-License-Identifier: Apache-2.0
+ * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
+ * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,13 +16,16 @@
  */
 
 #include "ethosu_profiler.h"
+#include "ethosu_cpu_cache.h"
 #include "log_macros.h"
 
 #include <string.h>
 
-extern struct ethosu_driver ethosu_drv;     /* Default Arm Ethos-U NPU device driver object */
-static ethosu_pmu_counters npu_counters;    /* NPU counter local instance */
-static const char* unit_beats = "beats";
+extern struct ethosu_driver ethosu_drv;    /* Default Arm Ethos-U NPU device driver object */
+static ethosu_pmu_counters s_npu_counters; /* NPU counter local instance */
+static uint32_t s_evt_mask = 0;            /* PMU event mask */
+
+static const char* unit_beats  = "beats";
 static const char* unit_cycles = "cycles";
 
 /**
@@ -31,7 +34,26 @@
  */
 static ethosu_pmu_counters* get_counter_instance(void)
 {
-    return &npu_counters;
+    return &s_npu_counters;
+}
+
+/**
+ * @brief Gets the PMU event mask cached in this file (not read from the driver).
+ * @return Event mask as an unsigned 32 bit integer.
+ */
+static uint32_t get_event_mask(void)
+{
+    return s_evt_mask;
+}
+
+/**
+ * @brief Caches the PMU event mask in this file; the driver is not written to.
+ * @param[in] mask  event mask as an unsigned 32 bit integer.
+ * @return none.
+ */
+static void set_event_mask(uint32_t mask)
+{
+    s_evt_mask = mask;
 }
 
 /**
@@ -49,8 +71,8 @@
 
 void ethosu_pmu_init(void)
 {
-    uint32_t i = 0;
-    uint32_t evt_mask = ETHOSU_PMU_CCNT_Msk;
+    uint32_t i                    = 0;
+    uint32_t evt_mask             = ETHOSU_PMU_CCNT_Msk;
     ethosu_pmu_counters* counters = get_counter_instance();
     memset(counters, 0, sizeof(*counters));
 
@@ -58,31 +80,31 @@
     counters->num_total_counters = ETHOSU_PROFILER_NUM_COUNTERS;
 
 #if ETHOSU_PMU_NCOUNTERS >= 4
-    counters->npu_evt_counters[0].event_type = ETHOSU_PMU_NPU_IDLE;
+    counters->npu_evt_counters[0].event_type = ETHOSU_PMU_NPU_ACTIVE;
     counters->npu_evt_counters[0].event_mask = ETHOSU_PMU_CNT1_Msk;
-    counters->npu_evt_counters[0].name = "NPU IDLE";
-    counters->npu_evt_counters[0].unit = unit_cycles;
+    counters->npu_evt_counters[0].name       = "NPU ACTIVE";
+    counters->npu_evt_counters[0].unit       = unit_cycles;
 
     counters->npu_evt_counters[1].event_type = ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED;
     counters->npu_evt_counters[1].event_mask = ETHOSU_PMU_CNT2_Msk;
-    counters->npu_evt_counters[1].name = "NPU AXI0_RD_DATA_BEAT_RECEIVED";
-    counters->npu_evt_counters[1].unit = unit_beats;
+    counters->npu_evt_counters[1].name       = "NPU AXI0_RD_DATA_BEAT_RECEIVED";
+    counters->npu_evt_counters[1].unit       = unit_beats;
 
     counters->npu_evt_counters[2].event_type = ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN;
     counters->npu_evt_counters[2].event_mask = ETHOSU_PMU_CNT3_Msk;
-    counters->npu_evt_counters[2].name = "NPU AXI0_WR_DATA_BEAT_WRITTEN";
-    counters->npu_evt_counters[2].unit = unit_beats;
+    counters->npu_evt_counters[2].name       = "NPU AXI0_WR_DATA_BEAT_WRITTEN";
+    counters->npu_evt_counters[2].unit       = unit_beats;
 
     counters->npu_evt_counters[3].event_type = ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED;
     counters->npu_evt_counters[3].event_mask = ETHOSU_PMU_CNT4_Msk;
-    counters->npu_evt_counters[3].name = "NPU AXI1_RD_DATA_BEAT_RECEIVED";
-    counters->npu_evt_counters[3].unit = unit_beats;
+    counters->npu_evt_counters[3].name       = "NPU AXI1_RD_DATA_BEAT_RECEIVED";
+    counters->npu_evt_counters[3].unit       = unit_beats;
 #else /* ETHOSU_PMU_NCOUNTERS >= 4 */
-    #error "NPU PMU expects a minimum of 4 available event triggered counters!"
+#error "NPU PMU expects a minimum of 4 available event triggered counters!"
 #endif /* ETHOSU_PMU_NCOUNTERS >= 4 */
 
 #if ETHOSU_DERIVED_NCOUNTERS >= 1
-    counters->npu_derived_counters[0].name = "NPU ACTIVE";
+    counters->npu_derived_counters[0].name = "NPU IDLE";
     counters->npu_derived_counters[0].unit = unit_cycles;
 #endif /* ETHOSU_DERIVED_NCOUNTERS >= 1 */
 
@@ -91,16 +113,17 @@
         evt_mask |= counters->npu_evt_counters[i].event_mask;
     }
 
+    set_event_mask(evt_mask);
+
     /* Reset overflow status. */
-    ETHOSU_PMU_Set_CNTR_OVS(&ethosu_drv, evt_mask);
+    ETHOSU_PMU_Set_CNTR_OVS(&ethosu_drv, get_event_mask());
 
     /* Enable PMU. */
     ETHOSU_PMU_Enable(&ethosu_drv);
 
     /* Enable counters for cycle and event counters. */
-    ETHOSU_PMU_CNTR_Disable(&ethosu_drv, evt_mask);
+    ETHOSU_PMU_CNTR_Disable(&ethosu_drv, get_event_mask());
     ethosu_pmu_reset_counters();
-    ETHOSU_PMU_CNTR_Enable(&ethosu_drv, evt_mask);
 }
 
 /**
@@ -120,15 +143,14 @@
 ethosu_pmu_counters ethosu_get_pmu_counters(void)
 {
     ethosu_pmu_counters* counters = get_counter_instance();
-    uint32_t i = 0;
+    uint32_t i                    = 0;
 
     /* Event counters */
     for (i = 0; i < ETHOSU_PMU_NCOUNTERS; ++i) {
         if (counter_overflow(counters->npu_evt_counters[i].event_mask)) {
             warn("Counter overflow detected for %s.\n", counters->npu_evt_counters[i].name);
         }
-        counters->npu_evt_counters[i].counter_value =
-            ETHOSU_PMU_Get_EVCNTR(&ethosu_drv, i);
+        counters->npu_evt_counters[i].counter_value = ETHOSU_PMU_Get_EVCNTR(&ethosu_drv, i);
     }
 
     /* Total cycle count */
@@ -136,7 +158,9 @@
 
     /* Derived counters */
 #if ETHOSU_DERIVED_NCOUNTERS >= 1
-    if (counters->npu_evt_counters[0].event_type == ETHOSU_PMU_NPU_IDLE) {
+    if (counters->npu_evt_counters[0].event_type == ETHOSU_PMU_NPU_ACTIVE) {
+
+        /* Compute the idle count */
         counters->npu_derived_counters[0].counter_value =
             counters->npu_total_ccnt - counters->npu_evt_counters[0].counter_value;
     }
@@ -144,3 +168,17 @@
 
     return *counters;
 }
+
+void ethosu_inference_begin(struct ethosu_driver* drv, void* userArg)
+{
+    UNUSED(userArg);
+    ethosu_clear_cache_states();
+    ETHOSU_PMU_CNTR_Disable(drv, get_event_mask());
+    ETHOSU_PMU_CNTR_Enable(drv, get_event_mask());
+}
+
+void ethosu_inference_end(struct ethosu_driver* drv, void* userArg)
+{
+    UNUSED(userArg);
+    ETHOSU_PMU_CNTR_Disable(drv, get_event_mask());
+}