MLECO-3659: Improvement for NPU PMU counters

The NPU idle count could have been erroneously high as
the counters were always running. This change utilises
callback functions to start/stop the counters only when
the inferences start/stop executing on the NPU.

Changes have been made to cache maintenance functions
called from within the NPU driver's pipeline to reduce
the overhead caused by these.

Change-Id: I69db0d3b3f3fe5b2847e15b5c3096cb1e0484176
Signed-off-by: Kshitij Sisodia <kshitij.sisodia@arm.com>
diff --git a/build_default.py b/build_default.py
index 387e9ba..e35aa1c 100755
--- a/build_default.py
+++ b/build_default.py
@@ -122,6 +122,7 @@
         + f" -DCMAKE_TOOLCHAIN_FILE={cmake_toolchain_file}"
         + f" -DETHOS_U_NPU_ID={ethos_u_cfg.ethos_u_npu_id}"
         + f" -DETHOS_U_NPU_CONFIG_ID={ethos_u_cfg.ethos_u_config_id}"
+        + f" -DTENSORFLOW_LITE_MICRO_CLEAN_DOWNLOADS=ON"
     )
 
     logging.info(f"\n\n\n{cmake_command}\n\n\n")
diff --git a/source/hal/source/components/npu/ethosu_cpu_cache.c b/source/hal/source/components/npu/ethosu_cpu_cache.c
index 0840971..d5f5e47 100644
--- a/source/hal/source/components/npu/ethosu_cpu_cache.c
+++ b/source/hal/source/components/npu/ethosu_cpu_cache.c
@@ -21,34 +21,104 @@
 #include "ethosu_driver.h"          /* Arm Ethos-U driver header */
 #include "log_macros.h"             /* Logging macros */
 
+/** Structure to maintain data cache states. */
+typedef struct _cpu_cache_state {
+    uint32_t dcache_invalidated : 1;
+    uint32_t dcache_cleaned : 1;
+} cpu_cache_state;
+
+/** Static CPU cache state object.
+ * @note The logic around flipping these states is based on the driver
+ *       calling the functions in this sequence:
+ *
+ *       Cache flush (ethosu_flush_dcache)
+ *                  ↓
+ *       Start inference (ethosu_inference_begin)
+ *                  ↓
+ *       Inference (ethosu_dev_run_command_stream)
+ *                  ↓
+ *       End inference (ethosu_inference_end)
+ *                  ↓
+ *       Cache invalidate (ethosu_invalidate_dcache)
+ **/
+static cpu_cache_state s_cache_state = {.dcache_cleaned = 0, .dcache_invalidated = 0};
+
+/**
+ * @brief   Gets the current CPU cache state.
+ * @return  Pointer to the CPU cache state object.
+ */
+static cpu_cache_state* ethosu_get_cpu_cache_state(void)
+{
+    return &s_cache_state;
+}
+
+void ethosu_clear_cache_states(void)
+{
+    cpu_cache_state* const state = ethosu_get_cpu_cache_state();
+    trace("Clearing cache state members\n");
+    state->dcache_invalidated = 0;
+    state->dcache_cleaned     = 0;
+}
+
 void ethosu_flush_dcache(uint32_t *p, size_t bytes)
 {
-#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
-    if (SCB->CCR & SCB_CCR_DC_Msk) {
-        if (p) {
-            SCB_CleanDCache_by_Addr((void *) p, (int32_t) bytes);
-        } else {
-            SCB_CleanDCache();
-        }
-    }
-#else
     UNUSED(p);
     UNUSED(bytes);
+#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
+    cpu_cache_state* const state = ethosu_get_cpu_cache_state();
+    if (SCB->CCR & SCB_CCR_DC_Msk) {
+
+        /**
+         * @note We could choose to call the `SCB_CleanDCache_by_Addr` function
+         *       here, but the sizes which this function is called for, can
+         *       cause unnecessary delays. It's worth noting that this function
+         *       is called from the Arm Ethos-U NPU driver repeatedly for each
+         *       region it accesses. This could even be RO memory which does
+         *       not need cache maintenance, along with parts of the input and
+         *       output tensors which rightly need to be cleaned. Therefore, to
+         *       reduce overhead of repeated calls for large memory sizes, we
+         *       call the clean and invalidation functions for whole cache.
+         *
+         *       If the neural network to be executed is completely falling
+         *       onto the NPU, consider disabling the data cache altogether
+         *       for the duration of the inference to further reduce the cache
+         *       maintenance burden in these functions.
+         */
+
+        /** Clean the cache if it hasn't been cleaned already  */
+        if (!state->dcache_cleaned) {
+            trace("Cleaning data cache\n");
+            SCB_CleanDCache();
+
+            /** Assert the cache cleaned state and clear the invalidation
+             *  state. */
+            state->dcache_cleaned     = 1;
+            state->dcache_invalidated = 0;
+        }
+    }
 #endif /* defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U) */
 }
 
 void ethosu_invalidate_dcache(uint32_t *p, size_t bytes)
 {
-#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
-    if (SCB->CCR & SCB_CCR_DC_Msk) {
-        if (p) {
-            SCB_InvalidateDCache_by_Addr((void *) p, (int32_t) bytes);
-        } else {
-            SCB_InvalidateDCache();
-        }
-    }
-#else
     UNUSED(p);
     UNUSED(bytes);
+#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
+    cpu_cache_state* const state = ethosu_get_cpu_cache_state();
+    if (SCB->CCR & SCB_CCR_DC_Msk) {
+        /**
+         * See note in ethosu_flush_dcache function for why we invalidate the
+         * whole cache instead of invalidating specific address ranges.
+         **/
+        if (!state->dcache_invalidated) {
+            trace("Invalidating data cache\n");
+            SCB_InvalidateDCache();
+
+            /** Assert the cache invalidation state and clear the clean
+             *  state. */
+            state->dcache_invalidated = 1;
+            state->dcache_cleaned     = 0;
+        }
+    }
 #endif /* defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U) */
 }
diff --git a/source/hal/source/components/npu/ethosu_profiler.c b/source/hal/source/components/npu/ethosu_profiler.c
index b3f93da..dea704c 100644
--- a/source/hal/source/components/npu/ethosu_profiler.c
+++ b/source/hal/source/components/npu/ethosu_profiler.c
@@ -1,6 +1,6 @@
 /*
- * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
- * SPDX-License-Identifier: Apache-2.0
+ * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
+ * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,13 +16,16 @@
  */
 
 #include "ethosu_profiler.h"
+#include "ethosu_cpu_cache.h"
 #include "log_macros.h"
 
 #include <string.h>
 
-extern struct ethosu_driver ethosu_drv;     /* Default Arm Ethos-U NPU device driver object */
-static ethosu_pmu_counters npu_counters;    /* NPU counter local instance */
-static const char* unit_beats = "beats";
+extern struct ethosu_driver ethosu_drv;    /* Default Arm Ethos-U NPU device driver object */
+static ethosu_pmu_counters s_npu_counters; /* NPU counter local instance */
+static uint32_t s_evt_mask = 0;            /* PMU event mask */
+
+static const char* unit_beats  = "beats";
 static const char* unit_cycles = "cycles";
 
 /**
@@ -31,7 +34,26 @@
  */
 static ethosu_pmu_counters* get_counter_instance(void)
 {
-    return &npu_counters;
+    return &s_npu_counters;
+}
+
+/**
+ * @brief Gets the locally stored mask of enabled PMU events.
+ * @return Event mask as an unsigned 32 bit integer.
+ */
+static uint32_t get_event_mask(void)
+{
+    return s_evt_mask;
+}
+
+/**
+ * @brief Stores the mask of enabled PMU events for later use.
+ * @param[in] mask  event mask as an unsigned 32 bit integer.
+ * @return none.
+ */
+static void set_event_mask(uint32_t mask)
+{
+    s_evt_mask = mask;
 }
 
 /**
@@ -49,8 +71,8 @@
 
 void ethosu_pmu_init(void)
 {
-    uint32_t i = 0;
-    uint32_t evt_mask = ETHOSU_PMU_CCNT_Msk;
+    uint32_t i                    = 0;
+    uint32_t evt_mask             = ETHOSU_PMU_CCNT_Msk;
     ethosu_pmu_counters* counters = get_counter_instance();
     memset(counters, 0, sizeof(*counters));
 
@@ -58,31 +80,31 @@
     counters->num_total_counters = ETHOSU_PROFILER_NUM_COUNTERS;
 
 #if ETHOSU_PMU_NCOUNTERS >= 4
-    counters->npu_evt_counters[0].event_type = ETHOSU_PMU_NPU_IDLE;
+    counters->npu_evt_counters[0].event_type = ETHOSU_PMU_NPU_ACTIVE;
     counters->npu_evt_counters[0].event_mask = ETHOSU_PMU_CNT1_Msk;
-    counters->npu_evt_counters[0].name = "NPU IDLE";
-    counters->npu_evt_counters[0].unit = unit_cycles;
+    counters->npu_evt_counters[0].name       = "NPU ACTIVE";
+    counters->npu_evt_counters[0].unit       = unit_cycles;
 
     counters->npu_evt_counters[1].event_type = ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED;
     counters->npu_evt_counters[1].event_mask = ETHOSU_PMU_CNT2_Msk;
-    counters->npu_evt_counters[1].name = "NPU AXI0_RD_DATA_BEAT_RECEIVED";
-    counters->npu_evt_counters[1].unit = unit_beats;
+    counters->npu_evt_counters[1].name       = "NPU AXI0_RD_DATA_BEAT_RECEIVED";
+    counters->npu_evt_counters[1].unit       = unit_beats;
 
     counters->npu_evt_counters[2].event_type = ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN;
     counters->npu_evt_counters[2].event_mask = ETHOSU_PMU_CNT3_Msk;
-    counters->npu_evt_counters[2].name = "NPU AXI0_WR_DATA_BEAT_WRITTEN";
-    counters->npu_evt_counters[2].unit = unit_beats;
+    counters->npu_evt_counters[2].name       = "NPU AXI0_WR_DATA_BEAT_WRITTEN";
+    counters->npu_evt_counters[2].unit       = unit_beats;
 
     counters->npu_evt_counters[3].event_type = ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED;
     counters->npu_evt_counters[3].event_mask = ETHOSU_PMU_CNT4_Msk;
-    counters->npu_evt_counters[3].name = "NPU AXI1_RD_DATA_BEAT_RECEIVED";
-    counters->npu_evt_counters[3].unit = unit_beats;
+    counters->npu_evt_counters[3].name       = "NPU AXI1_RD_DATA_BEAT_RECEIVED";
+    counters->npu_evt_counters[3].unit       = unit_beats;
 #else /* ETHOSU_PMU_NCOUNTERS >= 4 */
-    #error "NPU PMU expects a minimum of 4 available event triggered counters!"
+#error "NPU PMU expects a minimum of 4 available event triggered counters!"
 #endif /* ETHOSU_PMU_NCOUNTERS >= 4 */
 
 #if ETHOSU_DERIVED_NCOUNTERS >= 1
-    counters->npu_derived_counters[0].name = "NPU ACTIVE";
+    counters->npu_derived_counters[0].name = "NPU IDLE";
     counters->npu_derived_counters[0].unit = unit_cycles;
 #endif /* ETHOSU_DERIVED_NCOUNTERS >= 1 */
 
@@ -91,16 +113,17 @@
         evt_mask |= counters->npu_evt_counters[i].event_mask;
     }
 
+    set_event_mask(evt_mask);
+
     /* Reset overflow status. */
-    ETHOSU_PMU_Set_CNTR_OVS(&ethosu_drv, evt_mask);
+    ETHOSU_PMU_Set_CNTR_OVS(&ethosu_drv, get_event_mask());
 
     /* Enable PMU. */
     ETHOSU_PMU_Enable(&ethosu_drv);
 
     /* Enable counters for cycle and event counters. */
-    ETHOSU_PMU_CNTR_Disable(&ethosu_drv, evt_mask);
+    ETHOSU_PMU_CNTR_Disable(&ethosu_drv, get_event_mask());
     ethosu_pmu_reset_counters();
-    ETHOSU_PMU_CNTR_Enable(&ethosu_drv, evt_mask);
 }
 
 /**
@@ -120,15 +143,14 @@
 ethosu_pmu_counters ethosu_get_pmu_counters(void)
 {
     ethosu_pmu_counters* counters = get_counter_instance();
-    uint32_t i = 0;
+    uint32_t i                    = 0;
 
     /* Event counters */
     for (i = 0; i < ETHOSU_PMU_NCOUNTERS; ++i) {
         if (counter_overflow(counters->npu_evt_counters[i].event_mask)) {
             warn("Counter overflow detected for %s.\n", counters->npu_evt_counters[i].name);
         }
-        counters->npu_evt_counters[i].counter_value =
-            ETHOSU_PMU_Get_EVCNTR(&ethosu_drv, i);
+        counters->npu_evt_counters[i].counter_value = ETHOSU_PMU_Get_EVCNTR(&ethosu_drv, i);
     }
 
     /* Total cycle count */
@@ -136,7 +158,9 @@
 
     /* Derived counters */
 #if ETHOSU_DERIVED_NCOUNTERS >= 1
-    if (counters->npu_evt_counters[0].event_type == ETHOSU_PMU_NPU_IDLE) {
+    if (counters->npu_evt_counters[0].event_type == ETHOSU_PMU_NPU_ACTIVE) {
+
+        /* Compute the idle count */
         counters->npu_derived_counters[0].counter_value =
             counters->npu_total_ccnt - counters->npu_evt_counters[0].counter_value;
     }
@@ -144,3 +168,17 @@
 
     return *counters;
 }
+
+void ethosu_inference_begin(struct ethosu_driver* drv, void* userArg)
+{
+    UNUSED(userArg);
+    ethosu_clear_cache_states();
+    ETHOSU_PMU_CNTR_Disable(drv, get_event_mask());
+    ETHOSU_PMU_CNTR_Enable(drv, get_event_mask());
+}
+
+void ethosu_inference_end(struct ethosu_driver* drv, void* userArg)
+{
+    UNUSED(userArg);
+    ETHOSU_PMU_CNTR_Disable(drv, get_event_mask());
+}
diff --git a/source/hal/source/components/npu/include/ethosu_cpu_cache.h b/source/hal/source/components/npu/include/ethosu_cpu_cache.h
index faf26c2..d5de3d5 100644
--- a/source/hal/source/components/npu/include/ethosu_cpu_cache.h
+++ b/source/hal/source/components/npu/include/ethosu_cpu_cache.h
@@ -21,6 +21,11 @@
 #include <stddef.h>
 
 /**
+ * @brief   Clears all the cache state members.
+ */
+void ethosu_clear_cache_states(void);
+
+/**
  * @brief   Flush/clean the data cache by address and size. Passing NULL as p argument
  *          expects the whole cache to be flushed.
  * @param[in]   p       Pointer to the start address.