Use lbl-profiler to set up PMU and PMU events

Adapt the layer-by-layer profiler to handle PMU event configuration,
PMU setup, and retrieval of PMU counter values.

Adapt the inference process application to support the lbl-profiler
PMU setup, and to retrieve the PMU counter values from the profiler
and save them in the InferenceJob struct.

Change-Id: I1667a5b11c43c54e7d28232b594dd118bf3f79a8
diff --git a/applications/inference_process/include/inference_process.hpp b/applications/inference_process/include/inference_process.hpp
index db57811..1679e19 100644
--- a/applications/inference_process/include/inference_process.hpp
+++ b/applications/inference_process/include/inference_process.hpp
@@ -43,7 +43,7 @@
     std::vector<DataPtr> expectedOutput;
     size_t numBytesToPrint;
     std::vector<uint8_t> pmuEventConfig;
-    uint32_t pmuCycleCounterEnable;
+    bool pmuCycleCounterEnable;
     std::vector<uint32_t> pmuEventCount;
     uint64_t pmuCycleCounterCount;
 
@@ -55,7 +55,7 @@
                  const std::vector<DataPtr> &expectedOutput,
                  size_t numBytesToPrint,
                  const std::vector<uint8_t> &pmuEventConfig,
-                 const uint32_t pmuCycleCounterEnable);
+                 const bool pmuCycleCounterEnable);
 
     void invalidate();
     void clean();
diff --git a/applications/inference_process/src/inference_process.cpp b/applications/inference_process/src/inference_process.cpp
index 4990e62..94c62d3 100644
--- a/applications/inference_process/src/inference_process.cpp
+++ b/applications/inference_process/src/inference_process.cpp
@@ -80,10 +80,10 @@
 };
 
 void print_output_data(TfLiteTensor *output, size_t bytesToPrint) {
-    constexpr auto crc = Crc();
+    constexpr auto crc          = Crc();
     const uint32_t output_crc32 = crc.crc32(output->data.data, output->bytes);
-    const int numBytesToPrint = min(output->bytes, bytesToPrint);
-    int dims_size             = output->dims->size;
+    const int numBytesToPrint   = min(output->bytes, bytesToPrint);
+    int dims_size               = output->dims->size;
     LOG("{\n");
     LOG("\"dims\": [%d,", dims_size);
     for (int i = 0; i < output->dims->size - 1; ++i) {
@@ -91,15 +91,14 @@
     }
     LOG("%d],\n", output->dims->data[dims_size - 1]);
     LOG("\"data_address\": \"%08" PRIx32 "\",\n", (uint32_t)output->data.data);
-    if (numBytesToPrint)
-    {
+    if (numBytesToPrint) {
         LOG("\"crc32\": \"%08" PRIx32 "\",\n", output_crc32);
         LOG("\"data\":\"");
         for (int i = 0; i < numBytesToPrint - 1; ++i) {
             /*
-            * Workaround an issue when compiling with GCC where by
-            * printing only a '\n' the produced global output is wrong.
-            */
+             * Workaround an issue when compiling with GCC where by
+             * printing only a '\n' the produced global output is wrong.
+             */
             if (i % 15 == 0 && i != 0) {
                 LOG("0x%02x,\n", output->data.uint8[i]);
             } else {
@@ -107,8 +106,7 @@
             }
         }
         LOG("0x%02x\"\n", output->data.uint8[numBytesToPrint - 1]);
-    }
-    else {
+    } else {
         LOG("\"crc32\": \"%08" PRIx32 "\"\n", output_crc32);
     }
     LOG("}");
@@ -156,7 +154,7 @@
                            const vector<DataPtr> &_expectedOutput,
                            size_t _numBytesToPrint,
                            const vector<uint8_t> &_pmuEventConfig,
-                           const uint32_t _pmuCycleCounterEnable) :
+                           const bool _pmuCycleCounterEnable) :
     name(_name),
     networkModel(_networkModel), input(_input), output(_output), expectedOutput(_expectedOutput),
     numBytesToPrint(_numBytesToPrint), pmuEventConfig(_pmuEventConfig), pmuCycleCounterEnable(_pmuCycleCounterEnable),
@@ -250,7 +248,7 @@
     // Create the TFL micro interpreter
     tflite::AllOpsResolver resolver;
 #ifdef LAYER_BY_LAYER_PROFILER
-    tflite::LayerByLayerProfiler profiler;
+    tflite::LayerByLayerProfiler profiler(job.pmuEventConfig, job.pmuCycleCounterEnable);
 #else
     tflite::ArmProfiler profiler;
 #endif
@@ -308,9 +306,12 @@
 
     LOG("Inference runtime: %u cycles\n", (unsigned int)profiler.GetTotalTicks());
 
-    if (job.pmuCycleCounterEnable != 0) {
-        job.pmuCycleCounterCount = profiler.GetTotalTicks();
+#ifdef LAYER_BY_LAYER_PROFILER
+    if (job.pmuCycleCounterEnable) {
+        job.pmuCycleCounterCount = profiler.GetPmuCycleCounterCount();
     }
+    job.pmuEventCount.assign(profiler.GetPmuEventCount().begin(), profiler.GetPmuEventCount().end());
+#endif
 
     // Copy output data
     if (job.output.size() > 0) {
diff --git a/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp b/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp
index 8e8dc0f..37bd868 100644
--- a/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp
+++ b/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp
@@ -23,6 +23,7 @@
 #include "tensorflow/lite/kernels/internal/compatibility.h"
 #include <memory>
 #include <pmu_ethosu.h>
+#include <vector>
 
 // NOTE: This profiler only works on systems with 1 NPU due to the use of
 // ethosu_reserve_driver().
@@ -30,22 +31,32 @@
 class LayerByLayerProfiler : public MicroProfiler {
 public:
     enum Backend { PRINTF, EVENT_RECORDER };
-    LayerByLayerProfiler(size_t max_events = 200,
-                         Backend backend   = PRINTF,
-                         int32_t event_id  = EventID(EventLevelError, EvtStatistics_No, EventRecordNone));
+    LayerByLayerProfiler(const std::vector<uint8_t> &event_config = {},
+                         bool pmu_cycle_counter_enable            = true,
+                         size_t max_events                        = 200,
+                         Backend backend                          = PRINTF,
+                         int32_t event_id = EventID(EventLevelError, EvtStatistics_No, EventRecordNone));
     uint32_t BeginEvent(const char *tag);
     void EndEvent(uint32_t event_handle);
     uint64_t GetTotalTicks() const;
     void Log() const;
 
+    uint64_t GetPmuCycleCounterCount() const;
+    const std::vector<uint32_t> &GetPmuEventCount() const;
+
 private:
-    size_t max_events_;
     std::unique_ptr<const char *[]> tags_;
     std::unique_ptr<uint64_t[]> start_ticks_;
     std::unique_ptr<uint64_t[]> end_ticks_;
 
-    Backend backend_;
-    int32_t event_id_;
+    std::vector<uint8_t> pmu_event_config;
+    std::vector<uint32_t> pmu_event_count;
+    bool pmu_cycle_counter_enable;
+    uint64_t pmu_cycle_counter_count;
+
+    size_t max_events_;
+    Backend backend;
+    int32_t event_id;
     size_t num_events_;
 
     TF_LITE_REMOVE_VIRTUAL_DELETE;
diff --git a/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp b/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp
index 0211414..a5b8e5a 100644
--- a/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp
+++ b/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp
@@ -39,17 +39,18 @@
 
 namespace tflite {
 
-LayerByLayerProfiler::LayerByLayerProfiler(size_t max_events, Backend backend, int32_t event_id) :
-    max_events_(max_events), backend_(backend), event_id_(event_id), num_events_(0) {
+LayerByLayerProfiler::LayerByLayerProfiler(const std::vector<uint8_t> &event_config,
+                                           bool pmu_cycle_counter_enable,
+                                           size_t max_events,
+                                           Backend backend,
+                                           int32_t event_id) :
+    pmu_event_config(event_config),
+    pmu_event_count(), pmu_cycle_counter_enable(pmu_cycle_counter_enable), pmu_cycle_counter_count(0),
+    max_events_(max_events), backend(backend), event_id(event_id), num_events_(0) {
 
-    tags_        = std::make_unique<const char *[]>(max_events_);
-    start_ticks_ = std::make_unique<uint64_t[]>(max_events_);
-    end_ticks_   = std::make_unique<uint64_t[]>(max_events_);
-
-    struct ethosu_driver *drv = ethosu_reserve_driver();
-    ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk);
-    ETHOSU_PMU_CYCCNT_Reset(drv);
-    ethosu_release_driver(drv);
+    tags_        = std::make_unique<const char *[]>(max_events);
+    start_ticks_ = std::make_unique<uint64_t[]>(max_events);
+    end_ticks_   = std::make_unique<uint64_t[]>(max_events);
 }
 
 // NOTE: THIS PROFILER ONLY WORKS ON SYSTEMS WITH 1 NPU
@@ -62,17 +63,44 @@
     tags_[num_events_] = tag;
 
     if (strcmp("ethos-u", tag) == 0) {
-        struct ethosu_driver *ethosu_drv = ethosu_reserve_driver();
-        ETHOSU_PMU_CYCCNT_Reset(ethosu_drv);
-        ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(ethosu_drv, ETHOSU_PMU_NPU_ACTIVE);
-        ETHOSU_PMU_PMCCNTR_CFG_Set_Stop_Event(ethosu_drv, ETHOSU_PMU_NPU_IDLE);
-        start_ticks_[num_events_] = GetCurrentEthosuTicks(ethosu_drv);
-        ethosu_release_driver(ethosu_drv);
+        struct ethosu_driver *drv = ethosu_reserve_driver();
+        size_t numEventCounters   = ETHOSU_PMU_Get_NumEventCounters();
+
+        if (pmu_event_config.size() > numEventCounters) {
+            LOG_WARN("PMU event config list is bigger (%lu) than available PMU event counters (%lu)",
+                     pmu_event_config.size(),
+                     numEventCounters);
+            LOG_WARN("PMU event config list will be truncated");
+            pmu_event_config.resize(numEventCounters);
+        }
+        // Enable PMU
+        ETHOSU_PMU_Enable(drv);
+
+        for (size_t i = 0; i < pmu_event_config.size(); i++) {
+            ETHOSU_PMU_Set_EVTYPER(drv, i, static_cast<ethosu_pmu_event_type>(pmu_event_config[i]));
+        }
+
+        ETHOSU_PMU_CNTR_Enable(drv, (1 << pmu_event_config.size()) - 1);
+        ETHOSU_PMU_EVCNTR_ALL_Reset(drv);
+
+        // Configure the cycle counter
+        if (pmu_cycle_counter_enable) {
+            ETHOSU_PMU_CNTR_Disable(drv, ETHOSU_PMU_CCNT_Msk);
+            ETHOSU_PMU_CYCCNT_Reset(drv);
+
+            ETHOSU_PMU_PMCCNTR_CFG_Set_Stop_Event(drv, ETHOSU_PMU_NPU_IDLE);
+            ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(drv, ETHOSU_PMU_NPU_ACTIVE);
+
+            ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk);
+        }
+        start_ticks_[num_events_] = 0; // Hardware cycle counter has been reset above, thus starts at 0
+        ethosu_release_driver(drv);
     } else {
         start_ticks_[num_events_] = GetCurrentTimeTicks();
     }
 
-    end_ticks_[num_events_] = start_ticks_[num_events_] - 1;
+    end_ticks_[num_events_] =
+        start_ticks_[num_events_]; // NOTE: In case an EndEvent() doesn't trigger, cycles reports as 0
     return num_events_++;
 }
 
@@ -81,19 +109,42 @@
     TFLITE_DCHECK(event_handle < max_events_);
 
     if (strcmp("ethos-u", tags_[event_handle]) == 0) {
-        struct ethosu_driver *ethosu_drv = ethosu_reserve_driver();
-        end_ticks_[event_handle]         = GetCurrentEthosuTicks(ethosu_drv);
-        ethosu_release_driver(ethosu_drv);
+        struct ethosu_driver *drv = ethosu_reserve_driver();
+
+        end_ticks_[event_handle] = GetCurrentEthosuTicks(drv);
+        // Get the cycle count
+        if (pmu_cycle_counter_enable) {
+            pmu_cycle_counter_count = end_ticks_[event_handle];
+        }
+
+        // Save the PMU counter values
+        // NOTE: If multiple ethos-u layers, only the latest will be saved
+        pmu_event_count.resize(pmu_event_config.size());
+        for (size_t i = 0; i < pmu_event_config.size(); i++) {
+            pmu_event_count[i] = ETHOSU_PMU_Get_EVCNTR(drv, i);
+        }
+
+        // Shut down the PMU
+        ETHOSU_PMU_Disable(drv);
+
+        ethosu_release_driver(drv);
     } else {
         end_ticks_[event_handle] = GetCurrentTimeTicks();
     }
 
-    if (backend_ == PRINTF) {
-        LOG("%s : cycle_cnt : %" PRIu64 " cycles\n",
-            tags_[event_handle],
-            end_ticks_[event_handle] - start_ticks_[event_handle]);
+    if (backend == PRINTF) {
+        if (strcmp("ethos-u", tags_[event_handle]) == 0) {
+            for (size_t i = 0; i < pmu_event_count.size(); i++) {
+                LOG("ethos-u : ethosu_pmu_cntr%lu : %u\n", i, pmu_event_count[i]);
+            }
+            LOG("ethos-u : cycle_cnt : %" PRIu64 " cycles\n", pmu_cycle_counter_count);
+        } else {
+            LOG("%s : cycle_cnt : %" PRIu64 " cycles\n",
+                tags_[event_handle],
+                end_ticks_[event_handle] - start_ticks_[event_handle]);
+        }
     } else {
-        EventRecord2(event_id_, (int32_t)event_handle, end_ticks_[event_handle] - start_ticks_[event_handle]);
+        EventRecord2(event_id, (int32_t)event_handle, end_ticks_[event_handle] - start_ticks_[event_handle]);
     }
 }
 
@@ -107,10 +158,18 @@
     return ticks;
 }
 
+uint64_t LayerByLayerProfiler::GetPmuCycleCounterCount() const {
+    return pmu_cycle_counter_count;
+}
+
+const std::vector<uint32_t> &LayerByLayerProfiler::GetPmuEventCount() const {
+    return pmu_event_count;
+}
+
 void LayerByLayerProfiler::Log() const {
 
 #if !defined(TF_LITE_STRIP_ERROR_STRINGS)
-    if (backend_ == PRINTF) {
+    if (backend == PRINTF) {
         for (size_t i = 0; i < num_events_; ++i) {
             uint64_t ticks = end_ticks_[i] - start_ticks_[i];
             LOG("%s took %" PRIu64 " cycles", tags_[i], ticks);