IVGCVSW-4780 Add QoS to AndroidNNDriver

 * Add model priority to ArmnnPreparedModel_1_3
 * Add RequestThread_1_3 to allow execution based on model priority (see the sketch below)
 * Add RETIRE_RATE to Android.mk so the retire rate of the priority queues can be configured at build time
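
The request thread drains three queues, one per V1_3::Priority. To keep
higher priorities from starving lower ones, at most RETIRE_RATE messages
are retired from a queue before the next queue down gets a turn. Below is
a minimal standalone sketch of this policy, assuming plain ints in place
of ThreadMsg pointers and a local kRetireRate in place of the RETIRE_RATE
build flag (default 3 in Android.mk):

    // Standalone sketch of the retire-rate scheduling implemented in
    // RequestThread_1_3::Process(). Illustration only: ints stand in
    // for ThreadMsg pointers, kRetireRate for the RETIRE_RATE flag.
    #include <iostream>
    #include <queue>

    constexpr int kRetireRate = 3;

    int main()
    {
        std::queue<int> high;
        std::queue<int> medium;
        std::queue<int> low;
        for (int i = 0; i < 4; ++i)
        {
            high.push(i);
            medium.push(i);
            low.push(i);
        }

        int highCount   = 0;
        int mediumCount = 0;
        while (!high.empty() || !medium.empty() || !low.empty())
        {
            if (!high.empty() && highCount < kRetireRate)
            {
                // Service high priority until it hits the retire rate
                std::cout << "H" << high.front() << " ";
                high.pop();
                ++highCount;
            }
            else if (!medium.empty() && mediumCount < kRetireRate)
            {
                // High queue empty or retired: medium gets a turn
                std::cout << "M" << medium.front() << " ";
                medium.pop();
                ++mediumCount;
                highCount = 0;
            }
            else if (!low.empty())
            {
                // Both upper queues empty or retired: low gets a turn
                std::cout << "L" << low.front() << " ";
                low.pop();
                highCount   = 0;
                mediumCount = 0;
            }
            else
            {
                // All remaining messages are blocked by the retire rate
                highCount   = 0;
                mediumCount = 0;
            }
        }
        std::cout << std::endl; // prints: H0 H1 H2 M0 H3 M1 M2 L0 M3 L1 L2 L3
        return 0;
    }

With all three queues kept full, each cycle retires twelve high, three
medium and one low priority message, so no priority level is starved
outright.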

Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: Ic5f4309249b744c2a8f625c986eede381a26028b
diff --git a/1.3/ArmnnDriver.hpp b/1.3/ArmnnDriver.hpp
index 798c438..b6b55fa 100644
--- a/1.3/ArmnnDriver.hpp
+++ b/1.3/ArmnnDriver.hpp
@@ -206,7 +206,8 @@
                                                       model,
                                                       cb,
                                                       model.relaxComputationFloat32toFloat16
-                                                      && m_Options.GetFp16Enabled());
+                                                      && m_Options.GetFp16Enabled(),
+                                                      priority);
     }
 
     Return<void> getSupportedExtensions(getSupportedExtensions_cb cb)
diff --git a/1.3/ArmnnDriverImpl.cpp b/1.3/ArmnnDriverImpl.cpp
index 4b2ff14..6168c9d 100644
--- a/1.3/ArmnnDriverImpl.cpp
+++ b/1.3/ArmnnDriverImpl.cpp
@@ -101,7 +101,8 @@
        const DriverOptions& options,
        const V1_3::Model& model,
        const sp<V1_3::IPreparedModelCallback>& cb,
-       bool float32ToFloat16)
+       bool float32ToFloat16,
+       V1_3::Priority priority)
 {
     ALOGV("ArmnnDriverImpl::prepareArmnnModel_1_3()");
 
@@ -204,7 +205,8 @@
                     runtime.get(),
                     model,
                     options.GetRequestInputsAndOutputsDumpDir(),
-                    options.IsGpuProfilingEnabled()));
+                    options.IsGpuProfilingEnabled(),
+                    priority));
 
     // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
     // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
diff --git a/1.3/ArmnnDriverImpl.hpp b/1.3/ArmnnDriverImpl.hpp
index 8a665ea..2b39d4e 100644
--- a/1.3/ArmnnDriverImpl.hpp
+++ b/1.3/ArmnnDriverImpl.hpp
@@ -30,7 +30,8 @@
                                                            const DriverOptions& options,
                                                            const V1_3::Model& model,
                                                            const android::sp<V1_3::IPreparedModelCallback>& cb,
-                                                           bool float32ToFloat16 = false);
+                                                           bool float32ToFloat16 = false,
+                                                           V1_3::Priority priority = V1_3::Priority::MEDIUM);
 
     static Return<void> getCapabilities_1_3(const armnn::IRuntimePtr& runtime,
                                             V1_3::IDevice::getCapabilities_1_3_cb cb);
diff --git a/Android.mk b/Android.mk
index bf8bc65..cf37efc 100644
--- a/Android.mk
+++ b/Android.mk
@@ -72,6 +72,9 @@
 ARMNN_INCLUDE_LIBOPENCL := 0
 endif
 
+# Variable to control the retire rate of the priority queues
+RETIRE_RATE := 3
+
 #######################
 # libarmnn-driver@1.0 #
 #######################
@@ -486,6 +489,9 @@
         -DARMNNREF_ENABLED
 endif # ARMNN_REF_ENABLED == 1
 
+LOCAL_CFLAGS += \
+        -DRETIRE_RATE=$(RETIRE_RATE)
+
 LOCAL_SRC_FILES := \
         1.0/ArmnnDriverImpl.cpp \
         1.0/HalPolicy.cpp \
@@ -504,6 +510,7 @@
         DriverOptions.cpp \
         ModelToINetworkConverter.cpp \
         RequestThread.cpp \
+        RequestThread_1_3.cpp \
         Utils.cpp
 
 LOCAL_STATIC_LIBRARIES := \
diff --git a/ArmnnPreparedModel_1_3.cpp b/ArmnnPreparedModel_1_3.cpp
index 1fb03f4..c7adc6c 100644
--- a/ArmnnPreparedModel_1_3.cpp
+++ b/ArmnnPreparedModel_1_3.cpp
@@ -138,7 +138,7 @@
 {
 
 template<typename HalVersion>
-RequestThread<ArmnnPreparedModel_1_3, HalVersion, CallbackContext_1_3>
+RequestThread_1_3<ArmnnPreparedModel_1_3, HalVersion, CallbackContext_1_3>
         ArmnnPreparedModel_1_3<HalVersion>::m_RequestThread;
 
 template<typename HalVersion>
@@ -164,13 +164,15 @@
                                                            armnn::IRuntime* runtime,
                                                            const V1_3::Model& model,
                                                            const std::string& requestInputsAndOutputsDumpDir,
-                                                           const bool gpuProfilingEnabled)
+                                                           const bool gpuProfilingEnabled,
+                                                           V1_3::Priority priority)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
     , m_RequestCount(0)
     , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
+    , m_ModelPriority(priority)
 {
     // Enable profiling if required.
     m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);
@@ -830,6 +832,12 @@
     return V1_3::ErrorStatus::NONE;
 }
 
+template<typename HalVersion>
+V1_3::Priority ArmnnPreparedModel_1_3<HalVersion>::GetModelPriority()
+{
+    return m_ModelPriority;
+}
+
 #ifdef ARMNN_ANDROID_NN_V1_3
 template class ArmnnPreparedModel_1_3<hal_1_3::HalPolicy>;
 template Return <V1_3::ErrorStatus> ArmnnPreparedModel_1_3<hal_1_3::HalPolicy>::ExecuteGraph<CallbackContext_1_3>(
diff --git a/ArmnnPreparedModel_1_3.hpp b/ArmnnPreparedModel_1_3.hpp
index 4dd15c1..5010bbd 100644
--- a/ArmnnPreparedModel_1_3.hpp
+++ b/ArmnnPreparedModel_1_3.hpp
@@ -7,7 +7,7 @@
 
 #include "ArmnnDriver.hpp"
 #include "ArmnnDriverImpl.hpp"
-#include "RequestThread.hpp"
+#include "RequestThread_1_3.hpp"
 #include "ModelToINetworkConverter.hpp"
 
 #include <NeuralNetworks.h>
@@ -50,7 +50,8 @@
                            armnn::IRuntime* runtime,
                            const HalModel& model,
                            const std::string& requestInputsAndOutputsDumpDir,
-                           const bool gpuProfilingEnabled);
+                           const bool gpuProfilingEnabled,
+                           V1_3::Priority priority = V1_3::Priority::MEDIUM);
 
     virtual ~ArmnnPreparedModel_1_3();
 
@@ -105,6 +106,8 @@
     /// \return false on failure, otherwise true
     bool ExecuteWithDummyInputs();
 
+    V1_3::Priority GetModelPriority();
+
 private:
     Return <V1_3::ErrorStatus> Execute(const V1_3::Request& request,
                                        MeasureTiming measureTiming,
@@ -135,10 +138,11 @@
     V1_3::Model                                                                 m_Model;
     // There must be a single RequestThread for all ArmnnPreparedModel objects to ensure serial execution of workloads
     // It is specific to this class, so it is declared as static here
-    static RequestThread<ArmnnPreparedModel_1_3, HalVersion, CallbackContext_1_3> m_RequestThread;
+    static RequestThread_1_3<ArmnnPreparedModel_1_3, HalVersion, CallbackContext_1_3> m_RequestThread;
     uint32_t                                                                    m_RequestCount;
     const std::string&                                                          m_RequestInputsAndOutputsDumpDir;
     const bool                                                                  m_GpuProfilingEnabled;
+    V1_3::Priority                                                              m_ModelPriority;
 };
 
 }
diff --git a/RequestThread.cpp b/RequestThread.cpp
index a177b1a..927af92 100644
--- a/RequestThread.cpp
+++ b/RequestThread.cpp
@@ -161,7 +161,6 @@
 template class RequestThread<ArmnnPreparedModel, hal_1_2::HalPolicy, CallbackContext_1_0>;
 template class RequestThread<ArmnnPreparedModel, hal_1_3::HalPolicy, CallbackContext_1_0>;
 template class RequestThread<ArmnnPreparedModel_1_2, hal_1_2::HalPolicy, CallbackContext_1_2>;
-template class RequestThread<ArmnnPreparedModel_1_3, hal_1_3::HalPolicy, CallbackContext_1_3>;
 #endif
 
 } // namespace armnn_driver
diff --git a/RequestThread_1_3.cpp b/RequestThread_1_3.cpp
new file mode 100644
index 0000000..59fa70e
--- /dev/null
+++ b/RequestThread_1_3.cpp
@@ -0,0 +1,193 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#define LOG_TAG "ArmnnDriver"
+
+#include "RequestThread_1_3.hpp"
+
+#include "ArmnnPreparedModel_1_3.hpp"
+
+#include <armnn/utility/Assert.hpp>
+
+#include <log/log.h>
+
+using namespace android;
+
+namespace armnn_driver
+{
+
+template <template <typename HalVersion> class PreparedModel, typename HalVersion, typename CallbackContext>
+RequestThread_1_3<PreparedModel, HalVersion, CallbackContext>::RequestThread_1_3()
+{
+    ALOGV("RequestThread_1_3::RequestThread_1_3()");
+    m_Thread = std::make_unique<std::thread>(&RequestThread_1_3::Process, this);
+}
+
+template <template <typename HalVersion> class PreparedModel, typename HalVersion, typename CallbackContext>
+RequestThread_1_3<PreparedModel, HalVersion, CallbackContext>::~RequestThread_1_3()
+{
+    ALOGV("RequestThread_1_3::~RequestThread_1_3()");
+
+    try
+    {
+        // Coverity fix: The following code may throw an exception of type std::length_error.
+
+        // This code is meant to terminate the inner thread gracefully by posting an EXIT message
+        // to the thread's message queue. However, according to Coverity, this code could throw an exception and fail.
+        // Since only one static instance of RequestThread_1_3 is used in the driver (in ArmnnPreparedModel_1_3),
+        // this destructor is called only when the application has been closed, which means that
+        // the inner thread will be terminated anyway, albeit abruptly, if the destructor code throws.
+        // Wrapping the destructor's code in a try-catch block simply resolves the Coverity finding.
+
+        // Post an EXIT message to the thread
+        std::shared_ptr<AsyncExecuteData> nulldata(nullptr);
+        auto pMsg = std::make_shared<ThreadMsg>(ThreadMsgType::EXIT, nulldata);
+        PostMsg(pMsg);
+        // Wait for the thread to terminate, it is deleted automatically
+        m_Thread->join();
+    }
+    catch (const std::exception&) { } // Swallow any exception.
+}
+
+template <template <typename HalVersion> class PreparedModel, typename HalVersion, typename CallbackContext>
+void RequestThread_1_3<PreparedModel, HalVersion, CallbackContext>::PostMsg(PreparedModel<HalVersion>* model,
+        std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& memPools,
+        std::shared_ptr<armnn::InputTensors>& inputTensors,
+        std::shared_ptr<armnn::OutputTensors>& outputTensors,
+        CallbackContext callbackContext)
+{
+    ALOGV("RequestThread_1_3::PostMsg(...)");
+    auto data = std::make_shared<AsyncExecuteData>(model,
+                                                   memPools,
+                                                   inputTensors,
+                                                   outputTensors,
+                                                   callbackContext);
+    auto pMsg = std::make_shared<ThreadMsg>(ThreadMsgType::REQUEST, data);
+    PostMsg(pMsg, model->GetModelPriority());
+}
+
+template <template <typename HalVersion> class PreparedModel, typename HalVersion, typename CallbackContext>
+void RequestThread_1_3<PreparedModel, HalVersion, CallbackContext>::PostMsg(std::shared_ptr<ThreadMsg>& pMsg,
+                                                                            V1_3::Priority priority)
+{
+    ALOGV("RequestThread_1_3::PostMsg(pMsg)");
+    // Add a message to the queue and notify the request thread
+    std::unique_lock<std::mutex> lock(m_Mutex);
+    switch (priority)
+    {
+        case V1_3::Priority::HIGH:
+            m_HighPriorityQueue.push(pMsg);
+            break;
+        case V1_3::Priority::LOW:
+            m_LowPriorityQueue.push(pMsg);
+            break;
+        case V1_3::Priority::MEDIUM:
+        default:
+            m_MediumPriorityQueue.push(pMsg);
+    }
+    m_Cv.notify_one();
+}
+
+template <template <typename HalVersion> class PreparedModel, typename HalVersion, typename CallbackContext>
+void RequestThread_1_3<PreparedModel, HalVersion, CallbackContext>::Process()
+{
+    ALOGV("RequestThread_1_3::Process()");
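+    // The retire rate (RETIRE_RATE in Android.mk) caps how many messages are
+    // taken from a higher priority queue before a lower priority queue gets a
+    // turn, so lower priority requests cannot be starved indefinitely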
+    int retireRate = RETIRE_RATE;
+    int highPriorityCount = 0;
+    int mediumPriorityCount = 0;
+    while (true)
+    {
+        std::shared_ptr<ThreadMsg> pMsg(nullptr);
+        {
+            // Wait for a message to be added to the queue
+            // This is in a separate scope to minimise the lifetime of the lock
+            std::unique_lock<std::mutex> lock(m_Mutex);
+            while (m_HighPriorityQueue.empty() && m_MediumPriorityQueue.empty() && m_LowPriorityQueue.empty())
+            {
+                m_Cv.wait(lock);
+            }
+            // Pop the next message from the highest priority non-empty queue
+            // Service high priority first, while its consecutive count stays below the retire rate
+            if (!m_HighPriorityQueue.empty() && highPriorityCount < retireRate)
+            {
+                pMsg = m_HighPriorityQueue.front();
+                m_HighPriorityQueue.pop();
+                highPriorityCount += 1;
+            }
+            // If the high priority queue is empty or its count has reached the retire rate, take a medium priority message
+            else if (!m_MediumPriorityQueue.empty() && mediumPriorityCount < retireRate)
+            {
+                pMsg = m_MediumPriorityQueue.front();
+                m_MediumPriorityQueue.pop();
+                mediumPriorityCount += 1;
+                // Reset high priority count
+                highPriorityCount = 0;
+            }
+            // If the medium priority queue is empty or its count has reached the retire rate, take a low priority message
+            else if (!m_LowPriorityQueue.empty())
+            {
+                pMsg = m_LowPriorityQueue.front();
+                m_LowPriorityQueue.pop();
+                // Reset high and medium priority count
+                highPriorityCount = 0;
+                mediumPriorityCount = 0;
+            }
+            else
+            {
+                // Every non-empty queue is blocked by the retire rate and the
+                // low priority queue is empty: reset the counts and re-evaluate
+                highPriorityCount = 0;
+                mediumPriorityCount = 0;
+                continue;
+            }
+        }
+
+        switch (pMsg->type)
+        {
+            case ThreadMsgType::REQUEST:
+            {
+                ALOGV("RequestThread_1_3::Process() - request");
+                // invoke the asynchronous execution method
+                PreparedModel<HalVersion>* model = pMsg->data->m_Model;
+                model->ExecuteGraph(pMsg->data->m_MemPools,
+                                    *(pMsg->data->m_InputTensors),
+                                    *(pMsg->data->m_OutputTensors),
+                                    pMsg->data->m_CallbackContext);
+                break;
+            }
+
+            case ThreadMsgType::EXIT:
+            {
+                ALOGV("RequestThread_1_3::Process() - exit");
+                // delete all remaining messages (there should not be any)
+                std::unique_lock<std::mutex> lock(m_Mutex);
+                while (!m_HighPriorityQueue.empty())
+                {
+                    m_HighPriorityQueue.pop();
+                }
+                while (!m_MediumPriorityQueue.empty())
+                {
+                    m_MediumPriorityQueue.pop();
+                }
+                while (!m_LowPriorityQueue.empty())
+                {
+                    m_LowPriorityQueue.pop();
+                }
+                return;
+            }
+
+            default:
+                // this should be unreachable
+                ALOGE("RequestThread_1_3::Process() - invalid message type");
+                ARMNN_ASSERT_MSG(false, "ArmNN: RequestThread_1_3: invalid message type");
+        }
+    }
+}
+
+///
+/// Class template specializations
+///
+
+template class RequestThread_1_3<ArmnnPreparedModel_1_3, hal_1_3::HalPolicy, CallbackContext_1_3>;
+
+} // namespace armnn_driver
diff --git a/RequestThread_1_3.hpp b/RequestThread_1_3.hpp
new file mode 100644
index 0000000..c8abc5e
--- /dev/null
+++ b/RequestThread_1_3.hpp
@@ -0,0 +1,106 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <chrono>
+#include <queue>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+
+#include "ArmnnDriver.hpp"
+#include "ArmnnDriverImpl.hpp"
+
+#include <CpuExecutor.h>
+#include <armnn/ArmNN.hpp>
+
+namespace armnn_driver
+{
+using TimePoint = std::chrono::steady_clock::time_point;
+
+template<template <typename HalVersion> class PreparedModel, typename HalVersion, typename CallbackContext>
+class RequestThread_1_3
+{
+public:
+    /// Constructor creates the thread
+    RequestThread_1_3();
+
+    /// Destructor terminates the thread
+    ~RequestThread_1_3();
+
+    /// Add a message to the queue that matches the model's priority.
+    /// @param[in] model pointer to the prepared model handling the request
+    /// @param[in] memPools pointer to the memory pools vector for the tensors
+    /// @param[in] inputTensors pointer to the input tensors for the request
+    /// @param[in] outputTensors pointer to the output tensors for the request
+    /// @param[in] callbackContext the context holding the android notification callback
+    void PostMsg(PreparedModel<HalVersion>* model,
+                 std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& memPools,
+                 std::shared_ptr<armnn::InputTensors>& inputTensors,
+                 std::shared_ptr<armnn::OutputTensors>& outputTensors,
+                 CallbackContext callbackContext);
+
+private:
+    RequestThread_1_3(const RequestThread_1_3&) = delete;
+    RequestThread_1_3& operator=(const RequestThread_1_3&) = delete;
+
+    /// Storage for a prepared model and the arguments of its asynchronous ExecuteGraph call
+    struct AsyncExecuteData
+    {
+        AsyncExecuteData(PreparedModel<HalVersion>* model,
+                         std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& memPools,
+                         std::shared_ptr<armnn::InputTensors>& inputTensors,
+                         std::shared_ptr<armnn::OutputTensors>& outputTensors,
+                         CallbackContext callbackContext)
+            : m_Model(model)
+            , m_MemPools(memPools)
+            , m_InputTensors(inputTensors)
+            , m_OutputTensors(outputTensors)
+            , m_CallbackContext(callbackContext)
+        {
+        }
+
+        PreparedModel<HalVersion>* m_Model;
+        std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>> m_MemPools;
+        std::shared_ptr<armnn::InputTensors> m_InputTensors;
+        std::shared_ptr<armnn::OutputTensors> m_OutputTensors;
+        CallbackContext m_CallbackContext;
+    };
+    enum class ThreadMsgType
+    {
+        EXIT,                   // exit the thread
+        REQUEST                 // user request to process
+    };
+
+    /// Storage for the thread message type and data
+    struct ThreadMsg
+    {
+        ThreadMsg(ThreadMsgType msgType,
+                  std::shared_ptr<AsyncExecuteData>& msgData)
+            : type(msgType)
+            , data(msgData)
+        {
+        }
+
+        ThreadMsgType type;
+        std::shared_ptr<AsyncExecuteData> data;
+    };
+
+    /// Add a prepared thread message to the queue matching the given priority.
+    /// @param[in] pThreadMsg the message to add to the queue
+    /// @param[in] priority the priority of the queue the message is posted to
+    void PostMsg(std::shared_ptr<ThreadMsg>& pThreadMsg, V1_3::Priority priority = V1_3::Priority::MEDIUM);
+
+    /// Entry point for the request thread
+    void Process();
+
+    std::unique_ptr<std::thread> m_Thread;
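+    // One queue per request priority; all three are guarded by m_Mutex and the
+    // worker thread is woken via m_Cv whenever a message is posted to any queue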
+    std::queue<std::shared_ptr<ThreadMsg>> m_HighPriorityQueue;
+    std::queue<std::shared_ptr<ThreadMsg>> m_MediumPriorityQueue;
+    std::queue<std::shared_ptr<ThreadMsg>> m_LowPriorityQueue;
+    std::mutex m_Mutex;
+    std::condition_variable m_Cv;
+};
+
+} // namespace armnn_driver