IVGCVSW-3463 Fix Hal 1.2 Dynamic Output Shape VTS test failures

 * Updated ArmnnPreparedModel_1_2 to report output shapes and timing to execution callbacks.
 * Templated RequestThread on the callback payload type so HAL 1.0 and HAL 1.2 callbacks can be posted to the same worker thread.
 * Added ArmnnBurstExecutorWithCache and wired it into configureExecutionBurst.

Change-Id: I06c4ecaf1e2c36ef77a0731ece4885fc3997cd3b
Signed-off-by: Sadik Armagan <sadik.armagan@arm.com>
Signed-off-by: Mike Kelly <mike.kelly@arm.com>
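
Note: the recurring pattern in this patch converts an armnn::TensorInfo into
the V1_2::OutputShape that notify_1_2 and the executeSynchronously callback
expect. A minimal sketch of that conversion, using the same armnn and HIDL
calls as the patch (ComputeShape is a hypothetical helper name, not part of
the patch itself):

    // Hypothetical helper: build a V1_2::OutputShape from an armnn::TensorInfo.
    OutputShape ComputeShape(const armnn::TensorInfo& info, size_t bufferSize)
    {
        const armnn::TensorShape tensorShape = info.GetShape();
        const unsigned int numDims = tensorShape.GetNumDimensions();

        hidl_vec<uint32_t> dimensions;
        dimensions.resize(numDims);
        for (unsigned int d = 0u; d < numDims; ++d)
        {
            dimensions[d] = tensorShape[d];
        }

        OutputShape outputShape;
        outputShape.dimensions = dimensions;
        // The dynamic output shape VTS tests inspect isSufficient to decide
        // whether the caller must re-allocate its output buffer and retry.
        outputShape.isSufficient = bufferSize >= info.GetNumBytes();
        return outputShape;
    }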
diff --git a/ArmnnPreparedModel.cpp b/ArmnnPreparedModel.cpp
index 3256836..462970a 100644
--- a/ArmnnPreparedModel.cpp
+++ b/ArmnnPreparedModel.cpp
@@ -87,9 +87,8 @@
 
 namespace armnn_driver
 {
-
 template<typename HalVersion>
-RequestThread<ArmnnPreparedModel, HalVersion> ArmnnPreparedModel<HalVersion>::m_RequestThread;
+RequestThread<ArmnnPreparedModel, HalVersion, ArmnnCallback_1_0> ArmnnPreparedModel<HalVersion>::m_RequestThread;
 
 template<typename HalVersion>
 template <typename TensorBindingCollection>
@@ -218,10 +217,17 @@
     }
 
     ALOGV("ArmnnPreparedModel::execute(...) before PostMsg");
-    // post the request for asynchronous execution
-    m_RequestThread.PostMsg(this, pMemPools, pInputTensors, pOutputTensors, callback);
-    ALOGV("ArmnnPreparedModel::execute(...) after PostMsg");
 
+    auto cb = [callback](ErrorStatus errorStatus, std::string callingFunction)
+    {
+        NotifyCallbackAndCheck(callback, errorStatus, callingFunction);
+    };
+
+    ArmnnCallback_1_0 armnnCb;
+    armnnCb.callback = cb;
+    // post the request for asynchronous execution
+    m_RequestThread.PostMsg(this, pMemPools, pInputTensors, pOutputTensors, armnnCb);
+    ALOGV("ArmnnPreparedModel::execute(...) after PostMsg");
     return ErrorStatus::NONE; // successfully queued
 }
 
@@ -230,7 +236,7 @@
         std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
         std::shared_ptr<armnn::InputTensors>& pInputTensors,
         std::shared_ptr<armnn::OutputTensors>& pOutputTensors,
-        const ::android::sp<V1_0::IExecutionCallback>& callback)
+        ArmnnCallback_1_0 cb)
 {
     ALOGV("ArmnnPreparedModel::ExecuteGraph(...)");
 
@@ -243,14 +249,14 @@
         if (status != armnn::Status::Success)
         {
             ALOGW("EnqueueWorkload failed");
-            NotifyCallbackAndCheck(callback, ErrorStatus::GENERAL_FAILURE, "ArmnnPreparedModel::ExecuteGraph");
+            cb.callback(ErrorStatus::GENERAL_FAILURE, "ArmnnPreparedModel::ExecuteGraph");
             return;
         }
     }
     catch (armnn::Exception& e)
     {
         ALOGW("armnn::Exception caught from EnqueueWorkload: %s", e.what());
-        NotifyCallbackAndCheck(callback, ErrorStatus::GENERAL_FAILURE, "ArmnnPreparedModel::ExecuteGraph");
+        cb.callback(ErrorStatus::GENERAL_FAILURE, "ArmnnPreparedModel::ExecuteGraph");
         return;
     }
 
@@ -264,7 +270,7 @@
         pool.update();
     }
 
-    NotifyCallbackAndCheck(callback, ErrorStatus::NONE, "ExecuteGraph");
+    cb.callback(ErrorStatus::NONE, "ExecuteGraph");
 }
 
 template<typename HalVersion>
diff --git a/ArmnnPreparedModel.hpp b/ArmnnPreparedModel.hpp
index 275af31..33be972 100644
--- a/ArmnnPreparedModel.hpp
+++ b/ArmnnPreparedModel.hpp
@@ -17,6 +17,12 @@
 
 namespace armnn_driver
 {
+using armnnExecuteCallback_1_0 = std::function<void(V1_0::ErrorStatus status, std::string callingFunction)>;
+
+struct ArmnnCallback_1_0
+{
+    armnnExecuteCallback_1_0 callback;
+};
 
 template <typename HalVersion>
 class ArmnnPreparedModel : public V1_0::IPreparedModel
@@ -39,7 +45,7 @@
     void ExecuteGraph(std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
                       std::shared_ptr<armnn::InputTensors>& pInputTensors,
                       std::shared_ptr<armnn::OutputTensors>& pOutputTensors,
-                      const ::android::sp<V1_0::IExecutionCallback>& callback);
+                      ArmnnCallback_1_0 callback);
 
     /// Executes this model with dummy inputs (e.g. all zeroes).
     /// \return false on failure, otherwise true
@@ -49,15 +55,15 @@
     template <typename TensorBindingCollection>
     void DumpTensorsIfRequired(char const* tensorNamePrefix, const TensorBindingCollection& tensorBindings);
 
-    armnn::NetworkId                                     m_NetworkId;
-    armnn::IRuntime*                                     m_Runtime;
-    HalModel                                             m_Model;
+    armnn::NetworkId                                                        m_NetworkId;
+    armnn::IRuntime*                                                        m_Runtime;
+    HalModel                                                                m_Model;
     // There must be a single RequestThread for all ArmnnPreparedModel objects to ensure serial execution of workloads
     // It is specific to this class, so it is declared as static here
-    static RequestThread<ArmnnPreparedModel, HalVersion> m_RequestThread;
-    uint32_t                                             m_RequestCount;
-    const std::string&                                   m_RequestInputsAndOutputsDumpDir;
-    const bool                                           m_GpuProfilingEnabled;
+    static RequestThread<ArmnnPreparedModel, HalVersion, ArmnnCallback_1_0> m_RequestThread;
+    uint32_t                                                                m_RequestCount;
+    const std::string&                                                      m_RequestInputsAndOutputsDumpDir;
+    const bool                                                              m_GpuProfilingEnabled;
 };
 
 }
diff --git a/ArmnnPreparedModel_1_2.cpp b/ArmnnPreparedModel_1_2.cpp
index f6b4462..a7997c7 100644
--- a/ArmnnPreparedModel_1_2.cpp
+++ b/ArmnnPreparedModel_1_2.cpp
@@ -37,7 +37,10 @@
                                       endPoint - startPoint).count());
 }
 
-void NotifyCallbackAndCheck(const ::android::sp<V1_0::IExecutionCallback>& callback, ErrorStatus errorStatus,
+void NotifyCallbackAndCheck(const ::android::sp<V1_0::IExecutionCallback>& callback,
+                            ErrorStatus errorStatus,
+                            std::vector<OutputShape> /*outputShapes*/,
+                            const Timing /*timing*/,
                             std::string callingFunction)
 {
     Return<void> returned = callback->notify(errorStatus);
@@ -49,10 +52,13 @@
     }
 }
 
-void NotifyCallbackAndCheck(const ::android::sp<V1_2::IExecutionCallback>& callback, ErrorStatus errorStatus,
+void NotifyCallbackAndCheck(const ::android::sp<V1_2::IExecutionCallback>& callback,
+                            ErrorStatus errorStatus,
+                            std::vector<OutputShape> outputShapes,
+                            const Timing timing,
                             std::string callingFunction)
 {
-    Return<void> returned = callback->notify(errorStatus);
+    Return<void> returned = callback->notify_1_2(errorStatus, outputShapes, timing);
     // This check is required; if the callback fails and it isn't checked, it will bring down the service
     if (!returned.isOk())
     {
@@ -111,7 +117,8 @@
 {
 
 template<typename HalVersion>
-RequestThread<ArmnnPreparedModel_1_2, HalVersion> ArmnnPreparedModel_1_2<HalVersion>::m_RequestThread;
+RequestThread<ArmnnPreparedModel_1_2, HalVersion, ArmnnCallback_1_2>
+        ArmnnPreparedModel_1_2<HalVersion>::m_RequestThread;
 
 template<typename HalVersion>
 template<typename TensorBindingCollection>
@@ -165,15 +172,43 @@
 Return <ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::execute(const Request& request,
         const ::android::sp<V1_0::IExecutionCallback>& callback)
 {
-    return Execute<V1_0::IExecutionCallback>(request, callback);
+    if (callback.get() == nullptr)
+    {
+        ALOGE("ArmnnPreparedModel_1_2::execute invalid callback passed");
+        return ErrorStatus::INVALID_ARGUMENT;
+    }
+
+    auto cb = [callback](ErrorStatus errorStatus,
+                         std::vector<OutputShape> outputShapes,
+                         const Timing& timing,
+                         std::string callingFunction)
+    {
+        NotifyCallbackAndCheck(callback, errorStatus, outputShapes, timing, callingFunction);
+    };
+
+    return Execute(request, MeasureTiming::NO, cb);
 }
 
 template<typename HalVersion>
 Return <ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::execute_1_2(const Request& request,
-                                                                     MeasureTiming,
+                                                                     MeasureTiming measureTiming,
                                                                      const sp<V1_2::IExecutionCallback>& callback)
 {
-    return Execute<V1_2::IExecutionCallback>(request, callback);
+    if (callback.get() == nullptr)
+    {
+        ALOGE("ArmnnPreparedModel_1_2::execute_1_2 invalid callback passed");
+        return ErrorStatus::INVALID_ARGUMENT;
+    }
+
+    auto cb = [callback](ErrorStatus errorStatus,
+                         std::vector<OutputShape> outputShapes,
+                         const Timing& timing,
+                         std::string callingFunction)
+    {
+        NotifyCallbackAndCheck(callback, errorStatus, outputShapes, timing, callingFunction);
+    };
+
+    return Execute(request, measureTiming, cb);
 }
 
 template<typename HalVersion>
@@ -217,8 +252,8 @@
         cb(ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming);
         return Void();
     }
+    std::vector<OutputShape> outputShapes(request.outputs.size());
 
-    // add the inputs and outputs with their data
     try
     {
         pInputTensors->reserve(request.inputs.size());
@@ -238,8 +273,8 @@
 
             pInputTensors->emplace_back(i, inputTensor);
         }
-
         pOutputTensors->reserve(request.outputs.size());
+
         for (unsigned int i = 0; i < request.outputs.size(); i++)
         {
             const auto& outputArg = request.outputs[i];
@@ -253,6 +288,28 @@
                 cb(ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming);
                 return Void();
             }
+            const size_t outputSize = outputTensorInfo.GetNumBytes();
+            const size_t bufferSize = pMemPools->at(outputArg.location.poolIndex).getHidlMemory().size();
+
+            hidl_vec<uint32_t> dimensions;
+
+            armnn::TensorShape tensorShape = outputTensorInfo.GetShape();
+            const unsigned int numDims = tensorShape.GetNumDimensions();
+            dimensions.resize(numDims);
+
+            for (unsigned int outputIdx = 0u; outputIdx < numDims; ++outputIdx)
+            {
+                dimensions[outputIdx] = tensorShape[outputIdx];
+            }
+            outputShapes[i].dimensions = dimensions;
+            outputShapes[i].isSufficient = bufferSize >= outputSize;
+
+            if (bufferSize < outputSize)
+            {
+                ALOGW("ArmnnPreparedModel_1_2::executeSynchronously failed: output %u buffer is too small", i);
+                cb(ErrorStatus::OUTPUT_INSUFFICIENT_SIZE, outputShapes, g_NoTiming);
+                return Void();
+            }
 
             pOutputTensors->emplace_back(i, outputTensor);
         }
@@ -314,15 +371,82 @@
         timing.timeInDriver = MicrosecondsDuration(driverEnd, driverStart);
         ALOGV("ArmnnPreparedModel_1_2::executeSynchronously timing Device = %lu Driver = %lu", timing.timeOnDevice,
                 timing.timeInDriver);
-        cb(ErrorStatus::NONE, {}, timing);
+        cb(ErrorStatus::NONE, outputShapes, timing);
     }
     else
     {
-        cb(ErrorStatus::NONE, {}, g_NoTiming);
+        cb(ErrorStatus::NONE, outputShapes, g_NoTiming);
     }
     return Void();
 }
 
+class ArmnnBurstExecutorWithCache : public ExecutionBurstServer::IBurstExecutorWithCache
+{
+public:
+    explicit ArmnnBurstExecutorWithCache(IPreparedModel* preparedModel)
+        : m_PreparedModel(preparedModel)
+    {}
+
+    bool isCacheEntryPresent(int32_t slot) const override
+    {
+        const auto it = m_MemoryCache.find(slot);
+        return (it != m_MemoryCache.end()) && it->second.valid();
+    }
+
+    void addCacheEntry(const hidl_memory& memory, int32_t slot) override
+    {
+        m_MemoryCache[slot] = memory;
+    }
+
+    void removeCacheEntry(int32_t slot) override
+    {
+        m_MemoryCache.erase(slot);
+    }
+
+    std::tuple<ErrorStatus, hidl_vec<OutputShape>, Timing> execute(
+            const Request& request, const std::vector<int32_t>& slots,
+            MeasureTiming measure) override
+    {
+        ALOGV("ArmnnPreparedModel_1_2::BurstExecutorWithCache::execute");
+        hidl_vec<hidl_memory> pools(slots.size());
+
+        std::transform(slots.begin(), slots.end(), pools.begin(), [this](int32_t slot)
+        {
+            return m_MemoryCache[slot];
+        });
+
+        Request fullRequest = request;
+        fullRequest.pools = std::move(pools);
+
+        // Setup Callback
+        ErrorStatus returnedStatus = ErrorStatus::GENERAL_FAILURE;
+        hidl_vec<OutputShape> returnedOutputShapes;
+        Timing returnedTiming;
+        auto cb = [&returnedStatus, &returnedOutputShapes, &returnedTiming](ErrorStatus status,
+                                                                            const hidl_vec<OutputShape>& outputShapes,
+                                                                            const Timing& timing)
+        {
+            returnedStatus = status;
+            returnedOutputShapes = outputShapes;
+            returnedTiming = timing;
+        };
+
+        // Execute
+        ALOGV("ArmnnPreparedModel_1_2::BurstExecutorWithCache executing");
+        const Return<void> ret = m_PreparedModel->executeSynchronously(fullRequest, measure, cb);
+
+        if (!ret.isOk() || returnedStatus != ErrorStatus::NONE)
+        {
+            ALOGE("ArmnnPreparedModel_1_2::BurstExecutorWithCache::error executing");
+        }
+        return std::make_tuple(returnedStatus, std::move(returnedOutputShapes), returnedTiming);
+    }
+
+private:
+    IPreparedModel* const m_PreparedModel;
+    std::map<int32_t, hidl_memory> m_MemoryCache;
+};
+
 template<typename HalVersion>
 Return<void> ArmnnPreparedModel_1_2<HalVersion>::configureExecutionBurst(
         const sp<V1_2::IBurstCallback>& callback,
@@ -331,7 +455,12 @@
         V1_2::IPreparedModel::configureExecutionBurst_cb cb)
 {
     ALOGV("ArmnnPreparedModel_1_2::configureExecutionBurst");
-    const sp<V1_2::IBurstContext> burst = ExecutionBurstServer::create(callback, requestChannel, resultChannel, this);
+    const std::shared_ptr<ArmnnBurstExecutorWithCache> executorWithCache =
+            std::make_shared<ArmnnBurstExecutorWithCache>(this);
+    const sp<V1_2::IBurstContext> burst = ExecutionBurstServer::create(callback,
+                                                                       requestChannel,
+                                                                       resultChannel,
+                                                                       executorWithCache);
 
     if (burst == nullptr)
     {
@@ -349,27 +478,64 @@
         std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
         std::shared_ptr<armnn::InputTensors>& pInputTensors,
         std::shared_ptr<armnn::OutputTensors>& pOutputTensors,
-        const ::android::sp<V1_0::IExecutionCallback>& callback)
+        ArmnnCallback_1_2 cb)
 {
     ALOGV("ArmnnPreparedModel_1_2::ExecuteGraph(...)");
 
+    TimePoint driverEnd, deviceStart, deviceEnd;
+
     DumpTensorsIfRequired("Input", *pInputTensors);
 
+    const std::vector<std::pair<int, armnn::Tensor>>& outputTensors = *pOutputTensors;
+    std::vector<OutputShape> outputShapes(outputTensors.size());
+
+    for (unsigned int i = 0; i < outputTensors.size(); i++)
+    {
+        const armnn::Tensor& outputTensor = outputTensors[i].second;
+        const armnn::TensorInfo& outputTensorInfo = outputTensor.GetInfo();
+
+        hidl_vec<uint32_t> dimensions;
+
+        armnn::TensorShape tensorShape = outputTensorInfo.GetShape();
+        const unsigned int numDims = tensorShape.GetNumDimensions();
+        dimensions.resize(numDims);
+
+        for (unsigned int outputIdx = 0u; outputIdx < numDims; ++outputIdx)
+        {
+            dimensions[outputIdx] = tensorShape[outputIdx];
+        }
+        outputShapes[i].dimensions = dimensions;
+        outputShapes[i].isSufficient = true;
+    }
+
     // run it
     try
     {
+        if (cb.measureTiming == MeasureTiming::YES)
+        {
+            deviceStart = Now();
+        }
+
         armnn::Status status = m_Runtime->EnqueueWorkload(m_NetworkId, *pInputTensors, *pOutputTensors);
+
+        if (cb.measureTiming == MeasureTiming::YES)
+        {
+            deviceEnd = Now();
+        }
         if (status != armnn::Status::Success)
         {
             ALOGW("EnqueueWorkload failed");
-            NotifyCallbackAndCheck(callback, ErrorStatus::GENERAL_FAILURE, "ArmnnPreparedModel_1_2::ExecuteGraph");
+            cb.callback(ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming,
+                    "ArmnnPreparedModel_1_2::ExecuteGraph");
             return;
         }
     }
     catch (armnn::Exception& e)
     {
         ALOGW("armnn::Exception caught from EnqueueWorkload: %s", e.what());
-        NotifyCallbackAndCheck(callback, ErrorStatus::GENERAL_FAILURE, "ArmnnPreparedModel_1_2::ExecuteGraph");
+        cb.callback(ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming,
+                "ArmnnPreparedModel_1_2::ExecuteGraph");
         return;
     }
 
@@ -383,7 +549,16 @@
         pool.update();
     }
 
-    NotifyCallbackAndCheck(callback, ErrorStatus::NONE, "ExecuteGraph");
+    if (cb.measureTiming == MeasureTiming::YES)
+    {
+        driverEnd = Now();
+        Timing timing;
+        timing.timeOnDevice = MicrosecondsDuration(deviceEnd, deviceStart);
+        timing.timeInDriver = MicrosecondsDuration(driverEnd, cb.driverStart);
+        cb.callback(ErrorStatus::NONE, outputShapes, timing, "ExecuteGraph");
+    }
+    else
+    {
+        cb.callback(ErrorStatus::NONE, outputShapes, g_NoTiming, "ExecuteGraph");
+    }
 }
 
 template<typename HalVersion>
@@ -428,28 +603,29 @@
 }
 
 template<typename HalVersion>
-template<typename ExecutionCallback>
 Return <ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::Execute(const Request& request,
-                                                                 const sp<ExecutionCallback>& callback)
+                                                                 MeasureTiming measureTiming,
+                                                                 armnnExecuteCallback_1_2 callback)
 {
+    TimePoint driverStart;
+
+    if (measureTiming == MeasureTiming::YES)
+    {
+        driverStart = Now();
+    }
+
     ALOGV("ArmnnPreparedModel_1_2::execute(): %s", GetModelSummary(m_Model).c_str());
     m_RequestCount++;
 
-    if (callback.get() == nullptr)
-    {
-        ALOGE("ArmnnPreparedModel_1_2::execute invalid callback passed");
-        return ErrorStatus::INVALID_ARGUMENT;
-    }
-
     if (!android::nn::validateRequest(request, m_Model))
     {
-        NotifyCallbackAndCheck(callback, ErrorStatus::INVALID_ARGUMENT, "ArmnnPreparedModel_1_2::execute");
+        callback(ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
         return ErrorStatus::INVALID_ARGUMENT;
     }
 
     if (!m_RequestInputsAndOutputsDumpDir.empty())
     {
-        ALOGD("Dumping inputs and outputs for request %" PRIuPTR, reinterpret_cast<std::uintptr_t>(callback.get()));
+        ALOGD("Dumping inputs and outputs for request %" PRIuPTR, reinterpret_cast<std::uintptr_t>(&callback));
     }
 
     // allocate the tensors on the heap, as they are passed to the request thread
@@ -462,7 +638,7 @@
 
     if (!setRunTimePoolInfosFromHidlMemories(pMemPools.get(), request.pools))
     {
-        NotifyCallbackAndCheck(callback, ErrorStatus::GENERAL_FAILURE, "ArmnnPreparedModel_1_2::execute");
+        callback(ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
         return ErrorStatus::GENERAL_FAILURE;
     }
 
@@ -480,8 +656,7 @@
             if (inputTensor.GetMemoryArea() == nullptr)
             {
                 ALOGE("Cannot execute request. Error converting request input %u to tensor", i);
-                NotifyCallbackAndCheck(callback, ErrorStatus::GENERAL_FAILURE,
-                                       "ArmnnPreparedModel_1_2::execute");
+                callback(ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
                 return ErrorStatus::GENERAL_FAILURE;
             }
 
@@ -489,6 +664,8 @@
         }
 
         pOutputTensors->reserve(request.outputs.size());
+        std::vector<OutputShape> outputShapes(request.outputs.size());
+
         for (unsigned int i = 0; i < request.outputs.size(); i++)
         {
             const auto& outputArg = request.outputs[i];
@@ -496,33 +673,58 @@
             const armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
             const armnn::Tensor outputTensor = GetTensorForRequestArgument(outputArg, outputTensorInfo, *pMemPools);
             if (outputTensor.GetMemoryArea() == nullptr)
-
             {
                 ALOGE("Cannot execute request. Error converting request output %u to tensor", i);
-                NotifyCallbackAndCheck(callback, ErrorStatus::GENERAL_FAILURE,
-                                       "ArmnnPreparedModel_1_2::execute");
+                callback(ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
                 return ErrorStatus::GENERAL_FAILURE;
             }
 
+            const size_t outputSize = outputTensorInfo.GetNumBytes();
+            const size_t bufferSize = pMemPools->at(outputArg.location.poolIndex).getHidlMemory().size();
             pOutputTensors->emplace_back(i, outputTensor);
+
+            hidl_vec<uint32_t> dimensions;
+
+            armnn::TensorShape tensorShape = outputTensorInfo.GetShape();
+            const unsigned int numDims = tensorShape.GetNumDimensions();
+            dimensions.resize(numDims);
+
+            for (unsigned int outputIdx = 0u; outputIdx < numDims; ++outputIdx)
+            {
+                dimensions[outputIdx] = tensorShape[outputIdx];
+            }
+            outputShapes[i].dimensions = dimensions;
+            outputShapes[i].isSufficient = bufferSize >= outputSize;
+
+            if (bufferSize < outputSize)
+            {
+                ALOGW("ArmnnPreparedModel_1_2::Execute failed: output %u buffer is too small", i);
+                callback(ErrorStatus::OUTPUT_INSUFFICIENT_SIZE,
+                         outputShapes,
+                         g_NoTiming,
+                         "ArmnnPreparedModel_1_2::Execute");
+                return ErrorStatus::NONE;
+            }
         }
     }
     catch (armnn::Exception& e)
     {
         ALOGW("armnn::Exception caught while preparing for EnqueueWorkload: %s", e.what());
-        NotifyCallbackAndCheck(callback, ErrorStatus::GENERAL_FAILURE, "ArmnnPreparedModel_1_2::execute");
+        callback(ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
         return ErrorStatus::GENERAL_FAILURE;
     }
 
     ALOGV("ArmnnPreparedModel_1_2::execute(...) before PostMsg");
     // post the request for asynchronous execution
-    m_RequestThread.PostMsg(this, pMemPools, pInputTensors, pOutputTensors, callback);
+    ArmnnCallback_1_2 armnnCb;
+    armnnCb.callback = callback;
+    armnnCb.measureTiming = measureTiming;
+    armnnCb.driverStart = driverStart;
+    m_RequestThread.PostMsg(this, pMemPools, pInputTensors, pOutputTensors, armnnCb);
     ALOGV("ArmnnPreparedModel_1_2::execute(...) after PostMsg");
-
     return ErrorStatus::NONE;
 }
 
-
 #ifdef ARMNN_ANDROID_NN_V1_2
 template class ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>;
 #endif
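
Note on the insufficient-buffer path in Execute() above: when an output buffer
is smaller than the tensor ArmNN will write, the failure travels through the
callback as OUTPUT_INSUFFICIENT_SIZE together with the shapes the caller needs,
while Execute() itself returns NONE because the launch was accepted and
answered rather than rejected. A condensed sketch, assuming the names used in
the patch:

    if (bufferSize < outputSize)
    {
        // isSufficient was already computed as (bufferSize >= outputSize) above.
        callback(ErrorStatus::OUTPUT_INSUFFICIENT_SIZE, outputShapes, g_NoTiming,
                 "ArmnnPreparedModel_1_2::Execute");
        return ErrorStatus::NONE; // the error is delivered via the callback
    }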
diff --git a/ArmnnPreparedModel_1_2.hpp b/ArmnnPreparedModel_1_2.hpp
index 4e883b6..b97895e 100644
--- a/ArmnnPreparedModel_1_2.hpp
+++ b/ArmnnPreparedModel_1_2.hpp
@@ -19,6 +19,18 @@
 namespace armnn_driver
 {
 
+using armnnExecuteCallback_1_2 = std::function<void(::android::hardware::neuralnetworks::V1_0::ErrorStatus status,
+        std::vector<::android::hardware::neuralnetworks::V1_2::OutputShape> outputShapes,
+        const ::android::hardware::neuralnetworks::V1_2::Timing& timing,
+        std::string callingFunction)>;
+
+struct ArmnnCallback_1_2
+{
+    armnnExecuteCallback_1_2 callback;
+    TimePoint driverStart;
+    MeasureTiming measureTiming;
+};
+
 template <typename HalVersion>
 class ArmnnPreparedModel_1_2 : public V1_2::IPreparedModel
 {
@@ -34,7 +46,7 @@
     virtual ~ArmnnPreparedModel_1_2();
 
     virtual Return<ErrorStatus> execute(const Request& request,
-                                        const ::android::sp<V1_0::IExecutionCallback>& callback) override;
+                                        const sp<V1_0::IExecutionCallback>& callback) override;
 
     virtual Return<ErrorStatus> execute_1_2(const Request& request, MeasureTiming measure,
                                             const sp<V1_2::IExecutionCallback>& callback) override;
@@ -53,28 +65,29 @@
     void ExecuteGraph(std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
                       std::shared_ptr<armnn::InputTensors>& pInputTensors,
                       std::shared_ptr<armnn::OutputTensors>& pOutputTensors,
-                      const ::android::sp<V1_0::IExecutionCallback>& callback);
+                      ArmnnCallback_1_2 callbackDescriptor);
 
     /// Executes this model with dummy inputs (e.g. all zeroes).
     /// \return false on failure, otherwise true
     bool ExecuteWithDummyInputs();
 
 private:
-    template <typename ExecutionCallback>
-    Return <ErrorStatus> Execute(const Request &request, const sp <ExecutionCallback> &callback);
+    Return <ErrorStatus> Execute(const Request& request,
+                                 MeasureTiming measureTiming,
+                                 armnnExecuteCallback_1_2 callback);
 
     template <typename TensorBindingCollection>
     void DumpTensorsIfRequired(char const* tensorNamePrefix, const TensorBindingCollection& tensorBindings);
 
-    armnn::NetworkId                                         m_NetworkId;
-    armnn::IRuntime*                                         m_Runtime;
-    V1_2::Model                                              m_Model;
+    armnn::NetworkId                                                            m_NetworkId;
+    armnn::IRuntime*                                                            m_Runtime;
+    V1_2::Model                                                                 m_Model;
     // There must be a single RequestThread for all ArmnnPreparedModel objects to ensure serial execution of workloads
     // It is specific to this class, so it is declared as static here
-    static RequestThread<ArmnnPreparedModel_1_2, HalVersion> m_RequestThread;
-    uint32_t                                                 m_RequestCount;
-    const std::string&                                       m_RequestInputsAndOutputsDumpDir;
-    const bool                                               m_GpuProfilingEnabled;
+    static RequestThread<ArmnnPreparedModel_1_2, HalVersion, ArmnnCallback_1_2> m_RequestThread;
+    uint32_t                                                                    m_RequestCount;
+    const std::string&                                                          m_RequestInputsAndOutputsDumpDir;
+    const bool                                                                  m_GpuProfilingEnabled;
 };
 
 }
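
Note on ArmnnCallback_1_2 above: driverStart is captured in Execute() on the
caller's thread and carried inside the callback struct to the request thread,
so that ExecuteGraph() can compute timeInDriver across the thread boundary.
A sketch of the final timing computation, assuming the MicrosecondsDuration
helper from ArmnnPreparedModel_1_2.cpp:

    driverEnd = Now();
    Timing timing;
    timing.timeOnDevice = MicrosecondsDuration(deviceEnd, deviceStart);    // EnqueueWorkload span
    timing.timeInDriver = MicrosecondsDuration(driverEnd, cb.driverStart); // whole-request span
    cb.callback(ErrorStatus::NONE, outputShapes, timing, "ExecuteGraph");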
diff --git a/RequestThread.cpp b/RequestThread.cpp
index 4b64603..052c5c1 100644
--- a/RequestThread.cpp
+++ b/RequestThread.cpp
@@ -21,15 +21,15 @@
 namespace armnn_driver
 {
 
-template <template <typename HalVersion> class PreparedModel, typename HalVersion>
-RequestThread<PreparedModel, HalVersion>::RequestThread()
+template <template <typename HalVersion> class PreparedModel, typename HalVersion, typename Callback>
+RequestThread<PreparedModel, HalVersion, Callback>::RequestThread()
 {
     ALOGV("RequestThread::RequestThread()");
     m_Thread = std::make_unique<std::thread>(&RequestThread::Process, this);
 }
 
-template <template <typename HalVersion> class PreparedModel, typename HalVersion>
-RequestThread<PreparedModel, HalVersion>::~RequestThread()
+template <template <typename HalVersion> class PreparedModel, typename HalVersion, typename Callback>
+RequestThread<PreparedModel, HalVersion, Callback>::~RequestThread()
 {
     ALOGV("RequestThread::~RequestThread()");
 
@@ -54,12 +54,12 @@
     catch (const std::exception&) { } // Swallow any exception.
 }
 
-template <template <typename HalVersion> class PreparedModel, typename HalVersion>
-void RequestThread<PreparedModel, HalVersion>::PostMsg(PreparedModel<HalVersion>* model,
+template <template <typename HalVersion> class PreparedModel, typename HalVersion, typename Callback>
+void RequestThread<PreparedModel, HalVersion, Callback>::PostMsg(PreparedModel<HalVersion>* model,
         std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& memPools,
         std::shared_ptr<armnn::InputTensors>& inputTensors,
         std::shared_ptr<armnn::OutputTensors>& outputTensors,
-        const ::android::sp<V1_0::IExecutionCallback>& callback)
+        Callback callback)
 {
     ALOGV("RequestThread::PostMsg(...)");
     auto data = std::make_shared<AsyncExecuteData>(model,
@@ -71,8 +71,8 @@
     PostMsg(pMsg);
 }
 
-template <template <typename HalVersion> class PreparedModel, typename HalVersion>
-void RequestThread<PreparedModel, HalVersion>::PostMsg(std::shared_ptr<ThreadMsg>& pMsg)
+template <template <typename HalVersion> class PreparedModel, typename HalVersion, typename Callback>
+void RequestThread<PreparedModel, HalVersion, Callback>::PostMsg(std::shared_ptr<ThreadMsg>& pMsg)
 {
     ALOGV("RequestThread::PostMsg(pMsg)");
     // Add a message to the queue and notify the request thread
@@ -81,8 +81,8 @@
     m_Cv.notify_one();
 }
 
-template <template <typename HalVersion> class PreparedModel, typename HalVersion>
-void RequestThread<PreparedModel, HalVersion>::Process()
+template <template <typename HalVersion> class PreparedModel, typename HalVersion, typename Callback>
+void RequestThread<PreparedModel, HalVersion, Callback>::Process()
 {
     ALOGV("RequestThread::Process()");
     while (true)
@@ -111,7 +111,7 @@
                 model->ExecuteGraph(pMsg->data->m_MemPools,
                                     pMsg->data->m_InputTensors,
                                     pMsg->data->m_OutputTensors,
-                                    pMsg->data->m_callback);
+                                    pMsg->data->m_Callback);
                 break;
             }
 
@@ -139,16 +139,16 @@
 /// Class template specializations
 ///
 
-template class RequestThread<ArmnnPreparedModel, hal_1_0::HalPolicy>;
+template class RequestThread<ArmnnPreparedModel, hal_1_0::HalPolicy, ArmnnCallback_1_0>;
 
 #ifdef ARMNN_ANDROID_NN_V1_1
-template class RequestThread<armnn_driver::ArmnnPreparedModel, hal_1_1::HalPolicy>;
+template class RequestThread<armnn_driver::ArmnnPreparedModel, hal_1_1::HalPolicy, ArmnnCallback_1_0>;
 #endif
 
 #ifdef ARMNN_ANDROID_NN_V1_2
-template class RequestThread<ArmnnPreparedModel, hal_1_1::HalPolicy>;
-template class RequestThread<ArmnnPreparedModel, hal_1_2::HalPolicy>;
-template class RequestThread<ArmnnPreparedModel_1_2, hal_1_2::HalPolicy>;
+template class RequestThread<ArmnnPreparedModel, hal_1_1::HalPolicy, ArmnnCallback_1_0>;
+template class RequestThread<ArmnnPreparedModel, hal_1_2::HalPolicy, ArmnnCallback_1_0>;
+template class RequestThread<ArmnnPreparedModel_1_2, hal_1_2::HalPolicy, ArmnnCallback_1_2>;
 #endif
 
 } // namespace armnn_driver
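
Note on the Callback template parameter above: RequestThread is now generic
over the callback payload, so the same worker thread can carry the plain
HAL 1.0 callback wrapper and the HAL 1.2 wrapper with its extra timing state.
A sketch of how each prepared model posts its own payload type (cb10, cb12
and callback12 are illustrative names; the surrounding calls mirror the patch):

    // ArmnnPreparedModel (HAL 1.0/1.1): the payload is just the wrapped callback.
    ArmnnCallback_1_0 cb10;
    cb10.callback = [callback](ErrorStatus status, std::string callingFunction)
    {
        NotifyCallbackAndCheck(callback, status, callingFunction);
    };
    m_RequestThread.PostMsg(this, pMemPools, pInputTensors, pOutputTensors, cb10);

    // ArmnnPreparedModel_1_2: the payload also carries the timing state.
    ArmnnCallback_1_2 cb12;
    cb12.callback      = callback12; // an armnnExecuteCallback_1_2
    cb12.measureTiming = measureTiming;
    cb12.driverStart   = driverStart;
    m_RequestThread.PostMsg(this, pMemPools, pInputTensors, pOutputTensors, cb12);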
diff --git a/RequestThread.hpp b/RequestThread.hpp
index dc1b535..253d104 100644
--- a/RequestThread.hpp
+++ b/RequestThread.hpp
@@ -18,8 +18,10 @@
 
 namespace armnn_driver
 {
+using TimePoint = std::chrono::steady_clock::time_point;
+static const TimePoint g_Min = std::chrono::steady_clock::time_point::min();
 
-template<template <typename HalVersion> class PreparedModel, typename HalVersion>
+template<template <typename HalVersion> class PreparedModel, typename HalVersion, typename Callback>
 class RequestThread
 {
 public:
@@ -39,7 +41,7 @@
                  std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& memPools,
                  std::shared_ptr<armnn::InputTensors>& inputTensors,
                  std::shared_ptr<armnn::OutputTensors>& outputTensors,
-                 const ::android::sp<V1_0::IExecutionCallback>& callback);
+                 Callback callback);
 
 private:
     RequestThread(const RequestThread&) = delete;
@@ -52,12 +54,12 @@
                          std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& memPools,
                          std::shared_ptr<armnn::InputTensors>& inputTensors,
                          std::shared_ptr<armnn::OutputTensors>& outputTensors,
-                         const ::android::sp<V1_0::IExecutionCallback>& cb)
+                         Callback callback)
             : m_Model(model)
             , m_MemPools(memPools)
             , m_InputTensors(inputTensors)
             , m_OutputTensors(outputTensors)
-            , m_callback(cb)
+            , m_Callback(callback)
         {
         }
 
@@ -65,7 +67,7 @@
         std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>> m_MemPools;
         std::shared_ptr<armnn::InputTensors> m_InputTensors;
         std::shared_ptr<armnn::OutputTensors> m_OutputTensors;
-        const ::android::sp<V1_0::IExecutionCallback> m_callback;
+        Callback m_Callback;
     };
 
     enum class ThreadMsgType