IVGCVSW-4677 Fix FencedComputeTest Hal 1.3

* Implemented executeFenced() function in HAL 1.3 Driver

Signed-off-by: Sadik Armagan <sadik.armagan@arm.com>
Change-Id: I11f8f532e9688d4e194992b46dbed575a19be3c5
diff --git a/Android.mk b/Android.mk
index 5431a74..bf8bc65 100644
--- a/Android.mk
+++ b/Android.mk
@@ -184,6 +184,7 @@
 
 ifeq ($(R_OR_LATER),1)
 LOCAL_SHARED_LIBRARIES+= \
+        libsync \
         android.hardware.neuralnetworks@1.3
 endif # R or later
 
@@ -307,6 +308,7 @@
 
 ifeq ($(R_OR_LATER),1)
 LOCAL_SHARED_LIBRARIES+= \
+        libsync \
         android.hardware.neuralnetworks@1.3
 endif # R or later
 
@@ -421,6 +423,7 @@
 
 ifeq ($(R_OR_LATER),1)
 LOCAL_SHARED_LIBRARIES+= \
+        libsync \
         android.hardware.neuralnetworks@1.3
 endif # R or later
 
@@ -527,6 +530,7 @@
         libcutils \
         android.hidl.allocator@1.0 \
         android.hidl.memory@1.0 \
+        libsync \
         android.hardware.neuralnetworks@1.0 \
         android.hardware.neuralnetworks@1.1 \
         android.hardware.neuralnetworks@1.2 \
@@ -631,6 +635,7 @@
 
 ifeq ($(R_OR_LATER),1)
 LOCAL_SHARED_LIBRARIES+= \
+        libsync \
         android.hardware.neuralnetworks@1.3
 endif # R or later
 
@@ -729,6 +734,7 @@
 
 ifeq ($(R_OR_LATER),1)
 LOCAL_SHARED_LIBRARIES+= \
+        libsync \
         android.hardware.neuralnetworks@1.3
 endif # PLATFORM_VERSION == R
 
@@ -816,6 +822,7 @@
 
 ifeq ($(R_OR_LATER),1)
 LOCAL_SHARED_LIBRARIES+= \
+        libsync \
         android.hardware.neuralnetworks@1.3
 endif # R or later
 
@@ -890,6 +897,7 @@
         libui \
         libfmq \
         libcutils \
+        libsync \
         android.hidl.allocator@1.0 \
         android.hidl.memory@1.0 \
         android.hardware.neuralnetworks@1.0 \
diff --git a/ArmnnPreparedModel_1_3.cpp b/ArmnnPreparedModel_1_3.cpp
index 5b45b4a..6c4aec9 100644
--- a/ArmnnPreparedModel_1_3.cpp
+++ b/ArmnnPreparedModel_1_3.cpp
@@ -2,6 +2,10 @@
 // Copyright © 2020 Arm Ltd. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
+// Note: the ArmnnFencedExecutionCallback and code snippet in the executeFenced() function
+//       in this file is based on Android code
+//       under the Apache 2.0 license. See comments below for details.
+//
 
 #define LOG_TAG "ArmnnDriver"
 
@@ -9,6 +13,7 @@
 #include "Utils.hpp"
 
 #include <Utils.h>
+#include <android/sync.h>
 #include <boost/format.hpp>
 #include <log/log.h>
 #include <OperationsUtils.h>
@@ -254,10 +259,31 @@
     return Execute(request, measureTiming, cb);
 }
 
+/// This class is inspired by the sample implementation in Android named SampleFencedExecutionCallback.
+/// The original code is licensed under Apache-2.0 and can be found at the following link:
+/// https://android.googlesource.com/platform/frameworks/ml/+/master/nn/driver/sample/SampleDriver.h
+class ArmnnFencedExecutionCallback : public V1_3::IFencedExecutionCallback
+{
+public:
+    ArmnnFencedExecutionCallback(V1_3::ErrorStatus errorStatus, Timing timing, Timing fenceTiming)
+        : m_ErrorStatus(errorStatus), m_Timing(timing), m_FenceTiming(fenceTiming) {}
+    ~ArmnnFencedExecutionCallback() {}
+
+    Return<void> getExecutionInfo(getExecutionInfo_cb callback) override
+    {
+        callback(m_ErrorStatus, m_Timing, m_FenceTiming);
+        return Void();
+    }
+private:
+    V1_3::ErrorStatus m_ErrorStatus;
+    Timing m_Timing;
+    Timing m_FenceTiming;
+};
+
 template<typename HalVersion>
-Return<void> ArmnnPreparedModel_1_3<HalVersion>::executeFenced(const V1_3::Request&,
-                                                               const hidl_vec<hidl_handle>&,
-                                                               MeasureTiming,
+Return<void> ArmnnPreparedModel_1_3<HalVersion>::executeFenced(const V1_3::Request& request,
+                                                               const hidl_vec<hidl_handle>& fenceWaitFor,
+                                                               MeasureTiming measureTiming,
                                                                const OptionalTimePoint& deadline,
                                                                const OptionalTimeoutDuration& loopTimeoutDuration,
                                                                const OptionalTimeoutDuration&,
@@ -281,7 +307,104 @@
         ALOGW("ArmnnPreparedModel_1_3::executeFenced parameter loopTimeoutDuration is set but not supported.");
     }
 
-    cb(ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
+    ExecutionContext_1_3 ctx;
+    if (measureTiming == MeasureTiming::YES)
+    {
+        ctx.measureTimings = measureTiming;
+        ctx.driverStart = Now();
+    }
+
+    ALOGV("ArmnnPreparedModel_1_3::executeFenced(): %s", GetModelSummary(m_Model).c_str());
+    m_RequestCount++;
+
+    if (!android::nn::validateRequest(request, m_Model))
+    {
+        cb(ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
+        return Void();
+    }
+
+    if (!m_RequestInputsAndOutputsDumpDir.empty())
+    {
+        ALOGD("Dumping inputs and outputs for request %" PRIuPTR, reinterpret_cast<std::uintptr_t>(&cb));
+    }
+
+    // This code snippet is inspired by the sample implementation in Android named SampleDriver::executeFenced()
+    // function. The original code is licensed under Apache-2.0 and can be found at the following link:
+    // https://android.googlesource.com/platform/frameworks/ml/+/master/nn/driver/sample/SampleDriver.cpp
+    const auto fenceSize = fenceWaitFor.size();
+    for (unsigned int index = 0; index < fenceSize; ++index)
+    {
+        auto fenceNativeHandle = fenceWaitFor[index].getNativeHandle();
+        if (!fenceNativeHandle)
+        {
+            cb(ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
+            return Void();
+        }
+
+        if (sync_wait(fenceNativeHandle->data[0], -1) < 0)
+        {
+            ALOGE("ArmnnPreparedModel_1_3::executeFenced sync fence failed.");
+            cb(ErrorStatus::GENERAL_FAILURE, hidl_handle(nullptr), nullptr);
+            return Void();
+        }
+    }
+
+    TimePoint fenceExecutionStart;
+    if (measureTiming == MeasureTiming::YES)
+    {
+        fenceExecutionStart = Now();
+    }
+
+    // map the memory pool into shared pointers
+    // use a shared memory pools vector on the heap, as it is passed to the request thread
+    auto memPools = std::make_shared<std::vector<android::nn::RunTimePoolInfo>>();
+
+    // allocate the tensors on the heap, as they are passed to the request thread
+    auto inputs = std::make_shared<armnn::InputTensors>();
+    auto outputs = std::make_shared<armnn::OutputTensors>();
+
+    auto [status, outShapes, timings, message] = PrepareMemoryForIO(*inputs, *outputs, *memPools, request);
+    if (status != V1_3::ErrorStatus::NONE)
+    {
+        cb(ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
+        return Void();
+    }
+
+    ALOGV("ArmnnPreparedModel_1_3::executeFenced(...) before ExecuteGraph");
+
+    // call it with nullCallback for now as we will report the error status from here.
+    auto nullCallback = [](V1_3::ErrorStatus, std::vector<OutputShape>, const Timing&, std::string) {};
+    CallbackContext_1_3 cbCtx;
+    cbCtx.callback = nullCallback;
+    cbCtx.ctx = ctx;
+
+    auto errorStatus = ExecuteGraph(memPools, *inputs, *outputs, cbCtx);
+    if (errorStatus != V1_3::ErrorStatus::NONE)
+    {
+        cb(errorStatus, hidl_handle(nullptr), nullptr);
+        return Void();
+    }
+    ALOGV("ArmnnPreparedModel_1_3::executeFenced(...) after ExecuteGraph");
+
+    Timing timing = g_NoTiming;
+    Timing fenceTiming = g_NoTiming;
+    if (measureTiming == MeasureTiming::YES)
+    {
+        TimePoint driverEnd = Now();
+        timing.timeOnDevice = MicrosecondsDuration(ctx.deviceEnd, ctx.deviceStart);
+        timing.timeInDriver = MicrosecondsDuration(driverEnd, ctx.driverStart);
+        ALOGV("ArmnnPreparedModel_1_3::fenceExecutionTiming - Device = %lu Driver = %lu",
+              timing.timeOnDevice, timing.timeInDriver);
+
+        fenceTiming.timeOnDevice = MicrosecondsDuration(ctx.deviceEnd, ctx.deviceStart);
+        fenceTiming.timeInDriver = MicrosecondsDuration(driverEnd, fenceExecutionStart);
+        ALOGV("ArmnnPreparedModel_1_3::fenceFinishExecutionTiming - Device = %lu Driver = %lu",
+              fenceTiming.timeOnDevice, fenceTiming.timeInDriver);
+    }
+
+    sp<ArmnnFencedExecutionCallback> armnnFencedExecutionCallback =
+        new ArmnnFencedExecutionCallback(ErrorStatus::NONE, timing, fenceTiming);
+    cb(ErrorStatus::NONE, hidl_handle(nullptr), armnnFencedExecutionCallback);
     return Void();
 }
 
@@ -540,7 +663,7 @@
 
 template<typename HalVersion>
 template<typename CallbackContext>
-bool ArmnnPreparedModel_1_3<HalVersion>::ExecuteGraph(
+Return <V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::ExecuteGraph(
     std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
     armnn::InputTensors& inputTensors,
     armnn::OutputTensors& outputTensors,
@@ -567,34 +690,33 @@
     {
         if (cb.ctx.measureTimings == MeasureTiming::YES)
         {
-            deviceStart = Now();
+            cb.ctx.deviceStart = Now();
         }
 
         armnn::Status status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
 
         if (cb.ctx.measureTimings == MeasureTiming::YES)
         {
-            deviceEnd = Now();
+            cb.ctx.deviceEnd = Now();
         }
         if (status != armnn::Status::Success)
         {
             ALOGW("EnqueueWorkload failed");
-            cb.callback(V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming,
-                        "ArmnnPreparedModel_1_3::ExecuteGraph");
-            return false;
+            cb.callback(V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
+            return V1_3::ErrorStatus::GENERAL_FAILURE;
         }
     }
     catch (armnn::Exception& e)
     {
         ALOGW("armnn:Exception caught from EnqueueWorkload: %s", e.what());
         cb.callback(V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
-        return false;
+        return V1_3::ErrorStatus::GENERAL_FAILURE;
     }
     catch (std::exception& e)
     {
         ALOGE("std::exception caught from EnqueueWorkload: %s", e.what());
         cb.callback(V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
-        return false;
+        return V1_3::ErrorStatus::GENERAL_FAILURE;
     }
 
     CommitPools(*pMemPools);
@@ -605,16 +727,16 @@
     {
         driverEnd = Now();
         Timing timing;
-        timing.timeOnDevice = MicrosecondsDuration(deviceEnd, deviceStart);
+        timing.timeOnDevice = MicrosecondsDuration(cb.ctx.deviceEnd, cb.ctx.deviceStart);
         timing.timeInDriver = MicrosecondsDuration(driverEnd, cb.ctx.driverStart);
         ALOGV("ArmnnPreparedModel_1_2::execute timing - Device = %lu Driver = %lu", timing.timeOnDevice,
               timing.timeInDriver);
         cb.callback(V1_3::ErrorStatus::NONE, outputShapes, timing, "ArmnnPreparedModel_1_3::ExecuteGraph");
-    } else {
+    } else
+    {
         cb.callback(V1_3::ErrorStatus::NONE, outputShapes, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
     }
-
-    return true;
+    return V1_3::ErrorStatus::NONE;
 }
 
 template<typename HalVersion>
@@ -646,10 +768,12 @@
     callbackContext.callback = nullCallback;
     callbackContext.ctx.measureTimings = MeasureTiming::NO;
     auto memPools = std::make_shared<std::vector<::android::nn::RunTimePoolInfo>>();
-    return ExecuteGraph(memPools,
-                        inputTensors,
-                        outputTensors,
-                        callbackContext);
+
+    auto errorStatus = ExecuteGraph(memPools,
+                                    inputTensors,
+                                    outputTensors,
+                                    callbackContext);
+    return errorStatus == V1_3::ErrorStatus::NONE;
 }
 
 template<typename HalVersion>
@@ -716,7 +840,7 @@
 
 #ifdef ARMNN_ANDROID_NN_V1_3
 template class ArmnnPreparedModel_1_3<hal_1_3::HalPolicy>;
-template bool ArmnnPreparedModel_1_3<hal_1_3::HalPolicy>::ExecuteGraph<CallbackContext_1_3>(
+template Return <V1_3::ErrorStatus> ArmnnPreparedModel_1_3<hal_1_3::HalPolicy>::ExecuteGraph<CallbackContext_1_3>(
         std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
         armnn::InputTensors& pInputTensors,
         armnn::OutputTensors& pOutputTensors,
diff --git a/ArmnnPreparedModel_1_3.hpp b/ArmnnPreparedModel_1_3.hpp
index 47529aa..fa67405 100644
--- a/ArmnnPreparedModel_1_3.hpp
+++ b/ArmnnPreparedModel_1_3.hpp
@@ -29,6 +29,8 @@
     ::android::hardware::neuralnetworks::V1_2::MeasureTiming    measureTimings =
         ::android::hardware::neuralnetworks::V1_2::MeasureTiming::NO;
     TimePoint driverStart;
+    TimePoint deviceStart;
+    TimePoint deviceEnd;
 };
 
 using CallbackContext_1_3 = CallbackContext<CallbackAsync_1_3, ExecutionContext_1_3>;
@@ -74,7 +76,7 @@
                                           V1_3::IPreparedModel::executeSynchronously_1_3_cb cb) override;
 
     Return<void> executeFenced(const V1_3::Request& request,
-                               const android::hardware::hidl_vec<android::hardware::hidl_handle>& wait_for,
+                               const android::hardware::hidl_vec<android::hardware::hidl_handle>& fenceWaitFor,
                                MeasureTiming measure,
                                const V1_3::OptionalTimePoint& deadline,
                                const V1_3::OptionalTimeoutDuration& loopTimeoutDuration,
@@ -92,10 +94,11 @@
 
     /// execute the graph prepared from the request
     template<typename CallbackContext>
-    bool ExecuteGraph(std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
-                      armnn::InputTensors& inputTensors,
-                      armnn::OutputTensors& outputTensors,
-                      CallbackContext callback);
+    Return <V1_3::ErrorStatus> ExecuteGraph(
+              std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
+              armnn::InputTensors& inputTensors,
+              armnn::OutputTensors& outputTensors,
+              CallbackContext callback);
 
     /// Executes this model with dummy inputs (e.g. all zeroes).
     /// \return false on failure, otherwise true
diff --git a/test/Android.mk b/test/Android.mk
index 17def76..af04c83 100644
--- a/test/Android.mk
+++ b/test/Android.mk
@@ -109,6 +109,7 @@
 
 ifeq ($(R_OR_LATER),1)
 LOCAL_SHARED_LIBRARIES+= \
+        libsync \
         android.hardware.neuralnetworks@1.3
 endif # R or later
 
@@ -218,6 +219,7 @@
 
 ifeq ($(R_OR_LATER),1)
 LOCAL_SHARED_LIBRARIES+= \
+        libsync \
         android.hardware.neuralnetworks@1.3
 endif # R or later
 
@@ -320,6 +322,7 @@
 
 ifeq ($(R_OR_LATER),1)
 LOCAL_SHARED_LIBRARIES+= \
+        libsync \
         android.hardware.neuralnetworks@1.3
 endif # R or later