Make Convert workloads use arm_compute::NECast in CpuAcc backend

NECast can use hardware conversion instructions where they are
available, so this should generally be faster.
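
For context, a minimal sketch of the Compute Library cast pattern this
change relies on (validate, configure, run). The function name, tensor
setup and shape below are illustrative only and not part of this patch:

    #include <arm_compute/runtime/NEON/functions/NECast.h>
    #include <arm_compute/runtime/Tensor.h>

    void CastFp16ToFp32Sketch()
    {
        arm_compute::Tensor src, dst;
        src.allocator()->init(arm_compute::TensorInfo(arm_compute::TensorShape(16U), 1, arm_compute::DataType::F16));
        dst.allocator()->init(arm_compute::TensorInfo(arm_compute::TensorShape(16U), 1, arm_compute::DataType::F32));

        // validate() returns a Status that converts to true only when the
        // conversion is supported on this target (FP16 needs hardware support)
        if (arm_compute::NECast::validate(src.info(), dst.info(), arm_compute::ConvertPolicy::SATURATE))
        {
            arm_compute::NECast cast;
            cast.configure(&src, &dst, arm_compute::ConvertPolicy::SATURATE);
            src.allocator()->allocate();
            dst.allocator()->allocate();
            cast.run();
        }
    }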

Signed-off-by: Matthew Bentham <Matthew.Bentham@arm.com>
Change-Id: I3f259e17b280a4f4c36f363965ffbc8ee8c4c29f
diff --git a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
index ce6c785..f65d719 100644
--- a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
@@ -11,22 +11,56 @@
 
 #include <backendsCommon/WorkloadUtils.hpp>
 
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
 namespace armnn
 {
 
+arm_compute::Status NeonConvertFp16ToFp32WorkloadValidate(const TensorInfo& input, const TensorInfo& output)
+{
+    // We fall back to a portable software implementation if Compute Library's NECast cannot be
+    // used, so this method always returns success.
+
+    armnn::IgnoreUnused(input);
+    armnn::IgnoreUnused(output);
+    return arm_compute::Status();
+}
+
 NeonConvertFp16ToFp32Workload::NeonConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor,
                                                              const WorkloadInfo& info)
      : Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>(descriptor, info)
 {
     this->m_Data.ValidateInputsOutputs("NeonConvertFp16ToFp32Workload", 1, 1);
-    GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+
+    arm_compute::ITensor& input  = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ITensor& output = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    if (arm_compute::NECast::validate(input.info(), output.info(), g_AclConvertPolicy))
+    {
+        // Use NECast if supported (needs hardware support for FP16)
+        m_Cast.reset(new arm_compute::NECast());
+        m_Cast->configure(&input, &output, g_AclConvertPolicy);
+    }
+    else
+    {
+        // Otherwise fall back to the software implementation using Half.hpp
+        GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+    }
 }
 
 void NeonConvertFp16ToFp32Workload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonConvertFp16ToFp32Workload_Execute", this->GetGuid());
 
-    auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+    if (m_Cast)
+    {
+        // Use NECast if supported and initialised
+        m_Cast->run();
+    }
+    else
+    {
+        // Otherwise fall back to the software implementation using Half.hpp
+        auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
         {
             auto input = reinterpret_cast<const Half*>(src);
             auto output = reinterpret_cast<float*>(dst);
@@ -34,9 +68,10 @@
             armnnUtils::FloatingPointConverter::ConvertFloat16To32(input, numElements, output);
         };
 
-    for (const auto& pair : m_TensorHandlePairs)
-    {
-        CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+        for (const auto& pair : m_TensorHandlePairs)
+        {
+            CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+        }
     }
 }