Make Convert workloads use arm_compute::NECast in CpuAcc backend

NECast can use hardware conversion instructions where they are
available, so this should generally be faster than the existing
element-by-element software conversion.
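
At runtime each Convert workload constructor asks NECast whether it
supports the conversion and only configures it when validation passes;
otherwise the workload keeps the existing Half.hpp conversion path.
A minimal sketch of that pattern (the helper name is illustrative; the
ACL calls are the ones used in this patch):

    #include <arm_compute/runtime/NEON/functions/NECast.h>
    #include <memory>

    void ConfigureCastOrFallback(arm_compute::ITensor& input,
                                 arm_compute::ITensor& output,
                                 std::unique_ptr<arm_compute::NECast>& cast)
    {
        const auto policy = arm_compute::ConvertPolicy::SATURATE;
        if (arm_compute::NECast::validate(input.info(), output.info(), policy))
        {
            // NECast accepted the conversion (needs FP16 support in hardware)
            cast.reset(new arm_compute::NECast());
            cast->configure(&input, &output, policy);
        }
        // else: leave 'cast' empty; Execute() then falls back to the
        // FloatingPointConverter copy, exactly as before this change
    }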

Signed-off-by: Matthew Bentham <Matthew.Bentham@arm.com>
Change-Id: I3f259e17b280a4f4c36f363965ffbc8ee8c4c29f
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
index 672b2f3..4e4d7fa 100644
--- a/src/backends/neon/NeonLayerSupport.cpp
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -32,6 +32,8 @@
 #include "workloads/NeonComparisonWorkload.hpp"
 #include "workloads/NeonConcatWorkload.hpp"
 #include "workloads/NeonConstantWorkload.hpp"
+#include "workloads/NeonConvertFp16ToFp32Workload.hpp"
+#include "workloads/NeonConvertFp32ToFp16Workload.hpp"
 #include "workloads/NeonConvolution2dWorkload.hpp"
 #include "workloads/NeonConvolution3dWorkload.hpp"
 #include "workloads/NeonDepthToSpaceWorkload.hpp"
@@ -887,20 +889,20 @@
                                                     const TensorInfo& output,
                                                     Optional<std::string&> reasonIfUnsupported) const
 {
-    armnn::IgnoreUnused(input);
-    armnn::IgnoreUnused(output);
-    armnn::IgnoreUnused(reasonIfUnsupported);
-    return true;
+    FORWARD_WORKLOAD_VALIDATE_FUNC(NeonConvertFp16ToFp32WorkloadValidate,
+                                   reasonIfUnsupported,
+                                   input,
+                                   output);
 }
 
 bool NeonLayerSupport::IsConvertFp32ToFp16Supported(const TensorInfo& input,
                                                     const TensorInfo& output,
                                                     Optional<std::string&> reasonIfUnsupported) const
 {
-    armnn::IgnoreUnused(input);
-    armnn::IgnoreUnused(output);
-    armnn::IgnoreUnused(reasonIfUnsupported);
-    return true;
+    FORWARD_WORKLOAD_VALIDATE_FUNC(NeonConvertFp32ToFp16WorkloadValidate,
+                                   reasonIfUnsupported,
+                                   input,
+                                   output);
 }
 
 bool NeonLayerSupport::IsConvolution2dSupported(const TensorInfo& input,
diff --git a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
index ce6c785..f65d719 100644
--- a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
@@ -11,22 +11,56 @@
 
 #include <backendsCommon/WorkloadUtils.hpp>
 
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
 namespace armnn
 {
 
+arm_compute::Status NeonConvertFp16ToFp32WorkloadValidate(const TensorInfo& input, const TensorInfo& output)
+{
+    // Fall back to the portable software implementation if Compute Library's NECast
+    // cannot handle this conversion, so this method always returns success
+
+    armnn::IgnoreUnused(input);
+    armnn::IgnoreUnused(output);
+    return arm_compute::Status();
+}
+
 NeonConvertFp16ToFp32Workload::NeonConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor,
                                                              const WorkloadInfo& info)
      : Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>(descriptor, info)
 {
     this->m_Data.ValidateInputsOutputs("NeonConvertFp16ToFp32Workload", 1, 1);
-    GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+
+    arm_compute::ITensor& input  = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ITensor& output = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    if (arm_compute::NECast::validate(input.info(), output.info(), g_AclConvertPolicy))
+    {
+        // Use NECast if supported (needs hardware support for FP16)
+        m_Cast.reset(new arm_compute::NECast());
+        m_Cast->configure(&input, &output, g_AclConvertPolicy);
+    }
+    else
+    {
+        // Otherwise use the software implementation from Half.hpp
+        GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+    }
 }
 
 void NeonConvertFp16ToFp32Workload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonConvertFp16ToFp32Workload_Execute", this->GetGuid());
 
-    auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+    if (m_Cast)
+    {
+        // Use NECast if supported and initialised
+        m_Cast->run();
+    }
+    else
+    {
+        // Otherwise use the software implementation from Half.hpp
+        auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
         {
             auto input = reinterpret_cast<const Half*>(src);
             auto output = reinterpret_cast<float*>(dst);
@@ -34,9 +68,10 @@
             armnnUtils::FloatingPointConverter::ConvertFloat16To32(input, numElements, output);
         };
 
-    for (const auto& pair : m_TensorHandlePairs)
-    {
-        CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+        for (const auto& pair : m_TensorHandlePairs)
+        {
+            CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+        }
     }
 }
 
diff --git a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp
index c0165ea..c5a2378 100644
--- a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp
+++ b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp
@@ -5,13 +5,18 @@
 
 #pragma once
 
+#include <arm_compute/runtime/NEON/functions/NECast.h>
 #include <armnn/backends/Workload.hpp>
 #include <armnn/backends/WorkloadData.hpp>
+#include <memory>
 #include <neon/workloads/NeonWorkloadUtils.hpp>
 
+
 namespace armnn
 {
 
+arm_compute::Status NeonConvertFp16ToFp32WorkloadValidate(const TensorInfo& input, const TensorInfo& output);
+
 class NeonConvertFp16ToFp32Workload : public Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>
 {
 public:
@@ -26,6 +31,7 @@
     using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
     std::vector<TensorHandlePair> m_TensorHandlePairs;
     virtual void Reconfigure();
+    mutable std::unique_ptr<arm_compute::NECast> m_Cast;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
index 089716a..017ed98 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
@@ -5,6 +5,7 @@
 
 #include "NeonConvertFp32ToFp16Workload.hpp"
 
+#include <arm_compute/runtime/NEON/functions/NECast.h>
 #include <Half.hpp>
 #include <Profiling.hpp>
 
@@ -12,32 +13,67 @@
 
 #include <backendsCommon/WorkloadUtils.hpp>
 
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
 namespace armnn
 {
 
+arm_compute::Status NeonConvertFp32ToFp16WorkloadValidate(const TensorInfo& input, const TensorInfo& output)
+{
+    // Fall back to the portable software implementation if Compute Library's NECast
+    // cannot handle this conversion, so this method always returns success
+
+    armnn::IgnoreUnused(input);
+    armnn::IgnoreUnused(output);
+    return arm_compute::Status();
+}
+
 NeonConvertFp32ToFp16Workload::NeonConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor,
                                                              const WorkloadInfo& info)
     : Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>(descriptor, info)
 {
     this->m_Data.ValidateInputsOutputs("NeonConvertFp32ToFp16Workload", 1, 1);
-    GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+
+    arm_compute::ITensor& input  = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ITensor& output = PolymorphicDowncast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    if (arm_compute::NECast::validate(input.info(), output.info(), g_AclConvertPolicy))
+    {
+        // Use NECast if supported (needs hardware support for FP16)
+        m_Cast.reset(new arm_compute::NECast());
+        m_Cast->configure(&input, &output, g_AclConvertPolicy);
+    }
+    else
+    {
+        // Otherwise use the software implementation from Half.hpp
+        GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+    }
 }
 
 void NeonConvertFp32ToFp16Workload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonConvertFp32ToFp16Workload_Execute", this->GetGuid());
 
-    auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
-        {
-            auto input = reinterpret_cast<const float*>(src);
-            auto output = reinterpret_cast<Half*>(dst);
-            size_t numElements = size/2; // 2 bytes per fp16
-            armnnUtils::FloatingPointConverter::ConvertFloat32To16(input, numElements, output);
-        };
-
-    for (const auto& pair : m_TensorHandlePairs)
+    if (m_Cast)
     {
-        CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+        // Use NECast if supported and initialised
+        m_Cast->run();
+    }
+    else
+    {
+        // Otherwise use the software implementation from Half.hpp
+        auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+            {
+                auto input = reinterpret_cast<const float*>(src);
+                auto output = reinterpret_cast<Half*>(dst);
+                size_t numElements = size/2; // 2 bytes per fp16
+                armnnUtils::FloatingPointConverter::ConvertFloat32To16(input, numElements, output);
+            };
+
+        for (const auto& pair : m_TensorHandlePairs)
+        {
+            CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+        }
     }
 }
 
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp
index 666f487..c6fed76 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp
@@ -5,13 +5,17 @@
 
 #pragma once
 
+#include <arm_compute/runtime/NEON/functions/NECast.h>
 #include <armnn/backends/Workload.hpp>
 #include <armnn/backends/WorkloadData.hpp>
+#include <memory>
 #include <neon/workloads/NeonWorkloadUtils.hpp>
 
 namespace armnn
 {
 
+arm_compute::Status NeonConvertFp32ToFp16WorkloadValidate(const TensorInfo& input, const TensorInfo& output);
+
 class NeonConvertFp32ToFp16Workload : public Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>
 {
 public:
@@ -23,9 +27,10 @@
     // Replace output tensor handle with the given TensorHandle
     void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
+    virtual void Reconfigure();
     using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
     std::vector<TensorHandlePair> m_TensorHandlePairs;
-    virtual void Reconfigure();
+    mutable std::unique_ptr<arm_compute::NECast> m_Cast;
 };
 
 } //namespace armnn