IVGCVSW-1913: Fix for ValidationTest.concat_float_3_relaxed

* Added RefPermuteFloat16Workload to serve as a fallback when CL
  does not support the required permute configuration for FP16
* Moved Half.hpp to armnnUtils, as the utils library should not
  include private header files from the armnn library

Change-Id: Ibf0f698451e8406f7ed7cce470dab60b6d16361d
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d77d91..39d83c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,7 @@
 set(armnnUtils_sources)
 list(APPEND armnnUtils_sources
     src/armnnUtils/GraphTopologicalSort.hpp
+    src/armnnUtils/Half.hpp
     src/armnnUtils/Logging.hpp
     src/armnnUtils/Permute.hpp
     src/armnnUtils/Logging.cpp
@@ -219,7 +220,6 @@
     src/armnn/layers/SplitterLayer.cpp
     src/armnn/layers/SubtractionLayer.cpp
     src/armnn/layers/SubtractionLayer.hpp
-    src/armnn/Half.hpp
     src/armnn/InternalTypes.hpp
     src/armnn/InternalTypes.cpp
     src/armnn/JsonPrinter.hpp
diff --git a/src/armnn/TypeUtils.hpp b/src/armnn/TypeUtils.hpp
index e159d1f..cd19211 100644
--- a/src/armnn/TypeUtils.hpp
+++ b/src/armnn/TypeUtils.hpp
@@ -6,7 +6,7 @@
 #pragma once
 
 #include "armnn/Types.hpp"
-#include "Half.hpp"
+#include "armnnUtils/Half.hpp"
 
 namespace armnn
 {
@@ -37,4 +37,4 @@
 using ResolveType = typename ResolveTypeImpl<DT>::Type;
 
 
-} //namespace armnn
\ No newline at end of file
+} //namespace armnn
diff --git a/src/armnn/optimizations/ConvertConstants.hpp b/src/armnn/optimizations/ConvertConstants.hpp
index 9306a53..d92ea28 100644
--- a/src/armnn/optimizations/ConvertConstants.hpp
+++ b/src/armnn/optimizations/ConvertConstants.hpp
@@ -6,10 +6,12 @@
 #pragma once
 
 #include "Optimization.hpp"
+
 #include <backends/CpuTensorHandle.hpp>
-#include <Half.hpp>
 #include <FloatingPointConverter.hpp>
 
+#include <armnnUtils/Half.hpp>
+
 namespace armnn
 {
 namespace optimizations
diff --git a/src/armnn/test/FP16SupportTest.cpp b/src/armnn/test/FP16SupportTest.cpp
index 6baadc4..2706d1f 100644
--- a/src/armnn/test/FP16SupportTest.cpp
+++ b/src/armnn/test/FP16SupportTest.cpp
@@ -7,6 +7,8 @@
 #include <armnn/Descriptors.hpp>
 #include <armnn/IRuntime.hpp>
 #include <armnn/INetwork.hpp>
+#include <armnnUtils/Half.hpp>
+
 #include <Graph.hpp>
 #include <Optimizer.hpp>
 #include <backends/CpuTensorHandle.hpp>
@@ -15,7 +17,6 @@
 #include <boost/core/ignore_unused.hpp>
 #include <boost/test/unit_test.hpp>
 
-#include <Half.hpp>
 #include <set>
 
 using namespace armnn;
@@ -111,4 +112,4 @@
    BOOST_TEST(outputData == std::vector<Half>({ 101.0_h, 202.0_h, 303.0_h, 404.0_h})); // Add
 }
 
-BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/armnn/test/FloatingPointConverterTest.cpp b/src/armnn/test/FloatingPointConverterTest.cpp
index 3c7c8bd..ec4288c 100644
--- a/src/armnn/test/FloatingPointConverterTest.cpp
+++ b/src/armnn/test/FloatingPointConverterTest.cpp
@@ -4,7 +4,7 @@
 //
 
 #include "FloatingPointConverter.hpp"
-#include "Half.hpp"
+#include <armnnUtils/Half.hpp>
 
 #include <malloc.h>
 #include <iostream>
diff --git a/src/armnnUtils/FloatingPointConverter.cpp b/src/armnnUtils/FloatingPointConverter.cpp
index 522c44b..92409d4 100644
--- a/src/armnnUtils/FloatingPointConverter.cpp
+++ b/src/armnnUtils/FloatingPointConverter.cpp
@@ -4,7 +4,8 @@
 //
 
 #include "FloatingPointConverter.hpp"
-#include "../armnn/Half.hpp"
+
+#include "Half.hpp"
 
 #include <boost/assert.hpp>
 
diff --git a/src/armnn/Half.hpp b/src/armnnUtils/Half.hpp
similarity index 100%
rename from src/armnn/Half.hpp
rename to src/armnnUtils/Half.hpp
diff --git a/src/armnnUtils/Permute.cpp b/src/armnnUtils/Permute.cpp
index 9fe198b..61f4e0e 100644
--- a/src/armnnUtils/Permute.cpp
+++ b/src/armnnUtils/Permute.cpp
@@ -5,6 +5,7 @@
 
 #include "Permute.hpp"
 
+#include "Half.hpp"
 #include <armnn/Tensor.hpp>
 
 #include <cassert>
@@ -109,6 +110,8 @@
 
 // Instantiates for types.
 template void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings,
+                      const armnn::Half* src, armnn::Half* dst);
+template void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings,
                       const float* src, float* dst);
 template void Permute(const armnn::TensorShape& dstShape, const armnn::PermutationVector& mappings,
                       const uint8_t* src, uint8_t* dst);
diff --git a/src/backends/cl/workloads/ClBaseConstantWorkload.cpp b/src/backends/cl/workloads/ClBaseConstantWorkload.cpp
index 2557020..848ab5a 100644
--- a/src/backends/cl/workloads/ClBaseConstantWorkload.cpp
+++ b/src/backends/cl/workloads/ClBaseConstantWorkload.cpp
@@ -4,10 +4,11 @@
 //
 
 #include "ClBaseConstantWorkload.hpp"
+
+#include <armnnUtils/Half.hpp>
 #include <backends/aclCommon/ArmComputeTensorUtils.hpp>
 #include <backends/cl/ClTensorHandle.hpp>
 #include <backends/CpuTensorHandle.hpp>
-#include <Half.hpp>
 
 #include "ClWorkloadUtils.hpp"
 
diff --git a/src/backends/cl/workloads/ClWorkloadUtils.hpp b/src/backends/cl/workloads/ClWorkloadUtils.hpp
index 3a8ff00..af4ccd0 100644
--- a/src/backends/cl/workloads/ClWorkloadUtils.hpp
+++ b/src/backends/cl/workloads/ClWorkloadUtils.hpp
@@ -4,11 +4,11 @@
 //
 #pragma once
 
-#include "OpenClTimer.hpp"
+#include <armnnUtils/Half.hpp>
 #include <backends/aclCommon/ArmComputeTensorUtils.hpp>
 #include <backends/CpuTensorHandle.hpp>
 
-#include <Half.hpp>
+#include "OpenClTimer.hpp"
 
 #define ARMNN_SCOPED_PROFILING_EVENT_CL(name) \
     ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(armnn::Compute::GpuAcc, \
diff --git a/src/backends/neon/workloads/NeonBaseConstantWorkload.hpp b/src/backends/neon/workloads/NeonBaseConstantWorkload.hpp
index 6bb275a..828e476 100644
--- a/src/backends/neon/workloads/NeonBaseConstantWorkload.hpp
+++ b/src/backends/neon/workloads/NeonBaseConstantWorkload.hpp
@@ -6,12 +6,12 @@
 #pragma once
 
 #include <arm_compute/core/Types.h>
+#include <armnnUtils/Half.hpp>
 #include <backends/aclCommon/ArmComputeTensorUtils.hpp>
 #include <backends/neon/NeonTensorHandle.hpp>
 #include <backends/neon/workloads/NeonWorkloadUtils.hpp>
 #include <backends/CpuTensorHandle.hpp>
 #include <backends/Workload.hpp>
-#include <Half.hpp>
 
 #include <boost/cast.hpp>
 
diff --git a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
index 84e341f..c8a3f27 100644
--- a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
@@ -4,9 +4,9 @@
 //
 
 #include "NeonConvertFp16ToFp32Workload.hpp"
-#include <Half.hpp>
 #include <FloatingPointConverter.hpp>
 
+#include <armnnUtils/Half.hpp>
 #include <backends/WorkloadUtils.hpp>
 
 namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
index 261de3d..6bcf6e0 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
@@ -5,9 +5,8 @@
 
 #include "NeonConvertFp32ToFp16Workload.hpp"
 
-#include <Half.hpp>
+#include <armnnUtils/Half.hpp>
 #include <FloatingPointConverter.hpp>
-
 #include <Profiling.hpp>
 #include <backends/WorkloadUtils.hpp>
 
diff --git a/src/backends/neon/workloads/NeonConvolution2dBaseWorkload.cpp b/src/backends/neon/workloads/NeonConvolution2dBaseWorkload.cpp
index 547f563..3b9626d 100644
--- a/src/backends/neon/workloads/NeonConvolution2dBaseWorkload.cpp
+++ b/src/backends/neon/workloads/NeonConvolution2dBaseWorkload.cpp
@@ -10,7 +10,7 @@
 #include "NeonConvolution2dBaseWorkload.hpp"
 
 #include <armnn/Types.hpp>
-#include <Half.hpp>
+#include <armnnUtils/Half.hpp>
 
 namespace armnn
 {
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 582c691..d7d2e27 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -114,7 +114,8 @@
 std::unique_ptr<armnn::IWorkload> RefWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor,
                                                                     const WorkloadInfo&           info) const
 {
-    return MakeWorkload<RefPermuteFloat32Workload, RefPermuteUint8Workload>(descriptor, info);
+    return armnn::MakeWorkload<RefPermuteFloat16Workload, RefPermuteFloat32Workload, RefPermuteUint8Workload>
+        (descriptor, info);
 }
 
 std::unique_ptr<armnn::IWorkload> RefWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.cpp b/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.cpp
index e148bf6..b01246b 100644
--- a/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.cpp
+++ b/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.cpp
@@ -4,10 +4,12 @@
 //
 
 #include "RefConvertFp16ToFp32Workload.hpp"
-#include "Half.hpp"
+
 #include "RefWorkloadUtils.hpp"
 #include "FloatingPointConverter.hpp"
 
+#include <armnnUtils/Half.hpp>
+
 namespace armnn
 {
 
diff --git a/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.cpp b/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.cpp
index efaaf8e..99e3541 100644
--- a/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.cpp
+++ b/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.cpp
@@ -5,12 +5,12 @@
 
 #include "RefConvertFp32ToFp16Workload.hpp"
 
-#include "Half.hpp"
 #include "FloatingPointConverter.hpp"
 #include "RefWorkloadUtils.hpp"
-
 #include "Profiling.hpp"
 
+#include "armnnUtils/Half.hpp"
+
 namespace armnn
 {
 
diff --git a/src/backends/reference/workloads/RefPermuteWorkload.cpp b/src/backends/reference/workloads/RefPermuteWorkload.cpp
index 4093ff3..df50156 100644
--- a/src/backends/reference/workloads/RefPermuteWorkload.cpp
+++ b/src/backends/reference/workloads/RefPermuteWorkload.cpp
@@ -26,6 +26,7 @@
     armnnUtils::Permute(GetTensorInfo(dst).GetShape(), mappings, GetConstCpuData<T>(src), GetCpuData<T>(dst));
 }
 
+template class RefPermuteWorkload<DataType::Float16>;
 template class RefPermuteWorkload<DataType::Float32>;
 template class RefPermuteWorkload<DataType::QuantisedAsymm8>;
 
diff --git a/src/backends/reference/workloads/RefPermuteWorkload.hpp b/src/backends/reference/workloads/RefPermuteWorkload.hpp
index 2cc176d..841a080 100644
--- a/src/backends/reference/workloads/RefPermuteWorkload.hpp
+++ b/src/backends/reference/workloads/RefPermuteWorkload.hpp
@@ -27,6 +27,7 @@
     void Execute() const override;
 };
 
+using RefPermuteFloat16Workload = RefPermuteWorkload<DataType::Float16>;
 using RefPermuteFloat32Workload = RefPermuteWorkload<DataType::Float32>;
 using RefPermuteUint8Workload   = RefPermuteWorkload<DataType::QuantisedAsymm8>;
 
diff --git a/src/backends/reference/workloads/RefWorkloadUtils.hpp b/src/backends/reference/workloads/RefWorkloadUtils.hpp
index 153c519..67a1f5e 100644
--- a/src/backends/reference/workloads/RefWorkloadUtils.hpp
+++ b/src/backends/reference/workloads/RefWorkloadUtils.hpp
@@ -9,7 +9,7 @@
 
 #include <armnn/Tensor.hpp>
 #include <armnn/Types.hpp>
-#include <Half.hpp>
+#include <armnnUtils/Half.hpp>
 
 #include <boost/polymorphic_cast.hpp>
 
diff --git a/src/backends/test/ConvertFp16ToFp32TestImpl.hpp b/src/backends/test/ConvertFp16ToFp32TestImpl.hpp
index b75879d..483689d 100644
--- a/src/backends/test/ConvertFp16ToFp32TestImpl.hpp
+++ b/src/backends/test/ConvertFp16ToFp32TestImpl.hpp
@@ -8,13 +8,13 @@
 #include <armnn/ArmNN.hpp>
 #include <armnn/Tensor.hpp>
 #include <armnn/TypesUtils.hpp>
+#include <armnnUtils/Half.hpp>
 
 #include <backends/WorkloadInfo.hpp>
 #include <backends/CpuTensorHandle.hpp>
 
 #include <test/TensorHelpers.hpp>
 
-#include <Half.hpp>
 
 LayerTestResult<float, 4> SimpleConvertFp16ToFp32Test(armnn::IWorkloadFactory& workloadFactory)
 {
diff --git a/src/backends/test/ConvertFp32ToFp16TestImpl.hpp b/src/backends/test/ConvertFp32ToFp16TestImpl.hpp
index 1325b4b..e4698a9 100644
--- a/src/backends/test/ConvertFp32ToFp16TestImpl.hpp
+++ b/src/backends/test/ConvertFp32ToFp16TestImpl.hpp
@@ -8,13 +8,13 @@
 #include <armnn/ArmNN.hpp>
 #include <armnn/Tensor.hpp>
 #include <armnn/TypesUtils.hpp>
+#include <armnnUtils/Half.hpp>
 
 #include <backends/WorkloadInfo.hpp>
 #include <backends/CpuTensorHandle.hpp>
 
 #include <test/TensorHelpers.hpp>
 
-#include <Half.hpp>
 
 LayerTestResult<armnn::Half, 4> SimpleConvertFp32ToFp16Test(armnn::IWorkloadFactory& workloadFactory)
 {
@@ -52,4 +52,4 @@
     CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
 
     return ret;
-}
\ No newline at end of file
+}
diff --git a/src/backends/test/LayerTests.hpp b/src/backends/test/LayerTests.hpp
index 9dc3afa..8939903 100644
--- a/src/backends/test/LayerTests.hpp
+++ b/src/backends/test/LayerTests.hpp
@@ -6,7 +6,7 @@
 
 #include "armnn/ArmNN.hpp"
 #include "armnn/Tensor.hpp"
-#include "Half.hpp"
+#include "armnnUtils/Half.hpp"
 
 #include <boost/multi_array.hpp>
 #include <boost/assert.hpp>
diff --git a/src/backends/test/TensorCopyUtils.cpp b/src/backends/test/TensorCopyUtils.cpp
index e92469a..7e17e8b 100644
--- a/src/backends/test/TensorCopyUtils.cpp
+++ b/src/backends/test/TensorCopyUtils.cpp
@@ -3,13 +3,11 @@
 // SPDX-License-Identifier: MIT
 //
 
-#include <algorithm>
-#include <cstring>
-#include <boost/cast.hpp>
-#include <Half.hpp>
-
 #include "TensorCopyUtils.hpp"
 
+#include <armnnUtils/Half.hpp>
+
+
 #ifdef ARMCOMPUTECL_ENABLED
 #include <backends/cl/ClTensorHandle.hpp>
 #endif
@@ -24,6 +22,10 @@
 
 #include <backends/CpuTensorHandle.hpp>
 
+#include <boost/cast.hpp>
+#include <algorithm>
+#include <cstring>
+
 void CopyDataToITensorHandle(armnn::ITensorHandle* tensorHandle, const void* mem)
 {
     switch (tensorHandle->GetType())