Refactor: Don't include all ComputeLibrary function definitions everywhere.

Include only the function definition each workload specifically needs,
and tighten up the scope in which Compute Library functions are available.

Knocks about 30 seconds off a 4m30s single-threaded compile of the Neon workloads.
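
The pattern applied to each workload, as a rough sketch (NeonFooWorkload,
FooQueueDescriptor and NEFooLayer are placeholder names, not real types):

    // Header: hold the layer behind arm_compute::IFunction so the concrete
    // Compute Library type never appears in the .hpp.
    #include <backendsCommon/Workload.hpp>
    #include <arm_compute/runtime/IFunction.h>
    #include <memory>

    class NeonFooWorkload : public BaseWorkload<FooQueueDescriptor>
    {
    public:
        void Execute() const override { m_Layer->run(); }
    private:
        std::unique_ptr<arm_compute::IFunction> m_Layer;
    };

    // Source file (constructor body): include only the function header this
    // workload needs and construct the concrete layer there.
    #include <arm_compute/runtime/NEON/functions/NEFooLayer.h>

    auto layer = std::make_unique<arm_compute::NEFooLayer>();
    layer->configure(&input, &output);
    m_Layer.reset(layer.release());

Because run() is now called through a pointer, the member no longer needs to
be mutable for the const Execute() method.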

Change-Id: Idac438f3bc77ff978295fbc9505cb42447def145
diff --git a/src/backends/neon/workloads/NeonActivationWorkload.cpp b/src/backends/neon/workloads/NeonActivationWorkload.cpp
index 6e95678..c75a138 100644
--- a/src/backends/neon/workloads/NeonActivationWorkload.cpp
+++ b/src/backends/neon/workloads/NeonActivationWorkload.cpp
@@ -4,8 +4,11 @@
 //
 
 #include "NeonActivationWorkload.hpp"
+#include "NeonWorkloadUtils.hpp"
 #include <aclCommon/ArmComputeUtils.hpp>
 
+#include <arm_compute/runtime/NEON/functions/NEActivationLayer.h>
+
 namespace armnn
 {
 
@@ -43,13 +46,16 @@
     arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_ActivationLayer.configure(&input, &output, activationLayerInfo);
+    auto layer = std::make_unique<arm_compute::NEActivationLayer>();
+    layer->configure(&input, &output, activationLayerInfo);
+
+    m_ActivationLayer.reset(layer.release());
 }
 
 void NeonActivationWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonActivationWorkload_Execute");
-    m_ActivationLayer.run();
+    m_ActivationLayer->run();
 }
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonActivationWorkload.hpp b/src/backends/neon/workloads/NeonActivationWorkload.hpp
index fc7c646..eefbfb6 100644
--- a/src/backends/neon/workloads/NeonActivationWorkload.hpp
+++ b/src/backends/neon/workloads/NeonActivationWorkload.hpp
@@ -5,7 +5,10 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/runtime/IFunction.h>
 
 namespace armnn
 {
@@ -21,7 +24,7 @@
     void Execute() const override;
 
 private:
-    mutable arm_compute::NEActivationLayer m_ActivationLayer;
+    std::unique_ptr<arm_compute::IFunction> m_ActivationLayer;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonAdditionWorkload.cpp b/src/backends/neon/workloads/NeonAdditionWorkload.cpp
index 70a3909..fa53781 100644
--- a/src/backends/neon/workloads/NeonAdditionWorkload.cpp
+++ b/src/backends/neon/workloads/NeonAdditionWorkload.cpp
@@ -4,9 +4,13 @@
 //
 
 #include "NeonAdditionWorkload.hpp"
+#include "NeonWorkloadUtils.hpp"
+
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
 
+#include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h>
+
 namespace armnn
 {
 
@@ -35,13 +39,15 @@
     arm_compute::ITensor& input2 = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
     arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_AddLayer.configure(&input1, &input2, &output, arm_compute::ConvertPolicy::SATURATE);
+    auto layer = std::make_unique<arm_compute::NEArithmeticAddition>();
+    layer->configure(&input1, &input2, &output, arm_compute::ConvertPolicy::SATURATE);
+    m_AddLayer.reset(layer.release());
 }
 
 void NeonAdditionWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonAdditionWorkload_Execute");
-    m_AddLayer.run();
+    m_AddLayer->run();
 }
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonAdditionWorkload.hpp b/src/backends/neon/workloads/NeonAdditionWorkload.hpp
index ca8ae8d7b..826fb1f 100644
--- a/src/backends/neon/workloads/NeonAdditionWorkload.hpp
+++ b/src/backends/neon/workloads/NeonAdditionWorkload.hpp
@@ -5,7 +5,10 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/runtime/IFunction.h>
 
 namespace armnn
 {
@@ -21,7 +24,7 @@
     virtual void Execute() const override;
 
 private:
-    mutable arm_compute::NEArithmeticAddition m_AddLayer;
+    std::unique_ptr<arm_compute::IFunction> m_AddLayer;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp b/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp
index 44d5035..fc80f41 100644
--- a/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp
+++ b/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp
@@ -4,9 +4,13 @@
 //
 
 #include "NeonBatchNormalizationWorkload.hpp"
+
+#include "NeonWorkloadUtils.hpp"
+
 #include <backendsCommon/CpuTensorHandle.hpp>
 #include <aclCommon/ArmComputeTensorUtils.hpp>
-#include <armnn/ArmNN.hpp>
+
+#include <arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h>
 
 namespace armnn
 {
@@ -68,13 +72,15 @@
     m_Beta = std::make_unique<arm_compute::Tensor>();
     BuildArmComputeTensor(*m_Beta, m_Data.m_Beta->GetTensorInfo());
 
-    m_Layer.configure(&input,
-                      &output,
-                      m_Mean.get(),
-                      m_Variance.get(),
-                      m_Beta.get(),
-                      m_Gamma.get(),
-                      m_Data.m_Parameters.m_Eps);
+    auto layer = std::make_unique<arm_compute::NEBatchNormalizationLayer>();
+    layer->configure(&input,
+                     &output,
+                     m_Mean.get(),
+                     m_Variance.get(),
+                     m_Beta.get(),
+                     m_Gamma.get(),
+                     m_Data.m_Parameters.m_Eps);
+    m_Layer.reset(layer.release());
 
     InitializeArmComputeTensorData(*m_Mean, m_Data.m_Mean);
     InitializeArmComputeTensorData(*m_Variance, m_Data.m_Variance);
@@ -83,14 +89,14 @@
 
     // Force Compute Library to perform the necessary copying and reshaping, after which
     // delete all the input tensors that will no longer be needed
-    m_Layer.prepare();
+    m_Layer->prepare();
     FreeUnusedTensors();
 }
 
 void NeonBatchNormalizationWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonBatchNormalizationWorkload_Execute");
-    m_Layer.run();
+    m_Layer->run();
 }
 
 void NeonBatchNormalizationWorkload::FreeUnusedTensors()
diff --git a/src/backends/neon/workloads/NeonBatchNormalizationWorkload.hpp b/src/backends/neon/workloads/NeonBatchNormalizationWorkload.hpp
index 52e4db7..3619ea0 100644
--- a/src/backends/neon/workloads/NeonBatchNormalizationWorkload.hpp
+++ b/src/backends/neon/workloads/NeonBatchNormalizationWorkload.hpp
@@ -5,7 +5,12 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/runtime/IFunction.h>
+#include <arm_compute/runtime/Tensor.h>
+
+#include <memory>
 
 namespace armnn
 {
@@ -26,7 +31,7 @@
     virtual void Execute() const override;
 
 private:
-    mutable arm_compute::NEBatchNormalizationLayer m_Layer;
+    std::unique_ptr<arm_compute::IFunction> m_Layer;
 
     std::unique_ptr<arm_compute::Tensor> m_Mean;
     std::unique_ptr<arm_compute::Tensor> m_Variance;
diff --git a/src/backends/neon/workloads/NeonConvolution2dWorkload.cpp b/src/backends/neon/workloads/NeonConvolution2dWorkload.cpp
index 151132f..1080f32 100644
--- a/src/backends/neon/workloads/NeonConvolution2dWorkload.cpp
+++ b/src/backends/neon/workloads/NeonConvolution2dWorkload.cpp
@@ -7,7 +7,9 @@
 
 #include <backendsCommon/CpuTensorHandle.hpp>
 #include <aclCommon/ArmComputeTensorUtils.hpp>
-#include <neon/NeonLayerSupport.hpp>
+#include <neon/workloads/NeonWorkloadUtils.hpp>
+
+#include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
 
 #include <armnn/Types.hpp>
 #include <Half.hpp>
diff --git a/src/backends/neon/workloads/NeonConvolution2dWorkload.hpp b/src/backends/neon/workloads/NeonConvolution2dWorkload.hpp
index daf9a43..3fb408d 100644
--- a/src/backends/neon/workloads/NeonConvolution2dWorkload.hpp
+++ b/src/backends/neon/workloads/NeonConvolution2dWorkload.hpp
@@ -5,12 +5,10 @@
 
 #pragma once
 
-#include <aclCommon/ArmComputeTensorUtils.hpp>
-#include <backendsCommon/CpuTensorHandle.hpp>
-#include <neon/NeonLayerSupport.hpp>
-#include <neon/workloads/NeonWorkloadUtils.hpp>
 #include <backendsCommon/Workload.hpp>
 
+#include <arm_compute/runtime/IFunction.h>
+#include <arm_compute/runtime/Tensor.h>
 #include <arm_compute/runtime/MemoryManagerOnDemand.h>
 
 #include <memory>
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
index be26359..c915555 100644
--- a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
@@ -5,11 +5,18 @@
 
 #include "NeonDepthwiseConvolutionWorkload.hpp"
 
+#include "NeonWorkloadUtils.hpp"
+
+#include <DataLayoutIndexed.hpp>
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 #include <neon/NeonLayerSupport.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
 #include <backendsCommon/WorkloadUtils.hpp>
 
+#include <arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h>
+
+using namespace armnnUtils;
+
 namespace armnn
 {
 
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.hpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.hpp
index b5f2ae9..85932d3 100644
--- a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.hpp
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.hpp
@@ -5,7 +5,12 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/runtime/IFunction.h>
+#include <arm_compute/runtime/Tensor.h>
+
+#include <memory>
 
 namespace armnn
 {
diff --git a/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp b/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
index a08ba8a..f024fef 100644
--- a/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
@@ -5,6 +5,12 @@
 
 #include "NeonFloorFloatWorkload.hpp"
 
+#include "NeonWorkloadUtils.hpp"
+
+#include <arm_compute/runtime/NEON/functions/NEFloor.h>
+
+#include <boost/polymorphic_cast.hpp>
+
 namespace armnn
 {
 NeonFloorFloatWorkload::NeonFloorFloatWorkload(const FloorQueueDescriptor& descriptor,
@@ -16,13 +22,15 @@
     arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(&input, &output);
+    auto layer = std::make_unique<arm_compute::NEFloor>();
+    layer->configure(&input, &output);
+    m_Layer.reset(layer.release());
 }
 
 void NeonFloorFloatWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFloorFloatWorkload_Execute");
-    m_Layer.run();
+    m_Layer->run();
 }
 } //namespace armnn
 
diff --git a/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp b/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp
index a4ce476..01b86a6 100644
--- a/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp
@@ -5,7 +5,12 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/runtime/IFunction.h>
+#include <arm_compute/runtime/Tensor.h>
+
+#include <memory>
 
 namespace armnn
 {
@@ -17,7 +22,7 @@
     virtual void Execute() const override;
 
 private:
-    mutable arm_compute::NEFloor m_Layer;
+    std::unique_ptr<arm_compute::IFunction> m_Layer;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp b/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp
index e432a6b..7395270 100644
--- a/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp
+++ b/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp
@@ -5,10 +5,13 @@
 
 #include "NeonFullyConnectedWorkload.hpp"
 
+#include "NeonWorkloadUtils.hpp"
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 #include <aclCommon/ArmComputeUtils.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
 
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+
 namespace armnn
 {
 using namespace armcomputetensorutils;
@@ -45,7 +48,6 @@
 NeonFullyConnectedWorkload::NeonFullyConnectedWorkload(const FullyConnectedQueueDescriptor& descriptor,
     const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
     : BaseWorkload<FullyConnectedQueueDescriptor>(descriptor, info)
-    , m_FullyConnectedLayer(memoryManager)
 {
     m_Data.ValidateInputsOutputs("NeonFullyConnectedWorkload", 1, 1);
 
@@ -64,7 +66,10 @@
     // Construct
     arm_compute::FullyConnectedLayerInfo fc_info;
     fc_info.transpose_weights = m_Data.m_Parameters.m_TransposeWeightMatrix;
-    m_FullyConnectedLayer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info);
+
+    auto layer = std::make_unique<arm_compute::NEFullyConnectedLayer>(memoryManager);
+    layer->configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info);
+    m_FullyConnectedLayer.reset(layer.release());
 
     // Allocate
     if (m_Data.m_Weight->GetTensorInfo().GetDataType() == DataType::QuantisedAsymm8)
@@ -90,14 +95,14 @@
 
     // Force Compute Library to perform the necessary copying and reshaping, after which
     // delete all the input tensors that will no longer be needed
-    m_FullyConnectedLayer.prepare();
+    m_FullyConnectedLayer->prepare();
     FreeUnusedTensors();
 }
 
 void NeonFullyConnectedWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFullyConnectedWorkload_Execute");
-    m_FullyConnectedLayer.run();
+    m_FullyConnectedLayer->run();
 }
 
 void NeonFullyConnectedWorkload::FreeUnusedTensors()
diff --git a/src/backends/neon/workloads/NeonFullyConnectedWorkload.hpp b/src/backends/neon/workloads/NeonFullyConnectedWorkload.hpp
index ec1661d..1cd8be1 100644
--- a/src/backends/neon/workloads/NeonFullyConnectedWorkload.hpp
+++ b/src/backends/neon/workloads/NeonFullyConnectedWorkload.hpp
@@ -5,9 +5,12 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
 
+#include <arm_compute/core/Error.h>
+#include <arm_compute/runtime/IFunction.h>
 #include <arm_compute/runtime/MemoryManagerOnDemand.h>
+#include <arm_compute/runtime/Tensor.h>
 
 #include <memory>
 
@@ -28,7 +31,7 @@
     virtual void Execute() const override;
 
 private:
-    mutable arm_compute::NEFullyConnectedLayer m_FullyConnectedLayer;
+    std::unique_ptr<arm_compute::IFunction> m_FullyConnectedLayer;
 
     std::unique_ptr<arm_compute::Tensor> m_WeightsTensor;
     std::unique_ptr<arm_compute::Tensor> m_BiasesTensor;
diff --git a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
index afaa700..99bbcfa 100644
--- a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
@@ -4,8 +4,13 @@
 //
 
 #include "NeonL2NormalizationFloatWorkload.hpp"
+
+#include "NeonWorkloadUtils.hpp"
+
 #include <aclCommon/ArmComputeUtils.hpp>
 
+#include <arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h>
+
 namespace armnn
 {
 using namespace armcomputetensorutils;
@@ -25,7 +30,6 @@
 NeonL2NormalizationFloatWorkload::NeonL2NormalizationFloatWorkload(const L2NormalizationQueueDescriptor& descriptor,
     const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
     : FloatWorkload<L2NormalizationQueueDescriptor>(descriptor, info)
-    , m_Layer(memoryManager)
 {
     m_Data.ValidateInputsOutputs("NeonL2NormalizationFloatWorkload", 1, 1);
 
@@ -38,13 +42,15 @@
 
     unsigned int axis = (m_Data.m_Parameters.m_DataLayout == DataLayout::NCHW) ? 2 : 0;
 
-    m_Layer.configure(&input, &output, axis);
+    auto layer = std::make_unique<arm_compute::NEL2NormalizeLayer>(memoryManager);
+    layer->configure(&input, &output, axis);
+    m_Layer.reset(layer.release());
 }
 
 void NeonL2NormalizationFloatWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonL2NormalizationFloatWorkload_Execute");
-    m_Layer.run();
+    m_Layer->run();
 }
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp
index 30058c5..2a8eb38 100644
--- a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp
@@ -5,7 +5,10 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/runtime/IFunction.h>
 #include <arm_compute/runtime/MemoryManagerOnDemand.h>
 
 #include <memory>
@@ -25,7 +28,7 @@
     virtual void Execute() const override;
 
 private:
-    mutable arm_compute::NEL2NormalizeLayer m_Layer;
+    std::unique_ptr<arm_compute::IFunction> m_Layer;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonMergerWorkload.cpp b/src/backends/neon/workloads/NeonMergerWorkload.cpp
index f82e244..be096b4 100644
--- a/src/backends/neon/workloads/NeonMergerWorkload.cpp
+++ b/src/backends/neon/workloads/NeonMergerWorkload.cpp
@@ -4,11 +4,14 @@
 //
 
 #include "NeonMergerWorkload.hpp"
-#include <armnn/ArmNN.hpp>
+
+#include "NeonWorkloadUtils.hpp"
+
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
 #include <neon/NeonTensorHandle.hpp>
 
+#include <arm_compute/runtime/NEON/functions/NEConcatenateLayer.h>
 
 namespace armnn
 {
@@ -66,9 +69,11 @@
 
     arm_compute::DataLayoutDimension aclAxis = arm_compute::DataLayoutDimension::WIDTH;
 
-    m_Layer.configure(aclInputs, &output, aclAxis);
+    auto layer = std::make_unique<arm_compute::NEConcatenateLayer>();
+    layer->configure(aclInputs, &output, aclAxis);
+    m_Layer.reset(layer.release());
 
-    m_Layer.prepare();
+    m_Layer->prepare();
 }
 
 void NeonMergerWorkload::Execute() const
@@ -76,7 +81,7 @@
     if (m_Execute)
     {
         ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMergerWorkload_Execute");
-        m_Layer.run();
+        m_Layer->run();
     }
 }
 
diff --git a/src/backends/neon/workloads/NeonMergerWorkload.hpp b/src/backends/neon/workloads/NeonMergerWorkload.hpp
index a4f36d1..3432c62 100644
--- a/src/backends/neon/workloads/NeonMergerWorkload.hpp
+++ b/src/backends/neon/workloads/NeonMergerWorkload.hpp
@@ -6,7 +6,11 @@
 #pragma once
 
 #include <backendsCommon/Workload.hpp>
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/runtime/IFunction.h>
+
+#include <memory>
 
 namespace armnn
 {
@@ -23,7 +27,7 @@
     void Execute() const override;
 
 private:
-    mutable arm_compute::NEConcatenateLayer m_Layer;
+    std::unique_ptr<arm_compute::IFunction> m_Layer;
     bool m_Execute;
 
 };
diff --git a/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.cpp b/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.cpp
index c4241ec..778e782 100644
--- a/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.cpp
@@ -5,6 +5,9 @@
 
 #include "NeonMultiplicationFloatWorkload.hpp"
 
+#include "NeonWorkloadUtils.hpp"
+
+#include <arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h>
 
 namespace armnn
 {
@@ -41,18 +44,20 @@
     // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied to it,
     // when providing a scale of 1.0 for F32 tensors, even though the provided rounding policy appears to be
     // ignored for F32 tensors.
-    m_PixelWiseMultiplication.configure(&input1,
-                                        &input2,
-                                        &output,
-                                        1.0f,
-                                        arm_compute::ConvertPolicy::SATURATE,
-                                        arm_compute::RoundingPolicy::TO_ZERO);
+    auto layer = std::make_unique<arm_compute::NEPixelWiseMultiplication>();
+    layer->configure(&input1,
+                     &input2,
+                     &output,
+                     1.0f,
+                     arm_compute::ConvertPolicy::SATURATE,
+                     arm_compute::RoundingPolicy::TO_ZERO);
+    m_PixelWiseMultiplication.reset(layer.release());
 }
 
 void NeonMultiplicationFloatWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMultiplicationFloatWorkload_Execute");
-    m_PixelWiseMultiplication.run();
+    m_PixelWiseMultiplication->run();
 }
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.hpp b/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.hpp
index 8fa3171..a65ad4e 100644
--- a/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonMultiplicationFloatWorkload.hpp
@@ -5,7 +5,12 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/runtime/IFunction.h>
+
+#include <memory>
 
 namespace armnn
 {
@@ -20,7 +25,7 @@
     virtual void Execute() const override;
 
 private:
-    mutable arm_compute::NEPixelWiseMultiplication m_PixelWiseMultiplication;
+    std::unique_ptr<arm_compute::IFunction> m_PixelWiseMultiplication;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
index 854ecd3..92c0396 100644
--- a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
@@ -4,10 +4,13 @@
 //
 
 #include "NeonNormalizationFloatWorkload.hpp"
-#include <neon/NeonLayerSupport.hpp>
+
+#include "NeonWorkloadUtils.hpp"
 #include <aclCommon/ArmComputeUtils.hpp>
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 
+#include <arm_compute/runtime/NEON/functions/NENormalizationLayer.h>
+
 using namespace armnn::armcomputetensorutils;
 
 namespace armnn
@@ -57,7 +60,6 @@
                                                    const WorkloadInfo& info,
                                                    std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
     : FloatWorkload<NormalizationQueueDescriptor>(descriptor, info)
-    , m_NormalizationLayer(memoryManager)
 {
     m_Data.ValidateInputsOutputs("NeonNormalizationFloatWorkload", 1, 1);
     std::string reasonIfUnsupported;
@@ -89,14 +91,15 @@
                                                           m_Data.m_Parameters.m_Beta,
                                                           m_Data.m_Parameters.m_K,
                                                           false);
-
-    m_NormalizationLayer.configure(&input, &output, normalizationInfo);
+    auto layer = std::make_unique<arm_compute::NENormalizationLayer>(memoryManager);
+    layer->configure(&input, &output, normalizationInfo);
+    m_NormalizationLayer.reset(layer.release());
 }
 
 void NeonNormalizationFloatWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonNormalizationFloatWorkload_Execute");
-    m_NormalizationLayer.run();
+    m_NormalizationLayer->run();
 }
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp
index 89eba57..17bbeb4 100644
--- a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp
@@ -5,9 +5,14 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/runtime/IFunction.h>
 #include <arm_compute/runtime/MemoryManagerOnDemand.h>
 
+#include <memory>
+
 namespace armnn
 {
 
@@ -23,7 +28,7 @@
     virtual void Execute() const override;
 
 private:
-    mutable arm_compute::NENormalizationLayer m_NormalizationLayer;
+    std::unique_ptr<arm_compute::IFunction> m_NormalizationLayer;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonPooling2dWorkload.cpp b/src/backends/neon/workloads/NeonPooling2dWorkload.cpp
index 9c8f71a..75bceb1 100644
--- a/src/backends/neon/workloads/NeonPooling2dWorkload.cpp
+++ b/src/backends/neon/workloads/NeonPooling2dWorkload.cpp
@@ -4,11 +4,15 @@
 //
 
 #include "NeonPooling2dWorkload.hpp"
-#include <neon/NeonLayerSupport.hpp>
+
+#include "NeonWorkloadUtils.hpp"
+
 #include <neon/NeonTensorHandle.hpp>
 #include <aclCommon/ArmComputeUtils.hpp>
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 
+#include <arm_compute/runtime/NEON/functions/NEPoolingLayer.h>
+
 namespace armnn
 {
 using namespace armcomputetensorutils;
@@ -42,13 +46,15 @@
 
     arm_compute::PoolingLayerInfo layerInfo = BuildArmComputePoolingLayerInfo(m_Data.m_Parameters);
 
-    m_PoolingLayer.configure(&input, &output, layerInfo);
+    auto layer = std::make_unique<arm_compute::NEPoolingLayer>();
+    layer->configure(&input, &output, layerInfo);
+    m_PoolingLayer.reset(layer.release());
 }
 
 void NeonPooling2dWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonPooling2dWorkload_Execute");
-    m_PoolingLayer.run();
+    m_PoolingLayer->run();
 }
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonPooling2dWorkload.hpp b/src/backends/neon/workloads/NeonPooling2dWorkload.hpp
index b2379f7..b0e3aa8 100644
--- a/src/backends/neon/workloads/NeonPooling2dWorkload.hpp
+++ b/src/backends/neon/workloads/NeonPooling2dWorkload.hpp
@@ -5,7 +5,12 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/runtime/IFunction.h>
+
+#include <memory>
 
 namespace armnn
 {
@@ -24,7 +29,7 @@
     void Execute() const override;
 
 private:
-    mutable arm_compute::NEPoolingLayer m_PoolingLayer;
+    std::unique_ptr<arm_compute::IFunction> m_PoolingLayer;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonReshapeWorkload.cpp b/src/backends/neon/workloads/NeonReshapeWorkload.cpp
index c2dcdd5..40fbef6 100644
--- a/src/backends/neon/workloads/NeonReshapeWorkload.cpp
+++ b/src/backends/neon/workloads/NeonReshapeWorkload.cpp
@@ -5,6 +5,12 @@
 
 #include "NeonReshapeWorkload.hpp"
 
+#include "NeonWorkloadUtils.hpp"
+
+#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>
+
+#include <boost/polymorphic_cast.hpp>
+
 namespace armnn
 {
 
@@ -17,13 +23,15 @@
     arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(&input, &output);
+    auto layer = std::make_unique<arm_compute::NEReshapeLayer>();
+    layer->configure(&input, &output);
+    m_Layer.reset(layer.release());
 }
 
 void NeonReshapeWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonReshapeWorkload_Execute");
-    m_Layer.run();
+    m_Layer->run();
 }
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonReshapeWorkload.hpp b/src/backends/neon/workloads/NeonReshapeWorkload.hpp
index 38b6c51..2202463 100644
--- a/src/backends/neon/workloads/NeonReshapeWorkload.hpp
+++ b/src/backends/neon/workloads/NeonReshapeWorkload.hpp
@@ -5,7 +5,11 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/runtime/IFunction.h>
+
+#include <memory>
 
 namespace armnn
 {
@@ -18,7 +22,7 @@
     virtual void Execute() const override;
 
 private:
-    mutable arm_compute::NEReshapeLayer m_Layer;
+    std::unique_ptr<arm_compute::IFunction> m_Layer;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.cpp b/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.cpp
index 434de87..b229bc4 100644
--- a/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.cpp
+++ b/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.cpp
@@ -7,6 +7,8 @@
 
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 
+#include <arm_compute/runtime/NEON/functions/NESoftmaxLayer.h>
+
 namespace armnn
 {
 
diff --git a/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.hpp b/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.hpp
index 6e96c2d..6eecb97 100644
--- a/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.hpp
+++ b/src/backends/neon/workloads/NeonSoftmaxBaseWorkload.hpp
@@ -5,7 +5,8 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <armnn/Descriptors.hpp>
+#include <arm_compute/core/Error.h>
 
 namespace armnn
 {
diff --git a/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.cpp b/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.cpp
index 92e5139..d9c78bb 100644
--- a/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.cpp
@@ -5,13 +5,16 @@
 
 #include "NeonSoftmaxFloatWorkload.hpp"
 
+#include "NeonWorkloadUtils.hpp"
+
+#include <arm_compute/runtime/NEON/functions/NESoftmaxLayer.h>
+
 namespace armnn
 {
 
 NeonSoftmaxFloatWorkload::NeonSoftmaxFloatWorkload(const SoftmaxQueueDescriptor& descriptor,
     const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
     : FloatWorkload<SoftmaxQueueDescriptor>(descriptor, info)
-    , m_SoftmaxLayer(memoryManager)
 {
     m_Data.ValidateInputsOutputs("NeonSoftmaxFloatWorkload", 1, 1);
 
@@ -19,13 +22,15 @@
     arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_SoftmaxLayer.configure(&input, &output, m_Data.m_Parameters.m_Beta);
+    auto layer = std::make_unique<arm_compute::NESoftmaxLayer>(memoryManager);
+    layer->configure(&input, &output, m_Data.m_Parameters.m_Beta);
+    m_SoftmaxLayer.reset(layer.release());
 }
 
 void NeonSoftmaxFloatWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSoftmaxFloatWorkload_Execute");
-    m_SoftmaxLayer.run();
+    m_SoftmaxLayer->run();
 }
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.hpp b/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.hpp
index 9c11b27..77f2cc3 100644
--- a/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonSoftmaxFloatWorkload.hpp
@@ -5,7 +5,9 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/runtime/IFunction.h>
 #include <arm_compute/runtime/MemoryManagerOnDemand.h>
 
 #include <memory>
@@ -21,7 +23,7 @@
     virtual void Execute() const override;
 
 private:
-    mutable arm_compute::NESoftmaxLayer m_SoftmaxLayer;
+    std::unique_ptr<arm_compute::IFunction> m_SoftmaxLayer;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonSoftmaxUint8Workload.cpp b/src/backends/neon/workloads/NeonSoftmaxUint8Workload.cpp
index cff869c..f780589 100644
--- a/src/backends/neon/workloads/NeonSoftmaxUint8Workload.cpp
+++ b/src/backends/neon/workloads/NeonSoftmaxUint8Workload.cpp
@@ -5,6 +5,10 @@
 
 #include "NeonSoftmaxUint8Workload.hpp"
 
+#include "NeonWorkloadUtils.hpp"
+
+#include <arm_compute/runtime/NEON/functions/NESoftmaxLayer.h>
+
 namespace armnn
 {
 
@@ -12,7 +16,6 @@
                                                    const WorkloadInfo& info,
                                                    std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
     : Uint8Workload<SoftmaxQueueDescriptor>(descriptor, info)
-    , m_SoftmaxLayer(memoryManager)
 {
     m_Data.ValidateInputsOutputs("NeonSoftmaxUint8Workload", 1, 1);
 
@@ -27,14 +30,16 @@
             "Invalid quantization for output. Only scale = 1.0f / 256.0f and offset = 0 supported");
     }
 
-    m_SoftmaxLayer.configure(&input, &output, descriptor.m_Parameters.m_Beta);
+    auto layer = std::make_unique<arm_compute::NESoftmaxLayer>(memoryManager);
+    layer->configure(&input, &output, descriptor.m_Parameters.m_Beta);
+    m_SoftmaxLayer.reset(layer.release());
 }
 
 void NeonSoftmaxUint8Workload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSoftmaxUint8Workload_Execute");
 
-    m_SoftmaxLayer.run();
+    m_SoftmaxLayer->run();
 }
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonSoftmaxUint8Workload.hpp b/src/backends/neon/workloads/NeonSoftmaxUint8Workload.hpp
index b3bcbf3..c569208 100644
--- a/src/backends/neon/workloads/NeonSoftmaxUint8Workload.hpp
+++ b/src/backends/neon/workloads/NeonSoftmaxUint8Workload.hpp
@@ -5,9 +5,13 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/runtime/IFunction.h>
 #include <arm_compute/runtime/MemoryManagerOnDemand.h>
 
+#include <memory>
+
 namespace armnn
 {
 
@@ -19,7 +23,7 @@
     virtual void Execute() const override;
 
 private:
-    mutable arm_compute::NESoftmaxLayer m_SoftmaxLayer;
+    std::unique_ptr<arm_compute::IFunction> m_SoftmaxLayer;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonSubtractionFloatWorkload.cpp b/src/backends/neon/workloads/NeonSubtractionFloatWorkload.cpp
index 1eae0a4..e39f8aa 100644
--- a/src/backends/neon/workloads/NeonSubtractionFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonSubtractionFloatWorkload.cpp
@@ -4,9 +4,13 @@
 //
 
 #include "NeonSubtractionFloatWorkload.hpp"
+
+#include "NeonWorkloadUtils.hpp"
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
 
+#include <arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h>
+
 namespace armnn
 {
 
@@ -34,13 +38,15 @@
     arm_compute::ITensor& input2 = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
     arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_SubLayer.configure(&input1, &input2, &output, arm_compute::ConvertPolicy::SATURATE);
+    auto layer = std::make_unique<arm_compute::NEArithmeticSubtraction>();
+    layer->configure(&input1, &input2, &output, arm_compute::ConvertPolicy::SATURATE);
+    m_SubLayer.reset(layer.release());
 }
 
 void NeonSubtractionFloatWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSubtractionFloatWorkload_Execute");
-    m_SubLayer.run();
+    m_SubLayer->run();
 }
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonSubtractionFloatWorkload.hpp b/src/backends/neon/workloads/NeonSubtractionFloatWorkload.hpp
index 0901699..5dce112 100644
--- a/src/backends/neon/workloads/NeonSubtractionFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonSubtractionFloatWorkload.hpp
@@ -5,7 +5,12 @@
 
 #pragma once
 
-#include <neon/workloads/NeonWorkloadUtils.hpp>
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/runtime/IFunction.h>
+
+#include <memory>
 
 namespace armnn
 {
@@ -21,7 +26,7 @@
     virtual void Execute() const override;
 
 private:
-    mutable arm_compute::NEArithmeticSubtraction m_SubLayer;
+    std::unique_ptr<arm_compute::IFunction> m_SubLayer;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonWorkloadUtils.hpp b/src/backends/neon/workloads/NeonWorkloadUtils.hpp
index 17e14cd..22ffece 100644
--- a/src/backends/neon/workloads/NeonWorkloadUtils.hpp
+++ b/src/backends/neon/workloads/NeonWorkloadUtils.hpp
@@ -9,7 +9,6 @@
 #include <neon/NeonTensorHandle.hpp>
 #include <neon/NeonTimer.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
-#include <arm_compute/runtime/NEON/NEFunctions.h>
 
 #include <Half.hpp>