Refactor: Don't include all ComputeLibrary function definitions everywhere.

Include only the function definition that each workload specifically needs, and tighten the scope in which Compute Library functions are available.

Knocks about 30 seconds off a 4m30s single-threaded compile of the Neon workloads.
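
For reference, a minimal sketch of the header-side pattern that makes this possible (hypothetical and simplified; the real NeonBatchNormalizationWorkload.hpp has additional members and helpers). Holding the layer through the arm_compute::IFunction interface means the workload header only needs the lightweight IFunction header, while the concrete NEBatchNormalizationLayer definition is included solely in the .cpp:

    // Hypothetical simplified workload header illustrating the pattern.
    #include <arm_compute/runtime/IFunction.h>

    #include <memory>

    class NeonBatchNormalizationWorkload
    {
    public:
        void Execute() const;

    private:
        // The concrete NEBatchNormalizationLayer is constructed in the .cpp,
        // which is the only file that includes its full definition; callers
        // only see the abstract run()/prepare() interface.
        std::unique_ptr<arm_compute::IFunction> m_Layer;
    };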

Change-Id: Idac438f3bc77ff978295fbc9505cb42447def145
diff --git a/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp b/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp
index 44d5035..fc80f41 100644
--- a/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp
+++ b/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp
@@ -4,9 +4,13 @@
 //
 
 #include "NeonBatchNormalizationWorkload.hpp"
+
+#include "NeonWorkloadUtils.hpp"
+
 #include <backendsCommon/CpuTensorHandle.hpp>
 #include <aclCommon/ArmComputeTensorUtils.hpp>
-#include <armnn/ArmNN.hpp>
+
+#include <arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h>
 
 namespace armnn
 {
@@ -68,13 +72,15 @@
     m_Beta = std::make_unique<arm_compute::Tensor>();
     BuildArmComputeTensor(*m_Beta, m_Data.m_Beta->GetTensorInfo());
 
-    m_Layer.configure(&input,
-                      &output,
-                      m_Mean.get(),
-                      m_Variance.get(),
-                      m_Beta.get(),
-                      m_Gamma.get(),
-                      m_Data.m_Parameters.m_Eps);
+    auto layer = std::make_unique<arm_compute::NEBatchNormalizationLayer>();
+    layer->configure(&input,
+                     &output,
+                     m_Mean.get(),
+                     m_Variance.get(),
+                     m_Beta.get(),
+                     m_Gamma.get(),
+                     m_Data.m_Parameters.m_Eps);
+    m_Layer.reset(layer.release());
 
     InitializeArmComputeTensorData(*m_Mean, m_Data.m_Mean);
     InitializeArmComputeTensorData(*m_Variance, m_Data.m_Variance);
@@ -83,14 +89,14 @@
 
     // Force Compute Library to perform the necessary copying and reshaping, after which
     // delete all the input tensors that will no longer be needed
-    m_Layer.prepare();
+    m_Layer->prepare();
     FreeUnusedTensors();
 }
 
 void NeonBatchNormalizationWorkload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonBatchNormalizationWorkload_Execute");
-    m_Layer.run();
+    m_Layer->run();
 }
 
 void NeonBatchNormalizationWorkload::FreeUnusedTensors()