Typo in Guide

* The only file changed is shim/BuildGuideShimSupportLibrary.md
* All other files come from a squashed merge of Arm NN 22.02

Signed-off-by: Kevin May <kevin.may@arm.com>
Change-Id: Id82a6e9ac8abf74c1073c08744712f50e98dece0
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 385affa..fc48ffc 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -583,7 +583,6 @@
 void MemSyncQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
 {
     ValidateNumInputs(workloadInfo, "MemSyncQueueDescriptor", 1);
-    ValidateNumOutputs(workloadInfo, "MemSyncQueueDescriptor" , 1);
 
     if (m_Inputs.size() != 1)
     {
diff --git a/src/backends/backendsCommon/test/EndToEndTestImpl.hpp b/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
index c69a4a5..77901df 100644
--- a/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
+++ b/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
@@ -951,11 +951,12 @@
     uintptr_t alignment = GetDataTypeSize(DataType::Float32);
     CHECK (reinterpret_cast<uintptr_t>(misalignedMemPtr) % alignment);
 
-    auto inputBuffer = reinterpret_cast<float*>(misalignedMemPtr);
-    for (int i = 0; i < 4; i++)
+    std::vector<float> inputData
     {
-        inputBuffer[i] = 1.0f + static_cast<float>(i);
-    }
+         1.0f, 2.0f, 3.0f, 4.0f
+    };
+
+    std::memcpy(misalignedMemPtr, inputData.data(), 4*sizeof(float));
 
     std::vector<float> outputData(4);
     // Check our output buffer is aligned
@@ -1129,9 +1130,11 @@
         // Check the output is correct
     }
     unsigned int index = 0;
+    std::vector<float> outputData(expectedOutput.size(), 0);
+    std::memcpy(outputData.data(), misalignedMemPtr, expectedOutput.size() * sizeof(float));
     for (auto outputValue : expectedOutput)
     {
-        CHECK(outputValue == reinterpret_cast<float*>(misalignedMemPtr)[index]);
+        CHECK(outputValue == outputData[index]);
         ++index;
     }
     std::free(memPtr);
@@ -1183,11 +1186,11 @@
     // Check if our pointer is truly misaligned
     uintptr_t alignment = GetDataTypeSize(DataType::Float32);
     CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
-    auto inputBuffer = reinterpret_cast<float*>(misalignedInputPtr);
-    for (int i = 0; i < 4; i++)
+    std::vector<float> inputData
     {
-        inputBuffer[i] = 1.0f + static_cast<float>(i);
-    }
+         1.0f, 2.0f, 3.0f, 4.0f
+    };
+    std::memcpy(misalignedInputPtr, inputData.data(), 4*sizeof(float));
 
     auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
     float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
@@ -1238,9 +1241,11 @@
     }
     // Check the output is correct
     unsigned int index = 0;
-    for (auto outputValue : expectedOutput)
+    std::vector<float> outputData(expectedOutput.size(), 0);
+    std::memcpy(outputData.data(), misalignedOutputPtr, expectedOutput.size() * sizeof(float));
+    for (auto expectedValue : expectedOutput)
     {
-        CHECK(outputValue == reinterpret_cast<float*>(misalignedOutputPtr)[index]);
+        CHECK(expectedValue == outputData[index]);
         ++index;
     }
     std::free(inputMemPtr);
@@ -1356,11 +1361,13 @@
 
     // Check if our pointer is truly misaligned
     CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
-    auto inputBuffer = reinterpret_cast<float*>(misalignedInputPtr);
-    for (int i = 0; i < 4; i++)
+
+    std::vector<float> inputValues
     {
-        inputBuffer[i] = 2.0f + static_cast<float>(i);
-    }
+         2.0f, 3.0f, 4.0f, 5.0f
+    };
+
+    std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size()*sizeof(float));
 
     auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
     float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
@@ -1411,9 +1418,11 @@
     }
     // Check the output is correct
     unsigned int index = 0;
+    std::vector<float> alignedOutputData(expectedMisalignedOutput.size(), 0);
+    std::memcpy(alignedOutputData.data(), misalignedOutputPtr, expectedMisalignedOutput.size() * sizeof(float));
     for (auto outputValue : expectedMisalignedOutput)
     {
-        CHECK(outputValue == reinterpret_cast<float*>(misalignedOutputPtr)[index]);
+        CHECK(outputValue == alignedOutputData[index]);
         ++index;
     }
     // Clean up to avoid interfering with other tests
@@ -1471,11 +1480,11 @@
     // Check if our pointer is truly misaligned
     uintptr_t alignment = GetDataTypeSize(DataType::Float32);
     CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
-    auto inputBuffer = reinterpret_cast<float*>(misalignedInputPtr);
-    for (int i = 0; i < 4; i++)
+    std::vector<float> inputValues
     {
-        inputBuffer[i] = 2.0f + static_cast<float>(i);
-    }
+         2.0f, 3.0f, 4.0f, 5.0f
+    };
+    std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size() * sizeof(float));
 
     auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
     float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
@@ -1530,9 +1539,11 @@
     }
     // Check the output is correct
     unsigned int index = 0;
+    std::vector<float> alignedOutput(expectedMisalignedOutput.size());
+    std::memcpy(alignedOutput.data(), misalignedOutputPtr, expectedMisalignedOutput.size()*sizeof(float));
     for (auto outputValue : expectedMisalignedOutput)
     {
-        CHECK(outputValue == reinterpret_cast<float*>(misalignedOutputPtr)[index]);
+        CHECK(outputValue == alignedOutput[index]);
         ++index;
     }
     std::free(inputMemPtr);
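
For context, the test changes above all follow one pattern: the buffer is deliberately misaligned by one byte, so data is now copied in and out with std::memcpy instead of dereferencing a misaligned float*, which is undefined behaviour. A minimal standalone sketch of that pattern (variable names here are illustrative, not taken from the test file):

    #include <cstdlib>
    #include <cstring>
    #include <vector>

    int main()
    {
        // Allocate room for four floats plus one byte, then deliberately misalign by one byte.
        void* memPtr = std::malloc(4 * sizeof(float) + sizeof(char));
        void* misalignedPtr = reinterpret_cast<char*>(memPtr) + 1;

        // Write through memcpy; dereferencing a reinterpret_cast<float*> here would be UB.
        std::vector<float> input { 1.0f, 2.0f, 3.0f, 4.0f };
        std::memcpy(misalignedPtr, input.data(), input.size() * sizeof(float));

        // Read back the same way before comparing against expected values.
        std::vector<float> output(input.size());
        std::memcpy(output.data(), misalignedPtr, output.size() * sizeof(float));

        std::free(memPtr);
        return output == input ? 0 : 1;
    }
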
diff --git a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
index 992abc2..389605f 100644
--- a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
@@ -124,4 +124,42 @@
     FreeTensorIfUnused(m_Beta);
 }
 
+void ClBatchNormalizationFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClBatchNormalizationFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClBatchNormalizationFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp
index dc76703..d476636 100644
--- a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp
@@ -32,6 +32,12 @@
     using FloatWorkload<BatchNormalizationQueueDescriptor>::FloatWorkload;
     void Execute() const override;
 
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
+
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
+
 private:
     mutable arm_compute::CLBatchNormalizationLayer m_Layer;
 
@@ -41,6 +47,7 @@
     std::unique_ptr<arm_compute::CLTensor> m_Beta;
 
     void FreeUnusedTensors();
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
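
The Replace*TensorHandle/Reconfigure pattern added here, and repeated in the workloads below, lets a caller attempt handle replacement and rely on the workload restoring its previous handle if it cannot reconfigure. A hedged caller-side sketch (the template keeps it independent of the exact interface; include paths are assumed to match this Arm NN version):

    #include <armnn/Exceptions.hpp>
    #include <armnn/backends/ITensorHandle.hpp>

    // Sketch only: try to swap input slot 0 of a workload that may not support replacement.
    template <typename WorkloadType>
    void TrySwapInputHandle(WorkloadType& workload, armnn::ITensorHandle* newHandle)
    {
        if (!workload.SupportsTensorHandleReplacement())
        {
            return; // keep the original handle; replacement is not supported here
        }
        try
        {
            workload.ReplaceInputTensorHandle(newHandle, 0);
        }
        catch (const armnn::UnimplementedException&)
        {
            // The workload reverted to its original handle before rethrowing.
        }
    }
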
diff --git a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
index 867770a..8ccf157 100644
--- a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
+++ b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
@@ -25,9 +25,13 @@
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
 
+    // Create Proxy tensor and set the initial tensor handle to it
+    m_InputProxy = std::make_unique<ICLTensorProxy>(&input);
+    m_OutputProxy = std::make_unique<ICLTensorProxy>(&output);
+
     {
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvertFp16ToFp32Workload_configure");
-        m_Layer.configure(clCompileContext, &input, &output, g_AclConvertPolicy, 0);
+        m_Layer.configure(clCompileContext, m_InputProxy.get(), m_OutputProxy.get(), g_AclConvertPolicy, 0);
     }
 }
 
@@ -57,5 +61,45 @@
     return aclStatus;
 }
 
+void ClConvertFp16ToFp32Workload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClConvertFp16ToFp32Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClConvertFp16ToFp32Workload::Reconfigure()
+{
+    arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+    m_InputProxy->set(&input);
+    m_OutputProxy->set(&output);
+}
 
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp
index b392c0b..3c6fcd6 100644
--- a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp
+++ b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp
@@ -9,6 +9,8 @@
 
 #include <arm_compute/runtime/CL/functions/CLDepthConvertLayer.h>
 
+#include <cl/ICLTensorProxy.hpp>
+
 namespace armnn
 {
 
@@ -21,8 +23,19 @@
                                 const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
+    bool SupportsTensorHandleReplacement() const override { return true; }
+
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
+
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     mutable arm_compute::CLDepthConvertLayer m_Layer;
+    virtual void Reconfigure();
+
+    std::unique_ptr<ICLTensorProxy> m_InputProxy;
+    std::unique_ptr<ICLTensorProxy> m_OutputProxy;
 };
 
 arm_compute::Status ClConvertFp16ToFp32WorkloadValidate(const TensorInfo& input, const TensorInfo& output);
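
Unlike the workloads whose Reconfigure() simply throws, this convert workload (and the Fp32-to-Fp16 one that follows) configures the ACL layer once against ICLTensorProxy objects, so replacing a tensor handle later only re-points the proxy instead of re-running configure(). A rough sketch of the idea behind such a proxy (conceptual only, not the ICLTensorProxy implementation):

    // A stable object handed to configure(); its target can be swapped afterwards.
    template <typename Tensor>
    class TensorForwardingProxy
    {
    public:
        explicit TensorForwardingProxy(Tensor* real) : m_Real(real) {}

        // Called from Reconfigure() after a tensor handle has been replaced.
        void set(Tensor* real) { m_Real = real; }

        Tensor* get() const { return m_Real; }

    private:
        Tensor* m_Real = nullptr;
    };
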
diff --git a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
index 017fcaf..a44a80c 100644
--- a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
+++ b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
@@ -25,9 +25,13 @@
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
 
+    // Create Proxy tensor and set the initial tensor handle to it
+    m_InputProxy = std::make_unique<ICLTensorProxy>(&input);
+    m_OutputProxy = std::make_unique<ICLTensorProxy>(&output);
+
     {
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvertFp32ToFp16Workload_configure");
-        m_Layer.configure(clCompileContext, &input, &output, g_AclConvertPolicy, 0);
+        m_Layer.configure(clCompileContext, m_InputProxy.get(), m_OutputProxy.get(), g_AclConvertPolicy, 0);
     }
 }
 
@@ -57,5 +61,45 @@
     return aclStatus;
 }
 
+void ClConvertFp32ToFp16Workload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClConvertFp32ToFp16Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClConvertFp32ToFp16Workload::Reconfigure()
+{
+    arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+    m_InputProxy->set(&input);
+    m_OutputProxy->set(&output);
+}
 
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp
index 1d777b5..6ce563e 100644
--- a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp
+++ b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp
@@ -9,6 +9,8 @@
 
 #include <arm_compute/runtime/CL/functions/CLDepthConvertLayer.h>
 
+#include <cl/ICLTensorProxy.hpp>
+
 namespace armnn
 {
 
@@ -21,8 +23,19 @@
                                 const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
+    bool SupportsTensorHandleReplacement() const override { return true; }
+
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
+
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     mutable arm_compute::CLDepthConvertLayer m_Layer;
+    virtual void Reconfigure();
+
+    std::unique_ptr<ICLTensorProxy> m_InputProxy;
+    std::unique_ptr<ICLTensorProxy> m_OutputProxy;
 };
 
 arm_compute::Status ClConvertFp32ToFp16WorkloadValidate(const TensorInfo& input, const TensorInfo& output);
diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
index cdfa885..bf82fbf 100644
--- a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
+++ b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
@@ -180,9 +180,9 @@
 
 void ClConvolution2dWorkload::Reconfigure()
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvolution2dWorkload_Reconfigure");
     arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
     m_InputProxy->set(&input);
     m_OutputProxy->set(&output);
 }
diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
index 891d509..e4177e4 100644
--- a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
+++ b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
@@ -40,6 +40,8 @@
 
     arm_compute::ConvolutionMethod GetConvolutionMethod() const;
 
+    bool SupportsTensorHandleReplacement() const override { return true; }
+
 protected:
     void Reconfigure() override;
 
diff --git a/src/backends/cl/workloads/ClFloorFloatWorkload.cpp b/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
index 679e225..0aae1a3 100644
--- a/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
@@ -29,7 +29,6 @@
 
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
-
     {
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClFloorFloatWorkload_configure");
         m_Layer.configure(clCompileContext, &input, &output);
@@ -42,4 +41,42 @@
     RunClFunction(m_Layer, CHECK_LOCATION());
 }
 
+void ClFloorFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClFloorFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClFloorFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClFloorFloatWorkload.hpp b/src/backends/cl/workloads/ClFloorFloatWorkload.hpp
index 5740c68..dbe5f6f 100644
--- a/src/backends/cl/workloads/ClFloorFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClFloorFloatWorkload.hpp
@@ -23,9 +23,14 @@
                          const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     mutable arm_compute::CLFloor m_Layer;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
index b34153f..d120fb2 100644
--- a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
@@ -60,4 +60,42 @@
     RunClFunction(m_Layer, CHECK_LOCATION());
 }
 
+void ClL2NormalizationFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClL2NormalizationFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClL2NormalizationFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp
index cfa1a97..67e7b8b 100644
--- a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp
@@ -24,10 +24,16 @@
                                    const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
+
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
 private:
     // Purposely not a CLL2Normalize function. See constructor.
     mutable arm_compute::CLL2NormalizeLayer m_Layer;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
index d8d95f5..37dfab6 100644
--- a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
@@ -446,4 +446,42 @@
     FreeTensorIfUnused(m_OutputLayerNormWeightsTensor);
 }
 
+void ClLstmFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClLstmFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClLstmFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.hpp b/src/backends/cl/workloads/ClLstmFloatWorkload.hpp
index b9faca8..54c5c60 100644
--- a/src/backends/cl/workloads/ClLstmFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.hpp
@@ -22,9 +22,14 @@
                         const WorkloadInfo& info,
                         const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     mutable arm_compute::CLLSTMLayer m_LstmLayer;
+    virtual void Reconfigure();
 
     std::unique_ptr<arm_compute::CLTensor> m_InputToInputWeightsTensor;
     std::unique_ptr<arm_compute::CLTensor> m_InputToForgetWeightsTensor;
diff --git a/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
index d98532d..8de8dd5 100644
--- a/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
@@ -62,4 +62,42 @@
     RunClFunction(m_NormalizationLayer, CHECK_LOCATION());
 }
 
+void ClNormalizationFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClNormalizationFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClNormalizationFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp
index 40b2693..d9db0f2 100644
--- a/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp
@@ -23,9 +23,14 @@
                                  const WorkloadInfo& info,
                                  const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     mutable arm_compute::CLNormalizationLayer    m_NormalizationLayer;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/NeonTimer.cpp b/src/backends/neon/NeonTimer.cpp
index 5cce051..a7d3032 100644
--- a/src/backends/neon/NeonTimer.cpp
+++ b/src/backends/neon/NeonTimer.cpp
@@ -7,6 +7,7 @@
 #include "NeonInterceptorScheduler.hpp"
 
 #include <armnn/utility/Assert.hpp>
+#include <armnn/utility/PolymorphicDowncast.hpp>
 
 #include <memory>
 
@@ -29,7 +30,7 @@
     {
         // Keep the real schedule and add NeonInterceptorScheduler as an interceptor
         m_RealScheduler  = &arm_compute::Scheduler::get();
-        arm_compute::Scheduler::set(std::static_pointer_cast<arm_compute::IScheduler>(g_Interceptor));
+        arm_compute::Scheduler::set(armnn::PolymorphicPointerDowncast<arm_compute::IScheduler>(g_Interceptor));
     }
 }
 
diff --git a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
index dcef025..7a2ff9a 100644
--- a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
@@ -40,4 +40,42 @@
     }
 }
 
+void NeonConvertBf16ToFp32Workload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonConvertBf16ToFp32Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonConvertBf16ToFp32Workload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp
index 9770fbd..9d44ad2 100644
--- a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp
+++ b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp
@@ -17,10 +17,15 @@
 public:
     NeonConvertBf16ToFp32Workload(const ConvertBf16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
     std::vector<TensorHandlePair> m_TensorHandlePairs;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
index 1b9e1bc..ce6c785 100644
--- a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
@@ -40,4 +40,42 @@
     }
 }
 
+void NeonConvertFp16ToFp32Workload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonConvertFp16ToFp32Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonConvertFp16ToFp32Workload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp
index 9159e51..c0165ea 100644
--- a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp
+++ b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp
@@ -17,10 +17,15 @@
 public:
     NeonConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
     std::vector<TensorHandlePair> m_TensorHandlePairs;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
index ac6a69d..acd1a1e 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
@@ -41,4 +41,42 @@
     }
 }
 
+void NeonConvertFp32ToBf16Workload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonConvertFp32ToBf16Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonConvertFp32ToBf16Workload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp
index 6c01187..2304f8a 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp
@@ -17,10 +17,15 @@
 public:
     NeonConvertFp32ToBf16Workload(const ConvertFp32ToBf16QueueDescriptor& descriptor, const WorkloadInfo& info);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
     std::vector<TensorHandlePair> m_TensorHandlePairs;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
index d65cba0..089716a 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
@@ -41,4 +41,42 @@
     }
 }
 
+void NeonConvertFp32ToFp16Workload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonConvertFp32ToFp16Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonConvertFp32ToFp16Workload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp
index 8e9f11b..666f487 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp
@@ -17,10 +17,15 @@
 public:
     NeonConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
     std::vector<TensorHandlePair> m_TensorHandlePairs;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp b/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
index b97e3ce..1d53245 100644
--- a/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
@@ -32,6 +32,45 @@
     ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonFloorFloatWorkload_Execute", this->GetGuid());
     m_Layer->run();
 }
+
+void NeonFloorFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonFloorFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonFloorFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
 
 
diff --git a/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp b/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp
index 7113931..8ba6b4a 100644
--- a/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp
@@ -20,9 +20,14 @@
 public:
     NeonFloorFloatWorkload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     std::unique_ptr<arm_compute::IFunction> m_Layer;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
index 887f25a..c0c6ed4 100644
--- a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
@@ -60,4 +60,42 @@
     m_Layer->run();
 }
 
+void NeonL2NormalizationFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonL2NormalizationFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonL2NormalizationFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp
index 82f0639..9c591fc 100644
--- a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp
@@ -26,9 +26,14 @@
     NeonL2NormalizationFloatWorkload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info,
                                      std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     std::unique_ptr<arm_compute::IFunction> m_Layer;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
index b8224e6..2f14ab9 100644
--- a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
@@ -464,4 +464,42 @@
     FreeTensorIfUnused(m_OutputLayerNormWeightsTensor);
 }
 
+void NeonLstmFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonLstmFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonLstmFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonLstmFloatWorkload.hpp b/src/backends/neon/workloads/NeonLstmFloatWorkload.hpp
index ebbf180..4bb3ff8 100644
--- a/src/backends/neon/workloads/NeonLstmFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonLstmFloatWorkload.hpp
@@ -21,7 +21,11 @@
 public:
     NeonLstmFloatWorkload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     mutable arm_compute::NELSTMLayer m_LstmLayer;
 
@@ -51,6 +55,7 @@
     std::unique_ptr<arm_compute::Tensor> m_OutputLayerNormWeightsTensor;
 
     void FreeUnusedTensors();
+    virtual void Reconfigure();
 };
 
 arm_compute::Status NeonLstmFloatWorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn,
diff --git a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
index f811a04..01ac5c1 100644
--- a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
@@ -110,4 +110,42 @@
     m_NormalizationLayer->run();
 }
 
+void NeonNormalizationFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonNormalizationFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonNormalizationFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp
index ed54536..9605ed1 100644
--- a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp
@@ -26,9 +26,14 @@
     NeonNormalizationFloatWorkload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info,
                                    std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     std::unique_ptr<arm_compute::IFunction> m_NormalizationLayer;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/test/RefWorkloadFactoryHelper.hpp b/src/backends/reference/test/RefWorkloadFactoryHelper.hpp
index e413d04..f0a842d 100644
--- a/src/backends/reference/test/RefWorkloadFactoryHelper.hpp
+++ b/src/backends/reference/test/RefWorkloadFactoryHelper.hpp
@@ -7,6 +7,8 @@
 
 #include <backendsCommon/test/WorkloadFactoryHelper.hpp>
 
+#include <armnn/utility/PolymorphicDowncast.hpp>
+
 #include <reference/RefBackend.hpp>
 #include <reference/RefWorkloadFactory.hpp>
 #include "reference/RefTensorHandleFactory.hpp"
@@ -34,7 +36,7 @@
             const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr)
     {
 
-        return armnn::RefTensorHandleFactory(std::static_pointer_cast<armnn::RefMemoryManager>(memoryManager));
+        return armnn::RefTensorHandleFactory(armnn::PolymorphicPointerDowncast<armnn::RefMemoryManager>(memoryManager));
     }
 };
 
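
Both this change and the NeonTimer change above swap std::static_pointer_cast for armnn::PolymorphicPointerDowncast. As I understand the utility, it behaves like a static cast in release builds while checking the dynamic type in debug builds; a small sketch with placeholder types:

    #include <armnn/utility/PolymorphicDowncast.hpp>
    #include <memory>

    struct Base    { virtual ~Base() = default; };
    struct Derived : Base {};

    int main()
    {
        std::shared_ptr<Base> base = std::make_shared<Derived>();

        // Checked shared_ptr downcast; a wrong dynamic type is expected to be flagged in debug builds.
        auto derived = armnn::PolymorphicPointerDowncast<Derived>(base);

        return derived ? 0 : 1;
    }
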
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index 60d8255..46c2706 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -68,6 +68,7 @@
     RefActivationWorkload.hpp
     RefArgMinMaxWorkload.cpp
     RefArgMinMaxWorkload.hpp
+    RefBaseWorkload.hpp
     RefBatchNormalizationWorkload.cpp
     RefBatchNormalizationWorkload.hpp
     RefBatchToSpaceNdWorkload.cpp
diff --git a/src/backends/reference/workloads/RefActivationWorkload.hpp b/src/backends/reference/workloads/RefActivationWorkload.hpp
index 9814ac1..8dc2d52 100644
--- a/src/backends/reference/workloads/RefActivationWorkload.hpp
+++ b/src/backends/reference/workloads/RefActivationWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefActivationWorkload : public BaseWorkload<ActivationQueueDescriptor>
+class RefActivationWorkload : public RefBaseWorkload<ActivationQueueDescriptor>
 {
 public:
-    using BaseWorkload<ActivationQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<ActivationQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
diff --git a/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp b/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp
index 2d635bf..d724273 100644
--- a/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp
+++ b/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp
@@ -16,7 +16,7 @@
 RefArgMinMaxWorkload::RefArgMinMaxWorkload(
         const ArgMinMaxQueueDescriptor& descriptor,
         const WorkloadInfo& info)
-        : BaseWorkload<ArgMinMaxQueueDescriptor>(descriptor, info) {}
+        : RefBaseWorkload<ArgMinMaxQueueDescriptor>(descriptor, info) {}
 
 
 void RefArgMinMaxWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefArgMinMaxWorkload.hpp b/src/backends/reference/workloads/RefArgMinMaxWorkload.hpp
index f3c2644..97c4b45 100644
--- a/src/backends/reference/workloads/RefArgMinMaxWorkload.hpp
+++ b/src/backends/reference/workloads/RefArgMinMaxWorkload.hpp
@@ -5,12 +5,12 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
-class RefArgMinMaxWorkload : public BaseWorkload<ArgMinMaxQueueDescriptor>
+class RefArgMinMaxWorkload : public RefBaseWorkload<ArgMinMaxQueueDescriptor>
 {
 public:
     explicit RefArgMinMaxWorkload(const ArgMinMaxQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefBaseWorkload.hpp b/src/backends/reference/workloads/RefBaseWorkload.hpp
new file mode 100644
index 0000000..824b4cc
--- /dev/null
+++ b/src/backends/reference/workloads/RefBaseWorkload.hpp
@@ -0,0 +1,36 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/backends/Workload.hpp>
+
+namespace armnn
+{
+    template <typename QueueDescriptor>
+    class RefBaseWorkload : public BaseWorkload<QueueDescriptor>
+    {
+    public:
+        RefBaseWorkload(const QueueDescriptor& descriptor, const WorkloadInfo& info)
+                : BaseWorkload<QueueDescriptor>(descriptor, info)
+        {}
+
+        virtual bool SupportsTensorHandleReplacement()  const override
+        {
+            return true;
+        }
+        // Replace input tensor handle with the given TensorHandle
+        void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
+        {
+            this->m_Data.m_Inputs[slot] = tensorHandle;
+        }
+
+        // Replace output tensor handle with the given TensorHandle
+        void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
+        {
+            this->m_Data.m_Outputs[slot] = tensorHandle;
+        }
+    };
+} //namespace armnn
\ No newline at end of file
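
The remaining reference-backend edits apply this new base class mechanically; as a sketch of the before/after shape (RefExampleWorkload is a placeholder name, with ActivationQueueDescriptor standing in for whichever descriptor a real workload uses):

    #include "RefBaseWorkload.hpp"
    #include <armnn/backends/WorkloadData.hpp>

    namespace armnn
    {

    // Placeholder example; each real workload substitutes its own queue descriptor.
    class RefExampleWorkload : public RefBaseWorkload<ActivationQueueDescriptor>
    {
    public:
        // Inherit the RefBaseWorkload constructor instead of BaseWorkload's.
        using RefBaseWorkload<ActivationQueueDescriptor>::RefBaseWorkload;

        void Execute() const override
        {
            // A real workload reads its inputs/outputs from m_Data here.
        }
    };

    } // namespace armnn
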
diff --git a/src/backends/reference/workloads/RefBatchNormalizationWorkload.cpp b/src/backends/reference/workloads/RefBatchNormalizationWorkload.cpp
index 282374d..a6bd986 100644
--- a/src/backends/reference/workloads/RefBatchNormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefBatchNormalizationWorkload.cpp
@@ -15,7 +15,7 @@
 
 RefBatchNormalizationWorkload::RefBatchNormalizationWorkload(const BatchNormalizationQueueDescriptor& descriptor,
                                                              const WorkloadInfo& info)
-    : BaseWorkload(descriptor, info)
+    : RefBaseWorkload(descriptor, info)
     , m_Mean    (std::make_unique<ScopedTensorHandle>(*(descriptor.m_Mean)))
     , m_Variance(std::make_unique<ScopedTensorHandle>(*(descriptor.m_Variance)))
     , m_Beta    (std::make_unique<ScopedTensorHandle>(*(descriptor.m_Beta)))
diff --git a/src/backends/reference/workloads/RefBatchNormalizationWorkload.hpp b/src/backends/reference/workloads/RefBatchNormalizationWorkload.hpp
index 305c0ce..60dd2a9 100644
--- a/src/backends/reference/workloads/RefBatchNormalizationWorkload.hpp
+++ b/src/backends/reference/workloads/RefBatchNormalizationWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefBatchNormalizationWorkload : public BaseWorkload<BatchNormalizationQueueDescriptor>
+class RefBatchNormalizationWorkload : public RefBaseWorkload<BatchNormalizationQueueDescriptor>
 {
 public:
     explicit RefBatchNormalizationWorkload(const BatchNormalizationQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp
index 7d18c12..d7ee6fc 100644
--- a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp
+++ b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn {
 
-class RefBatchToSpaceNdWorkload : public BaseWorkload<BatchToSpaceNdQueueDescriptor>
+class RefBatchToSpaceNdWorkload : public RefBaseWorkload<BatchToSpaceNdQueueDescriptor>
 {
 
 public:
-    using BaseWorkload<BatchToSpaceNdQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<BatchToSpaceNdQueueDescriptor>::RefBaseWorkload;
 
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
diff --git a/src/backends/reference/workloads/RefCastWorkload.hpp b/src/backends/reference/workloads/RefCastWorkload.hpp
index ccafaaf..6f7e56a 100644
--- a/src/backends/reference/workloads/RefCastWorkload.hpp
+++ b/src/backends/reference/workloads/RefCastWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "RefWorkloadUtils.hpp"
 
@@ -13,10 +13,10 @@
 {
 
 
-class RefCastWorkload : public BaseWorkload<CastQueueDescriptor>
+class RefCastWorkload : public RefBaseWorkload<CastQueueDescriptor>
 {
 public:
-    using BaseWorkload<CastQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<CastQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefChannelShuffleWorkload.hpp b/src/backends/reference/workloads/RefChannelShuffleWorkload.hpp
index 0c80378..b459b87 100644
--- a/src/backends/reference/workloads/RefChannelShuffleWorkload.hpp
+++ b/src/backends/reference/workloads/RefChannelShuffleWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefChannelShuffleWorkload : public BaseWorkload<ChannelShuffleQueueDescriptor>
+class RefChannelShuffleWorkload : public RefBaseWorkload<ChannelShuffleQueueDescriptor>
 {
 public:
-    using BaseWorkload<ChannelShuffleQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<ChannelShuffleQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
diff --git a/src/backends/reference/workloads/RefComparisonWorkload.cpp b/src/backends/reference/workloads/RefComparisonWorkload.cpp
index 03df7a4..433e3e8 100644
--- a/src/backends/reference/workloads/RefComparisonWorkload.cpp
+++ b/src/backends/reference/workloads/RefComparisonWorkload.cpp
@@ -21,7 +21,7 @@
 
 RefComparisonWorkload::RefComparisonWorkload(const ComparisonQueueDescriptor& desc,
                                              const WorkloadInfo& info)
-    : BaseWorkload<ComparisonQueueDescriptor>(desc, info)
+    : RefBaseWorkload<ComparisonQueueDescriptor>(desc, info)
 {}
 
 void RefComparisonWorkload::PostAllocationConfigure()
diff --git a/src/backends/reference/workloads/RefComparisonWorkload.hpp b/src/backends/reference/workloads/RefComparisonWorkload.hpp
index f2780c7..93cfd1f 100644
--- a/src/backends/reference/workloads/RefComparisonWorkload.hpp
+++ b/src/backends/reference/workloads/RefComparisonWorkload.hpp
@@ -7,16 +7,16 @@
 
 #include "BaseIterator.hpp"
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefComparisonWorkload : public BaseWorkload<ComparisonQueueDescriptor>
+class RefComparisonWorkload : public RefBaseWorkload<ComparisonQueueDescriptor>
 {
 public:
-    using BaseWorkload<ComparisonQueueDescriptor>::m_Data;
+    using RefBaseWorkload<ComparisonQueueDescriptor>::m_Data;
 
     RefComparisonWorkload(const ComparisonQueueDescriptor& descriptor, const WorkloadInfo& info);
     void PostAllocationConfigure() override;
diff --git a/src/backends/reference/workloads/RefConcatWorkload.hpp b/src/backends/reference/workloads/RefConcatWorkload.hpp
index cb1ecf0..11d6d01 100644
--- a/src/backends/reference/workloads/RefConcatWorkload.hpp
+++ b/src/backends/reference/workloads/RefConcatWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefConcatWorkload : public BaseWorkload<ConcatQueueDescriptor>
+class RefConcatWorkload : public RefBaseWorkload<ConcatQueueDescriptor>
 {
 public:
-    using BaseWorkload<ConcatQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<ConcatQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefConstantWorkload.cpp b/src/backends/reference/workloads/RefConstantWorkload.cpp
index 6290237..571dbb2 100644
--- a/src/backends/reference/workloads/RefConstantWorkload.cpp
+++ b/src/backends/reference/workloads/RefConstantWorkload.cpp
@@ -18,7 +18,7 @@
 
 RefConstantWorkload::RefConstantWorkload(
     const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info)
-    : BaseWorkload<ConstantQueueDescriptor>(descriptor, info) {}
+    : RefBaseWorkload<ConstantQueueDescriptor>(descriptor, info) {}
 
 void RefConstantWorkload::Execute() const
 {
diff --git a/src/backends/reference/workloads/RefConstantWorkload.hpp b/src/backends/reference/workloads/RefConstantWorkload.hpp
index c158983..181d79d 100644
--- a/src/backends/reference/workloads/RefConstantWorkload.hpp
+++ b/src/backends/reference/workloads/RefConstantWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include <armnn/Types.hpp>
@@ -14,7 +14,7 @@
 {
 
 // Base class template providing an implementation of the Constant layer common to all data types.
-class RefConstantWorkload : public BaseWorkload<ConstantQueueDescriptor>
+class RefConstantWorkload : public RefBaseWorkload<ConstantQueueDescriptor>
 {
 public:
     RefConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp b/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp
index b3af111..8b5c6d5 100644
--- a/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp
+++ b/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
diff --git a/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.hpp b/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.hpp
index acb1995..feb442e 100644
--- a/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.hpp
+++ b/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
diff --git a/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.hpp b/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.hpp
index 97a138f..cd3cfa4 100644
--- a/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.hpp
+++ b/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
diff --git a/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.hpp b/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.hpp
index 8cc822e..fe137ed 100644
--- a/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.hpp
+++ b/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
diff --git a/src/backends/reference/workloads/RefConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefConvolution2dWorkload.cpp
index 20c5c08..d57040e 100644
--- a/src/backends/reference/workloads/RefConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefConvolution2dWorkload.cpp
@@ -14,7 +14,7 @@
 {
 RefConvolution2dWorkload::RefConvolution2dWorkload(
     const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info)
-    : BaseWorkload<Convolution2dQueueDescriptor>(descriptor, info)
+    : RefBaseWorkload<Convolution2dQueueDescriptor>(descriptor, info)
 {
     WorkloadInfo detailsInfo;
     detailsInfo.m_InputTensorInfos = info.m_InputTensorInfos;
diff --git a/src/backends/reference/workloads/RefConvolution2dWorkload.hpp b/src/backends/reference/workloads/RefConvolution2dWorkload.hpp
index 880547d..3335782 100644
--- a/src/backends/reference/workloads/RefConvolution2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefConvolution2dWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "Decoders.hpp"
 #include "Encoders.hpp"
@@ -13,7 +13,7 @@
 namespace armnn
 {
 
-class RefConvolution2dWorkload : public BaseWorkload<Convolution2dQueueDescriptor>
+class RefConvolution2dWorkload : public RefBaseWorkload<Convolution2dQueueDescriptor>
 {
 public:
     explicit RefConvolution2dWorkload(const Convolution2dQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefConvolution3dWorkload.cpp b/src/backends/reference/workloads/RefConvolution3dWorkload.cpp
index afab88f..5f54280 100644
--- a/src/backends/reference/workloads/RefConvolution3dWorkload.cpp
+++ b/src/backends/reference/workloads/RefConvolution3dWorkload.cpp
@@ -14,7 +14,7 @@
 {
 RefConvolution3dWorkload::RefConvolution3dWorkload(
     const Convolution3dQueueDescriptor& descriptor, const WorkloadInfo& info)
-    : BaseWorkload<Convolution3dQueueDescriptor>(descriptor, info)
+    : RefBaseWorkload<Convolution3dQueueDescriptor>(descriptor, info)
 {
     WorkloadInfo detailsInfo;
     detailsInfo.m_InputTensorInfos = info.m_InputTensorInfos;
diff --git a/src/backends/reference/workloads/RefConvolution3dWorkload.hpp b/src/backends/reference/workloads/RefConvolution3dWorkload.hpp
index 53ce309..6c74675 100644
--- a/src/backends/reference/workloads/RefConvolution3dWorkload.hpp
+++ b/src/backends/reference/workloads/RefConvolution3dWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "Decoders.hpp"
 #include "Encoders.hpp"
@@ -13,7 +13,7 @@
 namespace armnn
 {
 
-class RefConvolution3dWorkload : public BaseWorkload<Convolution3dQueueDescriptor>
+class RefConvolution3dWorkload : public RefBaseWorkload<Convolution3dQueueDescriptor>
 {
 public:
     explicit RefConvolution3dWorkload(const Convolution3dQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefDebugWorkload.hpp b/src/backends/reference/workloads/RefDebugWorkload.hpp
index 66af9a0..a157959 100644
--- a/src/backends/reference/workloads/RefDebugWorkload.hpp
+++ b/src/backends/reference/workloads/RefDebugWorkload.hpp
@@ -7,7 +7,7 @@
 
 #include <armnn/TypesUtils.hpp>
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 namespace armnn
 {
diff --git a/src/backends/reference/workloads/RefDepthToSpaceWorkload.hpp b/src/backends/reference/workloads/RefDepthToSpaceWorkload.hpp
index 854a564..bd179d3 100644
--- a/src/backends/reference/workloads/RefDepthToSpaceWorkload.hpp
+++ b/src/backends/reference/workloads/RefDepthToSpaceWorkload.hpp
@@ -5,15 +5,15 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 namespace armnn
 {
 
-class RefDepthToSpaceWorkload : public BaseWorkload<DepthToSpaceQueueDescriptor>
+class RefDepthToSpaceWorkload : public RefBaseWorkload<DepthToSpaceQueueDescriptor>
 {
 public:
-    using BaseWorkload<DepthToSpaceQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<DepthToSpaceQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
index b447d1a..ad5edde 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
@@ -17,7 +17,7 @@
 
 RefDepthwiseConvolution2dWorkload::RefDepthwiseConvolution2dWorkload(
         const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info)
-        : BaseWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
+        : RefBaseWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
 {
     m_Weight = std::make_unique<ScopedTensorHandle>(*(descriptor.m_Weight));
     const TensorInfo& rFilterInfo = m_Weight->GetTensorInfo();
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp
index ae93d03..5d4b483 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp
@@ -2,7 +2,7 @@
 // Copyright © 2017 Arm Ltd. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "Decoders.hpp"
 #include "Encoders.hpp"
@@ -12,7 +12,7 @@
 namespace armnn
 {
 
-class RefDepthwiseConvolution2dWorkload : public BaseWorkload<DepthwiseConvolution2dQueueDescriptor> {
+class RefDepthwiseConvolution2dWorkload : public RefBaseWorkload<DepthwiseConvolution2dQueueDescriptor> {
 public:
     explicit RefDepthwiseConvolution2dWorkload(const DepthwiseConvolution2dQueueDescriptor &descriptor,
                                                const WorkloadInfo &info);
diff --git a/src/backends/reference/workloads/RefDequantizeWorkload.hpp b/src/backends/reference/workloads/RefDequantizeWorkload.hpp
index 285c649..8fa8951 100644
--- a/src/backends/reference/workloads/RefDequantizeWorkload.hpp
+++ b/src/backends/reference/workloads/RefDequantizeWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 namespace armnn
 {
 
-class RefDequantizeWorkload : public BaseWorkload<DequantizeQueueDescriptor>
+class RefDequantizeWorkload : public RefBaseWorkload<DequantizeQueueDescriptor>
 {
 public:
-    using BaseWorkload<DequantizeQueueDescriptor>::m_Data;
-    using BaseWorkload<DequantizeQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<DequantizeQueueDescriptor>::m_Data;
+    using RefBaseWorkload<DequantizeQueueDescriptor>::RefBaseWorkload;
 
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
index 4bc9eb1..5f01db3 100644
--- a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
+++ b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
@@ -15,7 +15,7 @@
 
 RefDetectionPostProcessWorkload::RefDetectionPostProcessWorkload(
         const DetectionPostProcessQueueDescriptor& descriptor, const WorkloadInfo& info)
-        : BaseWorkload<DetectionPostProcessQueueDescriptor>(descriptor, info),
+        : RefBaseWorkload<DetectionPostProcessQueueDescriptor>(descriptor, info),
           m_Anchors(std::make_unique<ScopedTensorHandle>(*(descriptor.m_Anchors))) {}
 
 void RefDetectionPostProcessWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp
index 4c3ad42..53b2971 100644
--- a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp
+++ b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefDetectionPostProcessWorkload : public BaseWorkload<DetectionPostProcessQueueDescriptor>
+class RefDetectionPostProcessWorkload : public RefBaseWorkload<DetectionPostProcessQueueDescriptor>
 {
 public:
     explicit RefDetectionPostProcessWorkload(const DetectionPostProcessQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefElementwiseUnaryWorkload.cpp b/src/backends/reference/workloads/RefElementwiseUnaryWorkload.cpp
index be15363..3ea51b9 100644
--- a/src/backends/reference/workloads/RefElementwiseUnaryWorkload.cpp
+++ b/src/backends/reference/workloads/RefElementwiseUnaryWorkload.cpp
@@ -27,7 +27,7 @@
 
 RefElementwiseUnaryWorkload::RefElementwiseUnaryWorkload(const ElementwiseUnaryQueueDescriptor& desc,
                                                          const WorkloadInfo& info)
-    : BaseWorkload<ElementwiseUnaryQueueDescriptor>(desc, info)
+    : RefBaseWorkload<ElementwiseUnaryQueueDescriptor>(desc, info)
 {}
 
 void RefElementwiseUnaryWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefElementwiseUnaryWorkload.hpp b/src/backends/reference/workloads/RefElementwiseUnaryWorkload.hpp
index e055fd0..91229b3 100644
--- a/src/backends/reference/workloads/RefElementwiseUnaryWorkload.hpp
+++ b/src/backends/reference/workloads/RefElementwiseUnaryWorkload.hpp
@@ -7,16 +7,16 @@
 
 #include "BaseIterator.hpp"
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefElementwiseUnaryWorkload : public BaseWorkload<ElementwiseUnaryQueueDescriptor>
+class RefElementwiseUnaryWorkload : public RefBaseWorkload<ElementwiseUnaryQueueDescriptor>
 {
 public:
-    using BaseWorkload<ElementwiseUnaryQueueDescriptor>::m_Data;
+    using RefBaseWorkload<ElementwiseUnaryQueueDescriptor>::m_Data;
 
     RefElementwiseUnaryWorkload(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
diff --git a/src/backends/reference/workloads/RefElementwiseWorkload.cpp b/src/backends/reference/workloads/RefElementwiseWorkload.cpp
index dd7d325..d14ce07 100644
--- a/src/backends/reference/workloads/RefElementwiseWorkload.cpp
+++ b/src/backends/reference/workloads/RefElementwiseWorkload.cpp
@@ -21,7 +21,7 @@
 RefElementwiseWorkload<Functor, ParentDescriptor, DebugString>::RefElementwiseWorkload(
     const ParentDescriptor& desc,
     const WorkloadInfo& info)
-    : BaseWorkload<ParentDescriptor>(desc, info)
+    : RefBaseWorkload<ParentDescriptor>(desc, info)
 {
 }
 
diff --git a/src/backends/reference/workloads/RefElementwiseWorkload.hpp b/src/backends/reference/workloads/RefElementwiseWorkload.hpp
index 4b108e4..065a783 100644
--- a/src/backends/reference/workloads/RefElementwiseWorkload.hpp
+++ b/src/backends/reference/workloads/RefElementwiseWorkload.hpp
@@ -6,7 +6,7 @@
 #pragma once
 
 #include <armnn/Types.hpp>
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "BaseIterator.hpp"
 #include "ElementwiseFunction.hpp"
@@ -18,12 +18,12 @@
 {
 
 template <typename Functor, typename ParentDescriptor, typename armnn::StringMapping::Id DebugString>
-class RefElementwiseWorkload : public BaseWorkload<ParentDescriptor>
+class RefElementwiseWorkload : public RefBaseWorkload<ParentDescriptor>
 {
 public:
     using InType = typename ElementwiseBinaryFunction<Functor>::InType;
     using OutType = typename ElementwiseBinaryFunction<Functor>::OutType;
-    using BaseWorkload<ParentDescriptor>::m_Data;
+    using RefBaseWorkload<ParentDescriptor>::m_Data;
 
     RefElementwiseWorkload(const ParentDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
diff --git a/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.hpp b/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.hpp
index 53b3375..85dc6af 100644
--- a/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.hpp
+++ b/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
diff --git a/src/backends/reference/workloads/RefFillWorkload.hpp b/src/backends/reference/workloads/RefFillWorkload.hpp
index 56d44b8..d1e0058 100644
--- a/src/backends/reference/workloads/RefFillWorkload.hpp
+++ b/src/backends/reference/workloads/RefFillWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefFillWorkload : public BaseWorkload<FillQueueDescriptor>
+class RefFillWorkload : public RefBaseWorkload<FillQueueDescriptor>
 {
 public:
-    using BaseWorkload<FillQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<FillQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefFloorWorkload.hpp b/src/backends/reference/workloads/RefFloorWorkload.hpp
index 1a532f7..6237ff0 100644
--- a/src/backends/reference/workloads/RefFloorWorkload.hpp
+++ b/src/backends/reference/workloads/RefFloorWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefFloorWorkload : public BaseWorkload<FloorQueueDescriptor>
+class RefFloorWorkload : public RefBaseWorkload<FloorQueueDescriptor>
 {
 public:
-    using BaseWorkload<FloorQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<FloorQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
index 5a7951e..c6ea147 100644
--- a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
+++ b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
@@ -14,7 +14,7 @@
 {
 RefFullyConnectedWorkload::RefFullyConnectedWorkload(
     const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info)
-        : BaseWorkload<FullyConnectedQueueDescriptor>(descriptor, info)
+        : RefBaseWorkload<FullyConnectedQueueDescriptor>(descriptor, info)
 {
 }
 
diff --git a/src/backends/reference/workloads/RefFullyConnectedWorkload.hpp b/src/backends/reference/workloads/RefFullyConnectedWorkload.hpp
index 3ee4a4a..432a887 100644
--- a/src/backends/reference/workloads/RefFullyConnectedWorkload.hpp
+++ b/src/backends/reference/workloads/RefFullyConnectedWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "BaseIterator.hpp"
 #include "Decoders.hpp"
@@ -15,7 +15,7 @@
 namespace armnn
 {
 
-class RefFullyConnectedWorkload : public BaseWorkload<FullyConnectedQueueDescriptor>
+class RefFullyConnectedWorkload : public RefBaseWorkload<FullyConnectedQueueDescriptor>
 {
 public:
     explicit RefFullyConnectedWorkload(const FullyConnectedQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefGatherWorkload.hpp b/src/backends/reference/workloads/RefGatherWorkload.hpp
index a2698e3..ec880a5 100644
--- a/src/backends/reference/workloads/RefGatherWorkload.hpp
+++ b/src/backends/reference/workloads/RefGatherWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include <armnn/TypesUtils.hpp>
@@ -16,10 +16,10 @@
 namespace armnn
 {
 
-class RefGatherWorkload : public BaseWorkload<GatherQueueDescriptor>
+class RefGatherWorkload : public RefBaseWorkload<GatherQueueDescriptor>
 {
 public:
-    using BaseWorkload<GatherQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<GatherQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp
index e642dc9..c103a6b 100644
--- a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp
@@ -16,7 +16,7 @@
 RefInstanceNormalizationWorkload::RefInstanceNormalizationWorkload(
     const InstanceNormalizationQueueDescriptor& descriptor,
     const WorkloadInfo& info)
-    : BaseWorkload<InstanceNormalizationQueueDescriptor>(descriptor, info) {}
+    : RefBaseWorkload<InstanceNormalizationQueueDescriptor>(descriptor, info) {}
 
 void RefInstanceNormalizationWorkload::Execute() const
 {
diff --git a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.hpp b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.hpp
index 3283c44..a4b2dd3 100644
--- a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.hpp
+++ b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefInstanceNormalizationWorkload : public BaseWorkload<InstanceNormalizationQueueDescriptor>
+class RefInstanceNormalizationWorkload : public RefBaseWorkload<InstanceNormalizationQueueDescriptor>
 {
 public:
     explicit RefInstanceNormalizationWorkload(const InstanceNormalizationQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefL2NormalizationWorkload.cpp b/src/backends/reference/workloads/RefL2NormalizationWorkload.cpp
index ca31503..f6fcff3 100644
--- a/src/backends/reference/workloads/RefL2NormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefL2NormalizationWorkload.cpp
@@ -22,7 +22,7 @@
 RefL2NormalizationWorkload::RefL2NormalizationWorkload(
         const L2NormalizationQueueDescriptor& descriptor,
         const WorkloadInfo& info)
-    : BaseWorkload<L2NormalizationQueueDescriptor>(descriptor, info) {}
+    : RefBaseWorkload<L2NormalizationQueueDescriptor>(descriptor, info) {}
 
 void RefL2NormalizationWorkload::Execute() const
 {
diff --git a/src/backends/reference/workloads/RefL2NormalizationWorkload.hpp b/src/backends/reference/workloads/RefL2NormalizationWorkload.hpp
index dd129c6..c64e2ea 100644
--- a/src/backends/reference/workloads/RefL2NormalizationWorkload.hpp
+++ b/src/backends/reference/workloads/RefL2NormalizationWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefL2NormalizationWorkload : public BaseWorkload<L2NormalizationQueueDescriptor>
+class RefL2NormalizationWorkload : public RefBaseWorkload<L2NormalizationQueueDescriptor>
 {
 public:
     explicit RefL2NormalizationWorkload(const L2NormalizationQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefLogSoftmaxWorkload.hpp b/src/backends/reference/workloads/RefLogSoftmaxWorkload.hpp
index 9f87def..91ad5f6 100644
--- a/src/backends/reference/workloads/RefLogSoftmaxWorkload.hpp
+++ b/src/backends/reference/workloads/RefLogSoftmaxWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefLogSoftmaxWorkload : public BaseWorkload<LogSoftmaxQueueDescriptor>
+class RefLogSoftmaxWorkload : public RefBaseWorkload<LogSoftmaxQueueDescriptor>
 {
 public:
-    using BaseWorkload<LogSoftmaxQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<LogSoftmaxQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefLogicalBinaryWorkload.cpp b/src/backends/reference/workloads/RefLogicalBinaryWorkload.cpp
index f187e0c..f0cb846 100644
--- a/src/backends/reference/workloads/RefLogicalBinaryWorkload.cpp
+++ b/src/backends/reference/workloads/RefLogicalBinaryWorkload.cpp
@@ -19,7 +19,7 @@
 
 RefLogicalBinaryWorkload::RefLogicalBinaryWorkload(const LogicalBinaryQueueDescriptor& desc,
                                                    const WorkloadInfo& info)
-    : BaseWorkload<LogicalBinaryQueueDescriptor>(desc, info)
+    : RefBaseWorkload<LogicalBinaryQueueDescriptor>(desc, info)
 {}
 
 void RefLogicalBinaryWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefLogicalBinaryWorkload.hpp b/src/backends/reference/workloads/RefLogicalBinaryWorkload.hpp
index 053de7d..797d937 100644
--- a/src/backends/reference/workloads/RefLogicalBinaryWorkload.hpp
+++ b/src/backends/reference/workloads/RefLogicalBinaryWorkload.hpp
@@ -7,16 +7,16 @@
 
 #include "BaseIterator.hpp"
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefLogicalBinaryWorkload : public BaseWorkload<LogicalBinaryQueueDescriptor>
+class RefLogicalBinaryWorkload : public RefBaseWorkload<LogicalBinaryQueueDescriptor>
 {
 public:
-    using BaseWorkload<LogicalBinaryQueueDescriptor>::m_Data;
+    using RefBaseWorkload<LogicalBinaryQueueDescriptor>::m_Data;
 
     RefLogicalBinaryWorkload(const LogicalBinaryQueueDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
diff --git a/src/backends/reference/workloads/RefLogicalUnaryWorkload.cpp b/src/backends/reference/workloads/RefLogicalUnaryWorkload.cpp
index bef2bdc..ec0aa0e 100644
--- a/src/backends/reference/workloads/RefLogicalUnaryWorkload.cpp
+++ b/src/backends/reference/workloads/RefLogicalUnaryWorkload.cpp
@@ -19,7 +19,7 @@
 
 RefLogicalUnaryWorkload::RefLogicalUnaryWorkload(const ElementwiseUnaryQueueDescriptor& desc,
                                                  const WorkloadInfo& info)
-    : BaseWorkload<ElementwiseUnaryQueueDescriptor>(desc, info)
+    : RefBaseWorkload<ElementwiseUnaryQueueDescriptor>(desc, info)
 {}
 
 void RefLogicalUnaryWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefLogicalUnaryWorkload.hpp b/src/backends/reference/workloads/RefLogicalUnaryWorkload.hpp
index 008d24f..ebd5826 100644
--- a/src/backends/reference/workloads/RefLogicalUnaryWorkload.hpp
+++ b/src/backends/reference/workloads/RefLogicalUnaryWorkload.hpp
@@ -7,16 +7,16 @@
 
 #include "BaseIterator.hpp"
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefLogicalUnaryWorkload : public BaseWorkload<ElementwiseUnaryQueueDescriptor>
+class RefLogicalUnaryWorkload : public RefBaseWorkload<ElementwiseUnaryQueueDescriptor>
 {
 public:
-    using BaseWorkload<ElementwiseUnaryQueueDescriptor>::m_Data;
+    using RefBaseWorkload<ElementwiseUnaryQueueDescriptor>::m_Data;
 
     RefLogicalUnaryWorkload(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
diff --git a/src/backends/reference/workloads/RefLstmWorkload.cpp b/src/backends/reference/workloads/RefLstmWorkload.cpp
index 1ff6f50..8609811 100644
--- a/src/backends/reference/workloads/RefLstmWorkload.cpp
+++ b/src/backends/reference/workloads/RefLstmWorkload.cpp
@@ -15,7 +15,7 @@
 {
 
 RefLstmWorkload::RefLstmWorkload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info)
-    : BaseWorkload<LstmQueueDescriptor>(descriptor, info)
+    : RefBaseWorkload<LstmQueueDescriptor>(descriptor, info)
     , m_InputToInputWeightsTensor     (AssignScopedTensorHandle(descriptor.m_InputToInputWeights))
     , m_InputToForgetWeightsTensor    (AssignScopedTensorHandle(descriptor.m_InputToForgetWeights))
     , m_InputToCellWeightsTensor      (AssignScopedTensorHandle(descriptor.m_InputToCellWeights))
diff --git a/src/backends/reference/workloads/RefLstmWorkload.hpp b/src/backends/reference/workloads/RefLstmWorkload.hpp
index 72f6360..57526c9 100644
--- a/src/backends/reference/workloads/RefLstmWorkload.hpp
+++ b/src/backends/reference/workloads/RefLstmWorkload.hpp
@@ -7,13 +7,13 @@
 
 #include <armnn/TypesUtils.hpp>
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefLstmWorkload : public BaseWorkload<LstmQueueDescriptor>
+class RefLstmWorkload : public RefBaseWorkload<LstmQueueDescriptor>
 {
 public:
     explicit RefLstmWorkload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/backends/reference/workloads/RefMeanWorkload.cpp b/src/backends/reference/workloads/RefMeanWorkload.cpp
index 7941ce2..23abaf8 100644
--- a/src/backends/reference/workloads/RefMeanWorkload.cpp
+++ b/src/backends/reference/workloads/RefMeanWorkload.cpp
@@ -16,7 +16,7 @@
 {
 
 RefMeanWorkload::RefMeanWorkload(const MeanQueueDescriptor& descriptor, const WorkloadInfo& info)
-  :BaseWorkload<MeanQueueDescriptor>(descriptor, info) {}
+  :RefBaseWorkload<MeanQueueDescriptor>(descriptor, info) {}
 
 void RefMeanWorkload::Execute() const
 {
diff --git a/src/backends/reference/workloads/RefMeanWorkload.hpp b/src/backends/reference/workloads/RefMeanWorkload.hpp
index 2825d66..c4c6a12 100644
--- a/src/backends/reference/workloads/RefMeanWorkload.hpp
+++ b/src/backends/reference/workloads/RefMeanWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include "Decoders.hpp"
@@ -14,7 +14,7 @@
 namespace armnn
 {
 
-class RefMeanWorkload : public BaseWorkload<MeanQueueDescriptor>
+class RefMeanWorkload : public RefBaseWorkload<MeanQueueDescriptor>
 {
 public:
     explicit RefMeanWorkload (const MeanQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/backends/reference/workloads/RefNormalizationWorkload.cpp b/src/backends/reference/workloads/RefNormalizationWorkload.cpp
index 36828ac..613868d 100644
--- a/src/backends/reference/workloads/RefNormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefNormalizationWorkload.cpp
@@ -158,7 +158,7 @@
 
 RefNormalizationWorkload::RefNormalizationWorkload(const NormalizationQueueDescriptor& descriptor,
                                                    const WorkloadInfo& info)
-    : BaseWorkload(descriptor, info)
+    : RefBaseWorkload(descriptor, info)
 {}
 
 void RefNormalizationWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefNormalizationWorkload.hpp b/src/backends/reference/workloads/RefNormalizationWorkload.hpp
index b152072..5218e1e 100644
--- a/src/backends/reference/workloads/RefNormalizationWorkload.hpp
+++ b/src/backends/reference/workloads/RefNormalizationWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefNormalizationWorkload : public BaseWorkload<NormalizationQueueDescriptor>
+class RefNormalizationWorkload : public RefBaseWorkload<NormalizationQueueDescriptor>
 {
 public:
     explicit RefNormalizationWorkload(const NormalizationQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefPadWorkload.hpp b/src/backends/reference/workloads/RefPadWorkload.hpp
index 18c406a..c587105 100644
--- a/src/backends/reference/workloads/RefPadWorkload.hpp
+++ b/src/backends/reference/workloads/RefPadWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefPadWorkload : public BaseWorkload<PadQueueDescriptor>
+class RefPadWorkload : public RefBaseWorkload<PadQueueDescriptor>
 {
 public:
-    using BaseWorkload<PadQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<PadQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefPermuteWorkload.hpp b/src/backends/reference/workloads/RefPermuteWorkload.hpp
index 9424441..d1e4452 100644
--- a/src/backends/reference/workloads/RefPermuteWorkload.hpp
+++ b/src/backends/reference/workloads/RefPermuteWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 #include <armnn/TypesUtils.hpp>
 
diff --git a/src/backends/reference/workloads/RefPooling2dWorkload.hpp b/src/backends/reference/workloads/RefPooling2dWorkload.hpp
index 125fea8..a073e39 100644
--- a/src/backends/reference/workloads/RefPooling2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefPooling2dWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include "Decoders.hpp"
@@ -13,10 +13,10 @@
 
 namespace armnn
 {
-class RefPooling2dWorkload : public BaseWorkload<Pooling2dQueueDescriptor>
+class RefPooling2dWorkload : public RefBaseWorkload<Pooling2dQueueDescriptor>
 {
 public:
-    using BaseWorkload<Pooling2dQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<Pooling2dQueueDescriptor>::RefBaseWorkload;
 
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
diff --git a/src/backends/reference/workloads/RefPooling3dWorkload.hpp b/src/backends/reference/workloads/RefPooling3dWorkload.hpp
index 911c438..92bc476 100644
--- a/src/backends/reference/workloads/RefPooling3dWorkload.hpp
+++ b/src/backends/reference/workloads/RefPooling3dWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include "Decoders.hpp"
@@ -13,10 +13,10 @@
 
 namespace armnn
 {
-class RefPooling3dWorkload : public BaseWorkload<Pooling3dQueueDescriptor>
+class RefPooling3dWorkload : public RefBaseWorkload<Pooling3dQueueDescriptor>
 {
 public:
-    using BaseWorkload<Pooling3dQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<Pooling3dQueueDescriptor>::RefBaseWorkload;
 
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
diff --git a/src/backends/reference/workloads/RefPreluWorkload.cpp b/src/backends/reference/workloads/RefPreluWorkload.cpp
index c1d8de2..94eeea1 100644
--- a/src/backends/reference/workloads/RefPreluWorkload.cpp
+++ b/src/backends/reference/workloads/RefPreluWorkload.cpp
@@ -15,7 +15,7 @@
 
 RefPreluWorkload::RefPreluWorkload(const PreluQueueDescriptor& descriptor,
                                    const WorkloadInfo& info)
-    : BaseWorkload(descriptor, info)
+    : RefBaseWorkload(descriptor, info)
 {}
 
 void RefPreluWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefPreluWorkload.hpp b/src/backends/reference/workloads/RefPreluWorkload.hpp
index b5c97df..51ba2c1 100644
--- a/src/backends/reference/workloads/RefPreluWorkload.hpp
+++ b/src/backends/reference/workloads/RefPreluWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefPreluWorkload : public BaseWorkload<PreluQueueDescriptor>
+class RefPreluWorkload : public RefBaseWorkload<PreluQueueDescriptor>
 {
 public:
     explicit RefPreluWorkload(const PreluQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefQLstmWorkload.cpp b/src/backends/reference/workloads/RefQLstmWorkload.cpp
index dc29d0b..74f5f1e 100644
--- a/src/backends/reference/workloads/RefQLstmWorkload.cpp
+++ b/src/backends/reference/workloads/RefQLstmWorkload.cpp
@@ -14,7 +14,7 @@
 {
 
 RefQLstmWorkload::RefQLstmWorkload(const QLstmQueueDescriptor &descriptor, const WorkloadInfo &info)
-        : BaseWorkload<QLstmQueueDescriptor>(descriptor, info)
+        : RefBaseWorkload<QLstmQueueDescriptor>(descriptor, info)
         , m_InputToInputWeightsTensor     (AssignScopedTensorHandle(descriptor.m_InputToInputWeights))
         , m_InputToForgetWeightsTensor    (AssignScopedTensorHandle(descriptor.m_InputToForgetWeights))
         , m_InputToCellWeightsTensor      (AssignScopedTensorHandle(descriptor.m_InputToCellWeights))
diff --git a/src/backends/reference/workloads/RefQLstmWorkload.hpp b/src/backends/reference/workloads/RefQLstmWorkload.hpp
index 093cfd1..0e64a38 100644
--- a/src/backends/reference/workloads/RefQLstmWorkload.hpp
+++ b/src/backends/reference/workloads/RefQLstmWorkload.hpp
@@ -7,13 +7,13 @@
 
 #include <armnn/TypesUtils.hpp>
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefQLstmWorkload : public BaseWorkload<QLstmQueueDescriptor>
+class RefQLstmWorkload : public RefBaseWorkload<QLstmQueueDescriptor>
 {
 public:
     explicit RefQLstmWorkload(const QLstmQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/backends/reference/workloads/RefQuantizeWorkload.cpp b/src/backends/reference/workloads/RefQuantizeWorkload.cpp
index 35791e6..10ef0e5 100644
--- a/src/backends/reference/workloads/RefQuantizeWorkload.cpp
+++ b/src/backends/reference/workloads/RefQuantizeWorkload.cpp
@@ -29,7 +29,7 @@
 } //namespace
 
 RefQuantizeWorkload::RefQuantizeWorkload(const QuantizeQueueDescriptor& descriptor, const WorkloadInfo &info)
-    : BaseWorkload(descriptor, info)
+    : RefBaseWorkload(descriptor, info)
     , m_NumElements(info.m_InputTensorInfos[0].GetNumElements())
 {
 }
diff --git a/src/backends/reference/workloads/RefQuantizeWorkload.hpp b/src/backends/reference/workloads/RefQuantizeWorkload.hpp
index a32efa7..e382410 100644
--- a/src/backends/reference/workloads/RefQuantizeWorkload.hpp
+++ b/src/backends/reference/workloads/RefQuantizeWorkload.hpp
@@ -5,14 +5,14 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "Decoders.hpp"
 #include "Encoders.hpp"
 
 namespace armnn {
 
-class RefQuantizeWorkload : public BaseWorkload<QuantizeQueueDescriptor>
+class RefQuantizeWorkload : public RefBaseWorkload<QuantizeQueueDescriptor>
 {
 public:
     RefQuantizeWorkload(const QuantizeQueueDescriptor& descriptor, const WorkloadInfo &info);
diff --git a/src/backends/reference/workloads/RefRankWorkload.hpp b/src/backends/reference/workloads/RefRankWorkload.hpp
index e1f30c5..000828f 100644
--- a/src/backends/reference/workloads/RefRankWorkload.hpp
+++ b/src/backends/reference/workloads/RefRankWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include "RefWorkloadUtils.hpp"
@@ -13,10 +13,10 @@
 namespace armnn
 {
 
-struct RefRankWorkload : public BaseWorkload<RankQueueDescriptor>
+struct RefRankWorkload : public RefBaseWorkload<RankQueueDescriptor>
 {
 public:
-    using BaseWorkload<RankQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<RankQueueDescriptor>::RefBaseWorkload;
     virtual void Execute() const override
     {
         Execute(m_Data.m_Inputs, m_Data.m_Outputs);
diff --git a/src/backends/reference/workloads/RefReduceWorkload.cpp b/src/backends/reference/workloads/RefReduceWorkload.cpp
index 821e828..62881da 100644
--- a/src/backends/reference/workloads/RefReduceWorkload.cpp
+++ b/src/backends/reference/workloads/RefReduceWorkload.cpp
@@ -16,7 +16,7 @@
 RefReduceWorkload::RefReduceWorkload(
     const ReduceQueueDescriptor& descriptor,
     const WorkloadInfo& info)
-    : BaseWorkload<ReduceQueueDescriptor>(descriptor, info) {}
+    : RefBaseWorkload<ReduceQueueDescriptor>(descriptor, info) {}
 
 void RefReduceWorkload::Execute() const
 {
diff --git a/src/backends/reference/workloads/RefReduceWorkload.hpp b/src/backends/reference/workloads/RefReduceWorkload.hpp
index d2280cc..d759bc2 100644
--- a/src/backends/reference/workloads/RefReduceWorkload.hpp
+++ b/src/backends/reference/workloads/RefReduceWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefReduceWorkload : public BaseWorkload<ReduceQueueDescriptor>
+class RefReduceWorkload : public RefBaseWorkload<ReduceQueueDescriptor>
 {
 public:
     explicit RefReduceWorkload(const ReduceQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefReshapeWorkload.hpp b/src/backends/reference/workloads/RefReshapeWorkload.hpp
index 26a86c1..7596685 100644
--- a/src/backends/reference/workloads/RefReshapeWorkload.hpp
+++ b/src/backends/reference/workloads/RefReshapeWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefReshapeWorkload : public BaseWorkload<ReshapeQueueDescriptor>
+class RefReshapeWorkload : public RefBaseWorkload<ReshapeQueueDescriptor>
 {
 public:
-    using BaseWorkload<ReshapeQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<ReshapeQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefResizeWorkload.hpp b/src/backends/reference/workloads/RefResizeWorkload.hpp
index 82949ed..f774719 100644
--- a/src/backends/reference/workloads/RefResizeWorkload.hpp
+++ b/src/backends/reference/workloads/RefResizeWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefResizeWorkload : public BaseWorkload<ResizeQueueDescriptor>
+class RefResizeWorkload : public RefBaseWorkload<ResizeQueueDescriptor>
 {
 public:
-    using BaseWorkload<ResizeQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<ResizeQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefShapeWorkload.hpp b/src/backends/reference/workloads/RefShapeWorkload.hpp
index 209cccd..b7ed761 100644
--- a/src/backends/reference/workloads/RefShapeWorkload.hpp
+++ b/src/backends/reference/workloads/RefShapeWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include "RefWorkloadUtils.hpp"
@@ -13,10 +13,10 @@
 namespace armnn
 {
 
-struct RefShapeWorkload : public BaseWorkload<ShapeQueueDescriptor>
+struct RefShapeWorkload : public RefBaseWorkload<ShapeQueueDescriptor>
 {
 public:
-    using BaseWorkload<ShapeQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<ShapeQueueDescriptor>::RefBaseWorkload;
     virtual void Execute() const override
     {
         Execute(m_Data.m_Inputs, m_Data.m_Outputs);
diff --git a/src/backends/reference/workloads/RefSliceWorkload.hpp b/src/backends/reference/workloads/RefSliceWorkload.hpp
index 69dae5a..b9dca86 100644
--- a/src/backends/reference/workloads/RefSliceWorkload.hpp
+++ b/src/backends/reference/workloads/RefSliceWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefSliceWorkload : public BaseWorkload<SliceQueueDescriptor>
+class RefSliceWorkload : public RefBaseWorkload<SliceQueueDescriptor>
 {
 public:
-    using BaseWorkload<SliceQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<SliceQueueDescriptor>::RefBaseWorkload;
 
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
diff --git a/src/backends/reference/workloads/RefSoftmaxWorkload.hpp b/src/backends/reference/workloads/RefSoftmaxWorkload.hpp
index 42dbb53..cac102a 100644
--- a/src/backends/reference/workloads/RefSoftmaxWorkload.hpp
+++ b/src/backends/reference/workloads/RefSoftmaxWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefSoftmaxWorkload : public BaseWorkload<SoftmaxQueueDescriptor>
+class RefSoftmaxWorkload : public RefBaseWorkload<SoftmaxQueueDescriptor>
 {
 public:
-    using BaseWorkload<SoftmaxQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<SoftmaxQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
index ec764c7..eb2d93f 100644
--- a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
+++ b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
@@ -4,17 +4,17 @@
 //
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 #include <armnn/TypesUtils.hpp>
 
 namespace armnn
 {
 
-class RefSpaceToBatchNdWorkload : public BaseWorkload<SpaceToBatchNdQueueDescriptor>
+class RefSpaceToBatchNdWorkload : public RefBaseWorkload<SpaceToBatchNdQueueDescriptor>
 {
 public:
-    using BaseWorkload<SpaceToBatchNdQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<SpaceToBatchNdQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefSpaceToDepthWorkload.hpp b/src/backends/reference/workloads/RefSpaceToDepthWorkload.hpp
index bc71fde..17f8d2f 100644
--- a/src/backends/reference/workloads/RefSpaceToDepthWorkload.hpp
+++ b/src/backends/reference/workloads/RefSpaceToDepthWorkload.hpp
@@ -4,17 +4,17 @@
 //
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 #include <armnn/TypesUtils.hpp>
 
 namespace armnn
 {
 
-class RefSpaceToDepthWorkload : public BaseWorkload<SpaceToDepthQueueDescriptor>
+class RefSpaceToDepthWorkload : public RefBaseWorkload<SpaceToDepthQueueDescriptor>
 {
 public:
-    using BaseWorkload<SpaceToDepthQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<SpaceToDepthQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefSplitterWorkload.hpp b/src/backends/reference/workloads/RefSplitterWorkload.hpp
index 28dc83d..0b72bb9 100644
--- a/src/backends/reference/workloads/RefSplitterWorkload.hpp
+++ b/src/backends/reference/workloads/RefSplitterWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "Decoders.hpp"
 #include "Encoders.hpp"
@@ -13,10 +13,10 @@
 namespace armnn
 {
 
-class RefSplitterWorkload : public BaseWorkload<SplitterQueueDescriptor>
+class RefSplitterWorkload : public RefBaseWorkload<SplitterQueueDescriptor>
 {
 public:
-    using BaseWorkload<SplitterQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<SplitterQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefStackWorkload.cpp b/src/backends/reference/workloads/RefStackWorkload.cpp
index 3f7fd7b..f57e6e0 100644
--- a/src/backends/reference/workloads/RefStackWorkload.cpp
+++ b/src/backends/reference/workloads/RefStackWorkload.cpp
@@ -15,7 +15,7 @@
 
 RefStackWorkload::RefStackWorkload(const StackQueueDescriptor& descriptor,
                                    const WorkloadInfo& info)
-    : BaseWorkload(descriptor, info)
+    : RefBaseWorkload(descriptor, info)
 {}
 
 void RefStackWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefStackWorkload.hpp b/src/backends/reference/workloads/RefStackWorkload.hpp
index fbca11b..19f4a7b 100644
--- a/src/backends/reference/workloads/RefStackWorkload.hpp
+++ b/src/backends/reference/workloads/RefStackWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefStackWorkload : public BaseWorkload<StackQueueDescriptor>
+class RefStackWorkload : public RefBaseWorkload<StackQueueDescriptor>
 {
 public:
     explicit RefStackWorkload(const StackQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefStridedSliceWorkload.cpp b/src/backends/reference/workloads/RefStridedSliceWorkload.cpp
index 336a687..41fe4c3 100644
--- a/src/backends/reference/workloads/RefStridedSliceWorkload.cpp
+++ b/src/backends/reference/workloads/RefStridedSliceWorkload.cpp
@@ -12,7 +12,7 @@
 
 RefStridedSliceWorkload::RefStridedSliceWorkload(const StridedSliceQueueDescriptor& descriptor,
                                                  const WorkloadInfo& info)
-    : BaseWorkload(descriptor, info)
+    : RefBaseWorkload(descriptor, info)
 {}
 
 void RefStridedSliceWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefStridedSliceWorkload.hpp b/src/backends/reference/workloads/RefStridedSliceWorkload.hpp
index d2ffca7..ea443cf 100644
--- a/src/backends/reference/workloads/RefStridedSliceWorkload.hpp
+++ b/src/backends/reference/workloads/RefStridedSliceWorkload.hpp
@@ -5,12 +5,12 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 namespace armnn
 {
 
-class RefStridedSliceWorkload : public BaseWorkload<StridedSliceQueueDescriptor>
+class RefStridedSliceWorkload : public RefBaseWorkload<StridedSliceQueueDescriptor>
 {
 public:
     RefStridedSliceWorkload(const StridedSliceQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp
index 8665648..64a2d4c 100644
--- a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp
@@ -15,7 +15,7 @@
 
 RefTransposeConvolution2dWorkload::RefTransposeConvolution2dWorkload(
     const TransposeConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) :
-    BaseWorkload<TransposeConvolution2dQueueDescriptor>(descriptor, info)
+    RefBaseWorkload<TransposeConvolution2dQueueDescriptor>(descriptor, info)
 {
     // set up weights decoder
     m_Weights = std::make_unique<ScopedTensorHandle>(*(descriptor.m_Weight));
diff --git a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.hpp b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.hpp
index aa2546f..6bcee9a 100644
--- a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.hpp
@@ -9,12 +9,12 @@
 #include "Encoders.hpp"
 
 #include <armnn/backends/TensorHandle.hpp>
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 namespace armnn
 {
 
-class RefTransposeConvolution2dWorkload : public BaseWorkload<TransposeConvolution2dQueueDescriptor>
+class RefTransposeConvolution2dWorkload : public RefBaseWorkload<TransposeConvolution2dQueueDescriptor>
 {
 public:
     RefTransposeConvolution2dWorkload(const TransposeConvolution2dQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefTransposeWorkload.hpp b/src/backends/reference/workloads/RefTransposeWorkload.hpp
index bf59de7..b8c3649 100644
--- a/src/backends/reference/workloads/RefTransposeWorkload.hpp
+++ b/src/backends/reference/workloads/RefTransposeWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 #include <armnn/TypesUtils.hpp>
 
diff --git a/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp b/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp
index 311fa18..d447a46 100644
--- a/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp
+++ b/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp
@@ -19,7 +19,7 @@
 RefUnidirectionalSequenceLstmWorkload::RefUnidirectionalSequenceLstmWorkload(
     const UnidirectionalSequenceLstmQueueDescriptor& descriptor,
     const WorkloadInfo& info)
-    : BaseWorkload<UnidirectionalSequenceLstmQueueDescriptor>(descriptor, info)
+    : RefBaseWorkload<UnidirectionalSequenceLstmQueueDescriptor>(descriptor, info)
     , m_InputToInputWeightsTensor     (AssignScopedTensorHandle(descriptor.m_InputToInputWeights))
     , m_InputToForgetWeightsTensor    (AssignScopedTensorHandle(descriptor.m_InputToForgetWeights))
     , m_InputToCellWeightsTensor      (AssignScopedTensorHandle(descriptor.m_InputToCellWeights))
diff --git a/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.hpp b/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.hpp
index d0c000f..7a91cee 100644
--- a/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.hpp
+++ b/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.hpp
@@ -7,7 +7,7 @@
 
 #include <armnn/TypesUtils.hpp>
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include "Encoders.hpp"
@@ -16,7 +16,7 @@
 namespace armnn
 {
 
-class RefUnidirectionalSequenceLstmWorkload : public BaseWorkload<UnidirectionalSequenceLstmQueueDescriptor>
+class RefUnidirectionalSequenceLstmWorkload : public RefBaseWorkload<UnidirectionalSequenceLstmQueueDescriptor>
 {
 public:
     explicit RefUnidirectionalSequenceLstmWorkload(const UnidirectionalSequenceLstmQueueDescriptor& descriptor,