Typo in Guide

* Only file changed is shim/BuildGuideShimSupportLibrary.md
* All other files are merge squash from Arm NN 22.02

Signed-off-by: Kevin May <kevin.may@arm.com>
Change-Id: Id82a6e9ac8abf74c1073c08744712f50e98dece0
diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index fd7279a..bcceaf4 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -139,6 +139,13 @@
     bool useExternalMemoryManager = false;
     bool useInternalMemoryManager = false;
     Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+
+    if (!networkProperties.m_AsyncEnabled)
+    {
+        m_IsInputImported = std::vector<bool>(order.GetNumInputs(), false);
+        m_IsOutputImported = std::vector<bool>(order.GetNumOutputs(), false);
+    }
+
     for (auto&& layer : order)
     {
         auto const& backendId = layer->GetBackendId();
@@ -312,44 +319,6 @@
                     }
                     else
                     {
-                        if (layer->GetNumInputSlots() >= 1)
-                        {
-                            unsigned int inputSlotIndex = 0;
-                            for (auto& inputSlot : layer->GetInputSlots())
-                            {
-                                if (inputSlot.GetConnectedOutputSlot()->GetOwningLayer().GetType() == LayerType::Input)
-                                {
-                                    auto inputLayer =
-                                        PolymorphicDowncast<InputLayer*>(
-                                            &inputSlot.GetConnectedOutputSlot()->GetOwningLayer());
-                                    m_InputWorkloadSlotPairs[inputLayer->GetBindingId()] =
-                                        std::make_pair(m_WorkloadQueue.size(), inputSlotIndex);
-                                }
-                                ++inputSlotIndex;
-                            }
-                        }
-
-                        if (layer->GetNumOutputSlots() >= 1)
-                        {
-                            unsigned int outputSlotIndex = 0;
-                            for (auto& outputSlot : layer->GetOutputSlots())
-                            {
-                                for (unsigned int i = 0; i < outputSlot.GetNumConnections(); i++)
-                                {
-                                    // If any of the connections on this outputSlot are connected to an Output then
-                                    // Add its index within layer->GetOutputSlots() to m_OutputWorkloadSlotPairs
-                                    if (outputSlot.GetConnection(i)->GetOwningLayer().GetType() == LayerType::Output)
-                                    {
-                                        auto outputLayer = PolymorphicDowncast<OutputLayer*>(
-                                            &outputSlot.GetConnection(i)->GetOwningLayer());
-                                        m_OutputWorkloadSlotPairs[outputLayer->GetBindingId()] =
-                                            std::make_pair(m_WorkloadQueue.size(), outputSlotIndex);
-                                        continue;
-                                    }
-                                }
-                                ++outputSlotIndex;
-                            }
-                        }
                         m_WorkloadQueue.push_back(std::move(workload));
                     }
 
@@ -361,6 +330,100 @@
         }
     }
 
+    // Gather information about workloads for inputs & outputs
+    if (!networkProperties.m_AsyncEnabled && m_WorkloadQueue.size() != 0)
+    {
+        const int noOfInputs = armnn::numeric_cast<int>(order.GetNumInputs());
+
+        // Get indices of all workloads connected to each input and
+        // check if they support tensor handle replacement
+        for (const BindableLayer* layer: order.GetInputLayers())
+        {
+            const auto bindingId = layer->GetBindingId();
+
+            bool supportsReplacement = true;
+
+            for (const auto inputSlot: layer->GetOutputSlot(0).GetConnections())
+            {
+                auto workloadIndex = std::distance(order.begin(), order.GetPosInGraph(inputSlot->GetOwningLayer()));
+                workloadIndex -= noOfInputs;
+
+                m_InputWorkloadSlotPairs[bindingId].emplace_back(WorkloadIndices{
+                        armnn::numeric_cast<unsigned int>(workloadIndex), inputSlot->GetSlotIndex()});
+
+                auto workload = m_WorkloadQueue[m_InputWorkloadSlotPairs[bindingId].back().m_WorkloadIndex].get();
+                supportsReplacement &= workload->SupportsTensorHandleReplacement();
+            }
+
+            ITensorHandleFactory::FactoryId factoryId = layer->GetOutputSlot(0).GetTensorHandleFactoryId();
+            // Get matching import factory Id
+            ITensorHandleFactory::FactoryId importFactoryId =
+                    m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
+
+            ITensorHandleFactory *importFactory = m_TensorHandleFactoryRegistry.GetFactory(importFactoryId);
+
+            if (supportsReplacement && importFactory)
+            {
+                m_PreImportedInputHandles.emplace_back(
+                        bindingId, importFactory->CreateTensorHandle(layer->GetOutputSlot(0).GetTensorInfo(), false));
+            }
+            else
+            {
+                m_PreImportedInputHandles.emplace_back(bindingId, nullptr);
+            }
+        }
+
+        // Get indices of all workloads connected to each output and
+        // check if they support tensor handle replacement
+        for (const BindableLayer* layer: order.GetOutputLayers())
+        {
+            const auto bindingId = layer->GetBindingId();
+
+            const auto outputSlot = layer->GetInputSlot(0).GetConnectedOutputSlot();
+            auto& indices = m_OutputWorkloadSlotPairs[bindingId];
+
+            auto workloadIndex = std::distance(order.begin(), order.GetPosInGraph(outputSlot->GetOwningLayer()));
+            workloadIndex -= noOfInputs;
+
+            indices.m_OutputSlotIndices = WorkloadIndices{numeric_cast<unsigned int>(workloadIndex),
+                                                          outputSlot->CalculateIndexOnOwner()};
+
+            bool supportsReplacement = true;
+            auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
+            supportsReplacement &= outputWorkload->SupportsTensorHandleReplacement();
+
+            for (auto &inputSlot: outputSlot->GetConnections())
+            {
+                if(inputSlot->GetOwningLayer().GetType() != LayerType::Output)
+                {
+                    auto inWorkloadIndex = std::distance(order.begin(),
+                                                         order.GetPosInGraph(inputSlot->GetOwningLayer()));
+                    inWorkloadIndex -= noOfInputs;
+                    indices.m_InputSlotIndices.emplace_back(WorkloadIndices{numeric_cast<unsigned int>(inWorkloadIndex),
+                                                            inputSlot->GetSlotIndex()});
+                    auto inputWorkload = m_WorkloadQueue[indices.m_InputSlotIndices.back().m_WorkloadIndex].get();
+                    supportsReplacement &= inputWorkload->SupportsTensorHandleReplacement();
+                }
+            }
+
+            ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId();
+            // Get matching import factory Id
+            ITensorHandleFactory::FactoryId importFactoryId =
+                    m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
+            ITensorHandleFactory *importFactory = m_TensorHandleFactoryRegistry.GetFactory(importFactoryId);
+
+            if (supportsReplacement && importFactory)
+            {
+                m_PreImportedOutputHandles.emplace_back(
+                        bindingId, importFactory->CreateTensorHandle(outputSlot->GetTensorInfo(), false));
+            }
+            else
+            {
+                m_PreImportedOutputHandles.emplace_back(bindingId, nullptr);
+            }
+        }
+    }
+
     for (auto&& workloadFactory : m_WorkloadFactories)
     {
         workloadFactory.second->AfterWorkloadsCreated();
@@ -699,77 +762,133 @@
         m_InputQueue.clear();
         m_InputQueue.reserve(graph.GetNumInputs());
 
+        if (preImportedInputIds.size() > graph.GetNumInputs())
+        {
+            throw InvalidArgumentException("Invalid number of preImportedInputIds");
+        }
+
+        unsigned int inputIndex = 0;
+        unsigned int importedInputIdIndex = 0;
+        std::sort(preImportedInputIds.begin(), preImportedInputIds.end());
         for (const BindableLayer* inputLayer : graph.GetInputLayers())
         {
-            if (preImportedInputIds.size() > graph.GetNumInputs())
+            if (importedInputIdIndex < preImportedInputIds.size() &&
+                inputIndex == preImportedInputIds[importedInputIdIndex])
             {
-                throw InvalidArgumentException("Invalid number of preImportedInputIds");
-            }
-            auto layerBindingId = inputLayer->GetBindingId();
-            auto it = std::find_if(preImportedInputIds.begin(), preImportedInputIds.end(),
-                                   [=](auto preImportedInputId)
-            {
-                return m_PreImportedInputHandles[preImportedInputId].m_LayerBindingId == layerBindingId;
-            });
+                // Only replace tensor handles if they have not already been replaced
+                if (!m_IsInputImported[inputIndex])
+                {
+                    auto outputTensorHandle = m_PreImportedInputHandles[inputIndex].m_TensorHandle.get();
 
-            if (it == preImportedInputIds.end())
+                    for (const auto& workloadInfo: m_InputWorkloadSlotPairs[inputLayer->GetBindingId()])
+                    {
+                        auto workload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
+                        workload->ReplaceInputTensorHandle(outputTensorHandle, workloadInfo.m_SlotIndex);
+                    }
+                    m_IsInputImported[inputIndex] = true;
+                }
+                importedInputIdIndex++;
+            }
+            else
             {
+                if (m_IsInputImported[inputIndex])
+                {
+                    OutputHandler& handler = const_cast<OutputHandler&>(inputLayer->GetOutputHandler(0));
+
+                    for (const auto& workloadInfo: m_InputWorkloadSlotPairs[inputLayer->GetBindingId()])
+                    {
+                        auto workload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
+                        workload->ReplaceInputTensorHandle(handler.GetData(), workloadInfo.m_SlotIndex);
+                    }
+
+                    m_IsInputImported[inputIndex] = false;
+                }
+
                 // InputTensorHandle is not imported yet, process to enqueue input
                 const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
                 EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
             }
+            inputIndex++;
         }
     }
-
     // For each output to the network, call EnqueueOutput with the data passed by the user.
     {
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
         m_OutputQueue.clear();
         m_OutputQueue.reserve(graph.GetNumOutputs());
 
+        if (preImportedOutputIds.size() > graph.GetNumOutputs())
+        {
+            throw InvalidArgumentException("Invalid number of preImportedOutputIds");
+        }
+
+        unsigned int outputIndex = 0;
+        unsigned int importedOutputIdIndex = 0;
+        std::sort(preImportedOutputIds.begin(), preImportedOutputIds.end());
         for (const BindableLayer* outputLayer : graph.GetOutputLayers())
         {
-            if (preImportedOutputIds.size() > graph.GetNumOutputs())
+            if (importedOutputIdIndex < preImportedOutputIds.size() &&
+                outputIndex == preImportedOutputIds[importedOutputIdIndex])
             {
-                throw InvalidArgumentException("Invalid number of preImportedOutputIds");
-            }
-            auto layerBindingId = outputLayer->GetBindingId();
-            auto it = std::find_if(preImportedOutputIds.begin(), preImportedOutputIds.end(),
-                                   [=](auto preImportedOutputId)
-            {
-                return m_PreImportedOutputHandles[preImportedOutputId].m_LayerBindingId == layerBindingId;
-            });
+                // Only replace tensor handles if they have not already been replaced
+                ITensorHandle* inputTensorHandle = m_PreImportedOutputHandles[outputIndex].m_TensorHandle.get();
 
-            const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
+                if (!m_IsOutputImported[outputIndex])
+                {
+                    const auto bindingId = outputLayer->GetBindingId();
+                    const auto& indices = m_OutputWorkloadSlotPairs[bindingId];
 
-            if (it == preImportedOutputIds.end())
-            {
-                // OutputTensorHandle is not imported yet, process to enqueue Output
-                EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
-            }
-            else
-            {
-                // Insert synchronization workload for the imported output
-                OutputQueueDescriptor outputQueueDescriptor;
-                WorkloadInfo info;
+                    auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
 
-                outputQueueDescriptor.m_Outputs.push_back(pin.GetTensorHandle());
-                info.m_OutputTensorInfos.push_back(pin.GetTensorInfo());
+                    outputWorkload->ReplaceOutputTensorHandle(inputTensorHandle,
+                                                              indices.m_OutputSlotIndices.m_SlotIndex);
 
-                // Gets the output handler from the previous node.
-                const OutputHandler& outputHandler =
-                    outputLayer->GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler();
+                    for (const auto& workloadInfo: indices.m_InputSlotIndices)
+                    {
+                        auto inputWorkload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
+                        inputWorkload->ReplaceInputTensorHandle(inputTensorHandle, workloadInfo.m_SlotIndex);
+                    }
+                    m_IsOutputImported[outputIndex] = true;
+                }
 
-                const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo();
-                ITensorHandle* inputTensorHandle = outputHandler.GetData();
                 ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
                 MemSyncQueueDescriptor syncDesc;
                 syncDesc.m_Inputs.push_back(inputTensorHandle);
-                info.m_InputTensorInfos.push_back(inputTensorInfo);
+                WorkloadInfo info;
+                info.m_InputTensorInfos.push_back(
+                        outputLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo());
                 auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
                 ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
                 m_OutputQueue.push_back(move(syncWorkload));
+                importedOutputIdIndex++;
             }
+            else
+            {
+                if (m_IsOutputImported[outputIndex])
+                {
+                    const auto bindingId = outputLayer->GetBindingId();
+                    const auto& indices = m_OutputWorkloadSlotPairs[bindingId];
+
+                    auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
+                    const OutputHandler& outputHandler =
+                            outputLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetOutputHandler();
+
+                    outputWorkload->ReplaceOutputTensorHandle(
+                            outputHandler.GetData(), indices.m_OutputSlotIndices.m_SlotIndex);
+
+                    for (const auto& workloadInfo: indices.m_InputSlotIndices)
+                    {
+                        auto inputWorkload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
+                        inputWorkload->ReplaceInputTensorHandle(outputHandler.GetData(), workloadInfo.m_SlotIndex);
+                    }
+                    m_IsOutputImported[outputIndex] = false;
+                }
+
+                const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
+                // OutputTensorHandle is not imported yet, proceed to enqueue Output
+                EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+            }
+            outputIndex++;
         }
     }
 
@@ -806,6 +925,7 @@
         timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
         timelineUtils->Commit();
     }
+
     return executionSucceeded ? Status::Success : Status::Failure;
 }
 
@@ -1186,14 +1306,13 @@
 std::vector<ImportedInputId> LoadedNetwork::ImportInputs(const InputTensors& inputTensors,
                                                          MemorySource forceImportMemorySource)
 {
-    if (!m_NetworkProperties.m_ImportEnabled)
+    if (!m_NetworkProperties.m_AsyncEnabled)
     {
         // Cannot import if import is not enabled and forceImportMemorySource is undefined
         if (forceImportMemorySource == MemorySource::Undefined)
         {
             throw MemoryImportException("ImportInputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
         }
-        // If forceImportMemorySource is defined, try import if memory is aligned
         if (inputTensors.size() != m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumInputs())
         {
             throw MemoryImportException("ImportInputs: Force Import failed, incorrect number of tensors");
@@ -1201,85 +1320,42 @@
 
         std::vector<ImportedInputId> importedInputs;
         Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
-        for (auto inputTensor : inputTensors)
+        unsigned int inputIndex = 0;
+        for (const BindableLayer* inputLayer : graph.GetInputLayers())
         {
-            auto layerBindingId = inputTensor.first;
-            auto it = std::find_if(graph.GetInputLayers().begin(), graph.GetInputLayers().end(), [=](auto* layer)
+            auto outputTensorHandle = m_PreImportedInputHandles[inputIndex].m_TensorHandle.get();
+
+            if (!outputTensorHandle)
             {
-                return layer->GetBindingId() == layerBindingId;
+                inputIndex++;
+                continue;
+            }
+
+            auto layerBindingId = inputLayer->GetBindingId();
+            auto it = std::find_if(inputTensors.begin(), inputTensors.end(), [=](const auto& inputTensor)
+            {
+                return inputTensor.first == layerBindingId;
             });
 
-            if (it == graph.GetInputLayers().end())
+            if (it == inputTensors.end())
             {
-                throw MemoryImportException(fmt::format(
-                    "ImportInputs: Memory Import failed, unknown LayerBindingId: {}", layerBindingId));
+                inputIndex++;
+                continue;
             }
 
-            const Layer* layer = *it;
-            if (layer->GetType() != LayerType::Input)
-            {
-                throw InvalidArgumentException("ImportInputs: given layer not an InputLayer");
-            }
-            const OutputSlot& outputSlot = layer->GetOutputSlots()[0];
-            ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
-            // Get matching import factory Id
-            ITensorHandleFactory::FactoryId importFactoryId =
-                m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
-            ITensorHandleFactory* importFactory =
-                m_TensorHandleFactoryRegistry.GetFactory(importFactoryId, forceImportMemorySource);
-            if (!importFactory)
-            {
-                throw MemoryImportException("ImportInputs: Force Import failed, cannot find matching Import Factory");
-            }
-
-            OutputHandler& handler = const_cast<OutputHandler&>(layer->GetOutputHandler(0));
-            handler.SetAllocatedData();
-            handler.CreateTensorHandles(*importFactory, false);
-            ITensorHandle* outputTensorHandle = handler.GetData();
+            const auto& inputTensor = *it;
             std::unique_ptr<ITensorHandle> passThroughTensorHandle =
                     std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
                                                                    inputTensor.second.GetMemoryArea());
-            // Check if the input memory can be imported
-            if (outputTensorHandle->CanBeImported(passThroughTensorHandle->Map(), forceImportMemorySource))
-            {
-                passThroughTensorHandle->Unmap();
-                if (outputTensorHandle->Import(passThroughTensorHandle->Map(), forceImportMemorySource))
-                {
-                    passThroughTensorHandle->Unmap();
-                    try
-                    {
-                        m_WorkloadQueue[m_InputWorkloadSlotPairs[layerBindingId].first].get()->ReplaceInputTensorHandle(
-                            outputTensorHandle, m_InputWorkloadSlotPairs[layerBindingId].second);
-                        importedInputs.push_back(m_CurImportedInputId++);
-                        // For force import, we want OutputHandler to own the TensorHandle,
-                        // so we do not move the TensorHandle to m_PreImportedInputHandles as in AsyncEnabled networks
-                        ImportedTensorHandlePin importedTensorHandlePin{layerBindingId, nullptr};
-                        m_PreImportedInputHandles.push_back(std::move(importedTensorHandlePin));
-                    }
-                    catch(armnn::UnimplementedException& e)
-                    {
-                        IgnoreUnused(e);
-                        // Method not implement, cannot use import tensor and have to use allocated data instead
-                        handler.UseAllocatedData();
-                    }
-                }
-            }
-            else
-            {
-                // Cannot import, use allocated data
-                handler.UseAllocatedData();
-                // Ensure that the workload get correct tensor
-                try
-                {
-                    m_WorkloadQueue[m_InputWorkloadSlotPairs[layerBindingId].first].get()->ReplaceInputTensorHandle(
-                        handler.GetData(), m_InputWorkloadSlotPairs[layerBindingId].second);
-                }
-                catch(armnn::UnimplementedException& e)
-                {
-                    IgnoreUnused(e);
-                }
-            }
 
+            if (outputTensorHandle->CanBeImported(passThroughTensorHandle->Map(), forceImportMemorySource)
+                && (outputTensorHandle->Import(passThroughTensorHandle->Map(), forceImportMemorySource)))
+            {
+                importedInputs.push_back(inputIndex);
+            }
+            passThroughTensorHandle->Unmap();
+
+            inputIndex++;
         }
 
         return importedInputs;
@@ -1363,7 +1439,7 @@
 std::vector<ImportedOutputId> LoadedNetwork::ImportOutputs(const OutputTensors& outputTensors,
                                                            MemorySource forceImportMemorySource)
 {
-    if (!m_NetworkProperties.m_ExportEnabled)
+    if (!m_NetworkProperties.m_AsyncEnabled)
     {
         // Cannot import if import is not enabled and forceImportMemorySource is undefined
         if (forceImportMemorySource == MemorySource::Undefined)
@@ -1377,85 +1453,38 @@
         }
         std::vector<ImportedInputId> importedOutputs;
         Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
-        for (auto outputTensor : outputTensors)
+
+        unsigned int outputIndex = 0;
+        for (const BindableLayer* const outputLayer : graph.GetOutputLayers())
         {
-            auto layerBindingId = outputTensor.first;
-            auto it = std::find_if(graph.GetOutputLayers().begin(), graph.GetOutputLayers().end(), [=](auto* layer)
+            auto inputTensorHandle = m_PreImportedOutputHandles[outputIndex].m_TensorHandle.get();
+
+            if (!inputTensorHandle)
             {
-                return layer->GetBindingId() == layerBindingId;
+                outputIndex++;
+                continue;
+            }
+
+            auto layerBindingId = outputLayer->GetBindingId();
+            auto it = std::find_if(outputTensors.begin(), outputTensors.end(), [=] (const auto& outputTensor)
+            {
+                return outputTensor.first == layerBindingId;
             });
 
-            if (it == graph.GetOutputLayers().end())
+            if (it == outputTensors.end())
             {
-                throw MemoryImportException(fmt::format("ImportOutputs: Memory Import failed, "
-                                                        "unknown LayerBindingId: {}",
-                                                        layerBindingId));
+                outputIndex++;
+                continue;
             }
 
-            const Layer* layer = *it;
-            if (layer->GetType() != LayerType::Output)
-            {
-                throw InvalidArgumentException("ImportOutputs: given layer not an OutputLayer");
-            }
-
-            const OutputSlot* outputSlot = layer->GetInputSlots()[0].GetConnectedOutputSlot();
-            ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId();
-            ITensorHandleFactory::FactoryId importFactoryId =
-                m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
-            ITensorHandleFactory* importFactory =
-                m_TensorHandleFactoryRegistry.GetFactory(importFactoryId, forceImportMemorySource);
-            if (!importFactory)
-            {
-                throw MemoryImportException("ImportOutputs: Force Import failed, cannot find matching Import Factory");
-            }
-
-            OutputHandler& outputHandler =
-                const_cast<OutputHandler&>(layer->GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler());
-            outputHandler.SetAllocatedData();
-            ITensorHandle* inputTensorHandle = outputHandler.GetData();
-            outputHandler.CreateTensorHandles(*importFactory, false);
-            inputTensorHandle = outputHandler.GetData();
-
+            const auto outputTensor = *it;
             // Check if the output memory can be imported
-            if (inputTensorHandle->CanBeImported(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
+            if (inputTensorHandle->CanBeImported(outputTensor.second.GetMemoryArea(), forceImportMemorySource)
+                && inputTensorHandle->Import(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
             {
-                if (inputTensorHandle->Import(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
-                {
-                    try
-                    {
-                        m_WorkloadQueue[m_OutputWorkloadSlotPairs[layerBindingId].first].get()->
-                            ReplaceOutputTensorHandle(inputTensorHandle,
-                                                      m_OutputWorkloadSlotPairs[layerBindingId].second);
-                        importedOutputs.push_back(m_CurImportedOutputId++);
-                        // For force import, we want OutputHandler to own the TensorHandle,
-                        // so we do not move the TensorHandle to m_PreImportedOutputHandles as in AsyncEnabled networks
-                        ImportedTensorHandlePin importedTensorHandlePin{layerBindingId, nullptr};
-                        m_PreImportedOutputHandles.push_back(std::move(importedTensorHandlePin));
-                    }
-                    catch(armnn::UnimplementedException& e)
-                    {
-                        IgnoreUnused(e);
-                        // Method not implement, cannot use import tensor and have to use allocated data instead
-                        outputHandler.UseAllocatedData();
-                    }
-                }
+                importedOutputs.push_back(outputIndex);
             }
-            else
-            {
-                // Cannot import, use allocated memory
-                outputHandler.UseAllocatedData();
-                // Ensure that the workload get correct tensor
-                try
-                {
-                    m_WorkloadQueue[m_OutputWorkloadSlotPairs[layerBindingId].first].get()->
-                            ReplaceOutputTensorHandle(outputHandler.GetData(),
-                                                      m_OutputWorkloadSlotPairs[layerBindingId].second);
-                }
-                catch(armnn::UnimplementedException& e)
-                {
-                    IgnoreUnused(e);
-                }
-            }
+            outputIndex++;
         }
         return importedOutputs;
     }
diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp
index f637dec..dc2f4dc 100644
--- a/src/armnn/LoadedNetwork.hpp
+++ b/src/armnn/LoadedNetwork.hpp
@@ -204,8 +204,21 @@
 
     // A set of vectors to record the workload queue indexes and their corresponding Input/Output Slot indexes
     // which are connected to Inputs and Outputs for the network.
-    std::unordered_map<LayerBindingId, std::pair<unsigned int, unsigned int>> m_InputWorkloadSlotPairs;
-    std::unordered_map<LayerBindingId, std::pair<unsigned int, unsigned int>> m_OutputWorkloadSlotPairs;
+    struct WorkloadIndices
+    {
+        unsigned int m_WorkloadIndex;
+        unsigned int m_SlotIndex;
+    };
+
+    struct OutputWorkloadIndices
+    {
+        WorkloadIndices m_OutputSlotIndices;
+        std::vector<WorkloadIndices> m_InputSlotIndices;
+    };
+    std::unordered_map<LayerBindingId, std::vector<WorkloadIndices>> m_InputWorkloadSlotPairs;
+    std::unordered_map<LayerBindingId, OutputWorkloadIndices> m_OutputWorkloadSlotPairs;
+    std::vector<bool> m_IsInputImported;
+    std::vector<bool> m_IsOutputImported;
 
 };
 
diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp
index 1abe0f3..a913681 100644
--- a/src/armnn/Runtime.cpp
+++ b/src/armnn/Runtime.cpp
@@ -630,15 +630,18 @@
     auto status = loadedNetwork->EnqueueWorkload(inputTensors, outputTensors,
                                                  preImportedInputIds, preImportedOutputIds);
 
+
+    // Check if we imported; if not, there's no need to call the After EnqueueWorkload events
+    if (!preImportedInputIds.empty() || !preImportedOutputIds.empty())
+    {
+        // Call After EnqueueWorkload events
+        for (auto&& context : m_BackendContexts)
+        {
+            context.second->AfterEnqueueWorkload(networkId);
+        }
+    }
     ARMNN_LOG(info) << "Execution time: " << std::setprecision(2)
                     << std::fixed << armnn::GetTimeDuration(startTime).count() << " ms.";
-
-    // Call After EnqueueWorkload events
-    for (auto&& context : m_BackendContexts)
-    {
-        context.second->AfterEnqueueWorkload(networkId);
-    }
-
     return status;
 }
 
diff --git a/src/armnnTestUtils/CMakeLists.txt b/src/armnnTestUtils/CMakeLists.txt
index e33fed7..061dd90 100755
--- a/src/armnnTestUtils/CMakeLists.txt
+++ b/src/armnnTestUtils/CMakeLists.txt
@@ -11,7 +11,6 @@
         ../../include/armnnTestUtils/MockBackend.hpp
         ../../include/armnnTestUtils/MockMemoryManager.hpp
         ../../include/armnnTestUtils/MockTensorHandle.hpp
-        ../../include/armnnTestUtils/MockWorkloadFactoryHelper.hpp
         ../../include/armnnTestUtils/PredicateResult.hpp
         ../../include/armnnTestUtils/TensorCopyUtils.hpp
         ../../include/armnnTestUtils/WorkloadTestUtils.hpp
@@ -57,4 +56,4 @@
         ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
         RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 
-add_library(Armnn::armnnTestUtils ALIAS armnnTestUtils)
\ No newline at end of file
+add_library(Armnn::armnnTestUtils ALIAS armnnTestUtils)
diff --git a/src/armnnTestUtils/UnitTests.hpp b/src/armnnTestUtils/UnitTests.hpp
index f560720..71324db 100644
--- a/src/armnnTestUtils/UnitTests.hpp
+++ b/src/armnnTestUtils/UnitTests.hpp
@@ -155,11 +155,11 @@
 {
     auto memoryManager = WorkloadFactoryHelper<FactoryType>::GetMemoryManager();
     FactoryType workloadFactory = WorkloadFactoryHelper<FactoryType>::GetFactory(memoryManager);
+    auto tensorHandleFactory = WorkloadFactoryHelper<FactoryType>::GetTensorHandleFactory(memoryManager);
 
     armnn::RefWorkloadFactory refWorkloadFactory;
-    auto tensorHandleFactory = WorkloadFactoryHelper<FactoryType>::GetTensorHandleFactory(memoryManager);
-    auto refTensorHandleFactory =
-        RefWorkloadFactoryHelper::GetTensorHandleFactory(memoryManager);
+    auto refMemoryManager = WorkloadFactoryHelper<armnn::RefWorkloadFactory>::GetMemoryManager();
+    auto refTensorHandleFactory = RefWorkloadFactoryHelper::GetTensorHandleFactory(refMemoryManager);
 
     auto testResult = (*testFunction)(
         workloadFactory, memoryManager, refWorkloadFactory, tensorHandleFactory, refTensorHandleFactory, args...);
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 385affa..fc48ffc 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -583,7 +583,6 @@
 void MemSyncQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
 {
     ValidateNumInputs(workloadInfo, "MemSyncQueueDescriptor", 1);
-    ValidateNumOutputs(workloadInfo, "MemSyncQueueDescriptor" , 1);
 
     if (m_Inputs.size() != 1)
     {
diff --git a/src/backends/backendsCommon/test/EndToEndTestImpl.hpp b/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
index c69a4a5..77901df 100644
--- a/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
+++ b/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
@@ -951,11 +951,12 @@
     uintptr_t alignment = GetDataTypeSize(DataType::Float32);
     CHECK (reinterpret_cast<uintptr_t>(misalignedMemPtr) % alignment);
 
-    auto inputBuffer = reinterpret_cast<float*>(misalignedMemPtr);
-    for (int i = 0; i < 4; i++)
+    std::vector<float> inputData
     {
-        inputBuffer[i] = 1.0f + static_cast<float>(i);
-    }
+         1.0f, 2.0f, 3.0f, 4.0f
+    };
+
+    std::memcpy(misalignedMemPtr, inputData.data(), 4*sizeof(float));
 
     std::vector<float> outputData(4);
     // Check our output buffer is aligned
@@ -1129,9 +1130,11 @@
         // Check the output is correct
     }
     unsigned int index = 0;
+    std::vector<float> outputData(expectedOutput.size(), 0);
+    std::memcpy(outputData.data(), misalignedMemPtr, expectedOutput.size() * sizeof(float));
     for (auto outputValue : expectedOutput)
     {
-        CHECK(outputValue == reinterpret_cast<float*>(misalignedMemPtr)[index]);
+        CHECK(outputValue == outputData[index]);
         ++index;
     }
     std::free(memPtr);
@@ -1183,11 +1186,11 @@
     // Check if our pointer is truly misaligned
     uintptr_t alignment = GetDataTypeSize(DataType::Float32);
     CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
-    auto inputBuffer = reinterpret_cast<float*>(misalignedInputPtr);
-    for (int i = 0; i < 4; i++)
+    std::vector<float> inputData
     {
-        inputBuffer[i] = 1.0f + static_cast<float>(i);
-    }
+         1.0f, 2.0f, 3.0f, 4.0f
+    };
+    std::memcpy(misalignedInputPtr, inputData.data(), 4*sizeof(float));
 
     auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
     float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
@@ -1238,9 +1241,11 @@
     }
     // Check the output is correct
     unsigned int index = 0;
-    for (auto outputValue : expectedOutput)
+    std::vector<float> outputData(expectedOutput.size(), 0);
+    std::memcpy(outputData.data(), misalignedOutputPtr, expectedOutput.size() * sizeof(float));
+    for (auto expectedValue : expectedOutput)
     {
-        CHECK(outputValue == reinterpret_cast<float*>(misalignedOutputPtr)[index]);
+        CHECK(expectedValue == outputData[index]);
         ++index;
     }
     std::free(inputMemPtr);
@@ -1356,11 +1361,13 @@
 
     // Check if our pointer is truly misaligned
     CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
-    auto inputBuffer = reinterpret_cast<float*>(misalignedInputPtr);
-    for (int i = 0; i < 4; i++)
+
+    std::vector<float> inputValues
     {
-        inputBuffer[i] = 2.0f + static_cast<float>(i);
-    }
+         2.0f, 3.0f, 4.0f, 5.0f
+    };
+
+    std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size()*sizeof(float));
 
     auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
     float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
@@ -1411,9 +1418,11 @@
     }
     // Check the output is correct
     unsigned int index = 0;
+    std::vector<float> alignedOutputData(expectedMisalignedOutput.size(), 0);
+    std::memcpy(alignedOutputData.data(), misalignedOutputPtr, expectedMisalignedOutput.size() * sizeof(float));
     for (auto outputValue : expectedMisalignedOutput)
     {
-        CHECK(outputValue == reinterpret_cast<float*>(misalignedOutputPtr)[index]);
+        CHECK(outputValue == alignedOutputData[index]);
         ++index;
     }
     // Clean up to avoid interfering with other tests
@@ -1471,11 +1480,11 @@
     // Check if our pointer is truly misaligned
     uintptr_t alignment = GetDataTypeSize(DataType::Float32);
     CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
-    auto inputBuffer = reinterpret_cast<float*>(misalignedInputPtr);
-    for (int i = 0; i < 4; i++)
+    std::vector<float> inputValues
     {
-        inputBuffer[i] = 2.0f + static_cast<float>(i);
-    }
+         2.0f, 3.0f, 4.0f, 5.0f
+    };
+    std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size() * sizeof(float));
 
     auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
     float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
@@ -1530,9 +1539,11 @@
     }
     // Check the output is correct
     unsigned int index = 0;
+    std::vector<float> alignedOutput(expectedMisalignedOutput.size());
+    std::memcpy(alignedOutput.data(), misalignedOutputPtr, expectedMisalignedOutput.size()*sizeof(float));
     for (auto outputValue : expectedMisalignedOutput)
     {
-        CHECK(outputValue == reinterpret_cast<float*>(misalignedOutputPtr)[index]);
+        CHECK(outputValue == alignedOutput[index]);
         ++index;
     }
     std::free(inputMemPtr);
diff --git a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
index 992abc2..389605f 100644
--- a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
@@ -124,4 +124,42 @@
     FreeTensorIfUnused(m_Beta);
 }
 
+void ClBatchNormalizationFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClBatchNormalizationFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClBatchNormalizationFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp
index dc76703..d476636 100644
--- a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp
@@ -32,6 +32,12 @@
     using FloatWorkload<BatchNormalizationQueueDescriptor>::FloatWorkload;
     void Execute() const override;
 
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
+
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
+
 private:
     mutable arm_compute::CLBatchNormalizationLayer m_Layer;
 
@@ -41,6 +47,7 @@
     std::unique_ptr<arm_compute::CLTensor> m_Beta;
 
     void FreeUnusedTensors();
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
index 867770a..8ccf157 100644
--- a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
+++ b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
@@ -25,9 +25,13 @@
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
 
+    // Create Proxy tensor and set the initial tensor handle to it
+    m_InputProxy = std::make_unique<ICLTensorProxy>(&input);
+    m_OutputProxy = std::make_unique<ICLTensorProxy>(&output);
+
     {
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvertFp16ToFp32Workload_configure");
-        m_Layer.configure(clCompileContext, &input, &output, g_AclConvertPolicy, 0);
+        m_Layer.configure(clCompileContext, m_InputProxy.get(), m_OutputProxy.get(), g_AclConvertPolicy, 0);
     }
 }
 
@@ -57,5 +61,45 @@
     return aclStatus;
 }
 
+void ClConvertFp16ToFp32Workload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClConvertFp16ToFp32Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClConvertFp16ToFp32Workload::Reconfigure()
+{
+    arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+    m_InputProxy->set(&input);
+    m_OutputProxy->set(&output);
+}
 
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp
index b392c0b..3c6fcd6 100644
--- a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp
+++ b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp
@@ -9,6 +9,8 @@
 
 #include <arm_compute/runtime/CL/functions/CLDepthConvertLayer.h>
 
+#include <cl/ICLTensorProxy.hpp>
+
 namespace armnn
 {
 
@@ -21,8 +23,19 @@
                                 const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
+    bool SupportsTensorHandleReplacement() const override { return true; }
+
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
+
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     mutable arm_compute::CLDepthConvertLayer m_Layer;
+    virtual void Reconfigure();
+
+    std::unique_ptr<ICLTensorProxy> m_InputProxy;
+    std::unique_ptr<ICLTensorProxy> m_OutputProxy;
 };
 
 arm_compute::Status ClConvertFp16ToFp32WorkloadValidate(const TensorInfo& input, const TensorInfo& output);
diff --git a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
index 017fcaf..a44a80c 100644
--- a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
+++ b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
@@ -25,9 +25,13 @@
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
 
+    // Create Proxy tensor and set the initial tensor handle to it
+    m_InputProxy = std::make_unique<ICLTensorProxy>(&input);
+    m_OutputProxy = std::make_unique<ICLTensorProxy>(&output);
+
     {
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvertFp32ToFp16Workload_configure");
-        m_Layer.configure(clCompileContext, &input, &output, g_AclConvertPolicy, 0);
+        m_Layer.configure(clCompileContext, m_InputProxy.get(), m_OutputProxy.get(), g_AclConvertPolicy, 0);
     }
 }
 
@@ -57,5 +61,45 @@
     return aclStatus;
 }
 
+void ClConvertFp32ToFp16Workload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClConvertFp32ToFp16Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClConvertFp32ToFp16Workload::Reconfigure()
+{
+    arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+    m_InputProxy->set(&input);
+    m_OutputProxy->set(&output);
+}
 
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp
index 1d777b5..6ce563e 100644
--- a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp
+++ b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp
@@ -9,6 +9,8 @@
 
 #include <arm_compute/runtime/CL/functions/CLDepthConvertLayer.h>
 
+#include <cl/ICLTensorProxy.hpp>
+
 namespace armnn
 {
 
@@ -21,8 +23,19 @@
                                 const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
+    bool SupportsTensorHandleReplacement() const override { return true; }
+
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
+
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     mutable arm_compute::CLDepthConvertLayer m_Layer;
+    virtual void Reconfigure();
+
+    std::unique_ptr<ICLTensorProxy> m_InputProxy;
+    std::unique_ptr<ICLTensorProxy> m_OutputProxy;
 };
 
 arm_compute::Status ClConvertFp32ToFp16WorkloadValidate(const TensorInfo& input, const TensorInfo& output);
diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
index cdfa885..bf82fbf 100644
--- a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
+++ b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
@@ -180,9 +180,9 @@
 
 void ClConvolution2dWorkload::Reconfigure()
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvolution2dWorkload_Reconfigure");
     arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
     m_InputProxy->set(&input);
     m_OutputProxy->set(&output);
 }
diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
index 891d509..e4177e4 100644
--- a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
+++ b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
@@ -40,6 +40,8 @@
 
     arm_compute::ConvolutionMethod GetConvolutionMethod() const;
 
+    bool SupportsTensorHandleReplacement() const override { return true; }
+
 protected:
     void Reconfigure() override;
 
diff --git a/src/backends/cl/workloads/ClFloorFloatWorkload.cpp b/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
index 679e225..0aae1a3 100644
--- a/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
@@ -29,7 +29,6 @@
 
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
-
     {
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClFloorFloatWorkload_configure");
         m_Layer.configure(clCompileContext, &input, &output);
@@ -42,4 +41,42 @@
     RunClFunction(m_Layer, CHECK_LOCATION());
 }
 
+void ClFloorFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClFloorFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClFloorFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClFloorFloatWorkload.hpp b/src/backends/cl/workloads/ClFloorFloatWorkload.hpp
index 5740c68..dbe5f6f 100644
--- a/src/backends/cl/workloads/ClFloorFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClFloorFloatWorkload.hpp
@@ -23,9 +23,14 @@
                          const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     mutable arm_compute::CLFloor m_Layer;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
index b34153f..d120fb2 100644
--- a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
@@ -60,4 +60,42 @@
     RunClFunction(m_Layer, CHECK_LOCATION());
 }
 
+void ClL2NormalizationFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClL2NormalizationFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClL2NormalizationFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp
index cfa1a97..67e7b8b 100644
--- a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp
@@ -24,10 +24,16 @@
                                    const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
+
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
 private:
     // Purposely not a CLL2Normalize function. See constructor.
     mutable arm_compute::CLL2NormalizeLayer m_Layer;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
index d8d95f5..37dfab6 100644
--- a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
@@ -446,4 +446,42 @@
     FreeTensorIfUnused(m_OutputLayerNormWeightsTensor);
 }
 
+void ClLstmFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClLstmFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClLstmFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.hpp b/src/backends/cl/workloads/ClLstmFloatWorkload.hpp
index b9faca8..54c5c60 100644
--- a/src/backends/cl/workloads/ClLstmFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.hpp
@@ -22,9 +22,14 @@
                         const WorkloadInfo& info,
                         const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     mutable arm_compute::CLLSTMLayer m_LstmLayer;
+    virtual void Reconfigure();
 
     std::unique_ptr<arm_compute::CLTensor> m_InputToInputWeightsTensor;
     std::unique_ptr<arm_compute::CLTensor> m_InputToForgetWeightsTensor;
diff --git a/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
index d98532d..8de8dd5 100644
--- a/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
@@ -62,4 +62,42 @@
     RunClFunction(m_NormalizationLayer, CHECK_LOCATION());
 }
 
+void ClNormalizationFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void ClNormalizationFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void ClNormalizationFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp
index 40b2693..d9db0f2 100644
--- a/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp
@@ -23,9 +23,14 @@
                                  const WorkloadInfo& info,
                                  const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     mutable arm_compute::CLNormalizationLayer    m_NormalizationLayer;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/NeonTimer.cpp b/src/backends/neon/NeonTimer.cpp
index 5cce051..a7d3032 100644
--- a/src/backends/neon/NeonTimer.cpp
+++ b/src/backends/neon/NeonTimer.cpp
@@ -7,6 +7,7 @@
 #include "NeonInterceptorScheduler.hpp"
 
 #include <armnn/utility/Assert.hpp>
+#include <armnn/utility/PolymorphicDowncast.hpp>
 
 #include <memory>
 
@@ -29,7 +30,7 @@
     {
         // Keep the real schedule and add NeonInterceptorScheduler as an interceptor
         m_RealScheduler  = &arm_compute::Scheduler::get();
-        arm_compute::Scheduler::set(std::static_pointer_cast<arm_compute::IScheduler>(g_Interceptor));
+        arm_compute::Scheduler::set(armnn::PolymorphicPointerDowncast<arm_compute::IScheduler>(g_Interceptor));
     }
 }
 
diff --git a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
index dcef025..7a2ff9a 100644
--- a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
@@ -40,4 +40,42 @@
     }
 }
 
+void NeonConvertBf16ToFp32Workload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonConvertBf16ToFp32Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonConvertBf16ToFp32Workload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp
index 9770fbd..9d44ad2 100644
--- a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp
+++ b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp
@@ -17,10 +17,15 @@
 public:
     NeonConvertBf16ToFp32Workload(const ConvertBf16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
     std::vector<TensorHandlePair> m_TensorHandlePairs;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
index 1b9e1bc..ce6c785 100644
--- a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.cpp
@@ -40,4 +40,42 @@
     }
 }
 
+void NeonConvertFp16ToFp32Workload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonConvertFp16ToFp32Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonConvertFp16ToFp32Workload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp
index 9159e51..c0165ea 100644
--- a/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp
+++ b/src/backends/neon/workloads/NeonConvertFp16ToFp32Workload.hpp
@@ -17,10 +17,15 @@
 public:
     NeonConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
     std::vector<TensorHandlePair> m_TensorHandlePairs;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
index ac6a69d..acd1a1e 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
@@ -41,4 +41,42 @@
     }
 }
 
+void NeonConvertFp32ToBf16Workload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonConvertFp32ToBf16Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonConvertFp32ToBf16Workload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp
index 6c01187..2304f8a 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp
@@ -17,10 +17,15 @@
 public:
     NeonConvertFp32ToBf16Workload(const ConvertFp32ToBf16QueueDescriptor& descriptor, const WorkloadInfo& info);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
     std::vector<TensorHandlePair> m_TensorHandlePairs;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
index d65cba0..089716a 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.cpp
@@ -41,4 +41,42 @@
     }
 }
 
+void NeonConvertFp32ToFp16Workload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonConvertFp32ToFp16Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonConvertFp32ToFp16Workload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp
index 8e9f11b..666f487 100644
--- a/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp
+++ b/src/backends/neon/workloads/NeonConvertFp32ToFp16Workload.hpp
@@ -17,10 +17,15 @@
 public:
     NeonConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
     std::vector<TensorHandlePair> m_TensorHandlePairs;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp b/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
index b97e3ce..1d53245 100644
--- a/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonFloorFloatWorkload.cpp
@@ -32,6 +32,45 @@
     ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonFloorFloatWorkload_Execute", this->GetGuid());
     m_Layer->run();
 }
+
+void NeonFloorFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonFloorFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonFloorFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
 
 
diff --git a/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp b/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp
index 7113931..8ba6b4a 100644
--- a/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonFloorFloatWorkload.hpp
@@ -20,9 +20,14 @@
 public:
     NeonFloorFloatWorkload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     std::unique_ptr<arm_compute::IFunction> m_Layer;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
index 887f25a..c0c6ed4 100644
--- a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.cpp
@@ -60,4 +60,42 @@
     m_Layer->run();
 }
 
+void NeonL2NormalizationFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonL2NormalizationFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonL2NormalizationFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp
index 82f0639..9c591fc 100644
--- a/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonL2NormalizationFloatWorkload.hpp
@@ -26,9 +26,14 @@
     NeonL2NormalizationFloatWorkload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info,
                                      std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     std::unique_ptr<arm_compute::IFunction> m_Layer;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
index b8224e6..2f14ab9 100644
--- a/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonLstmFloatWorkload.cpp
@@ -464,4 +464,42 @@
     FreeTensorIfUnused(m_OutputLayerNormWeightsTensor);
 }
 
+void NeonLstmFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonLstmFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonLstmFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonLstmFloatWorkload.hpp b/src/backends/neon/workloads/NeonLstmFloatWorkload.hpp
index ebbf180..4bb3ff8 100644
--- a/src/backends/neon/workloads/NeonLstmFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonLstmFloatWorkload.hpp
@@ -21,7 +21,11 @@
 public:
     NeonLstmFloatWorkload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     mutable arm_compute::NELSTMLayer m_LstmLayer;
 
@@ -51,6 +55,7 @@
     std::unique_ptr<arm_compute::Tensor> m_OutputLayerNormWeightsTensor;
 
     void FreeUnusedTensors();
+    virtual void Reconfigure();
 };
 
 arm_compute::Status NeonLstmFloatWorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn,
diff --git a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
index f811a04..01ac5c1 100644
--- a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.cpp
@@ -110,4 +110,42 @@
     m_NormalizationLayer->run();
 }
 
+void NeonNormalizationFloatWorkload::ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
+    this->m_Data.m_Inputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Inputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+// Replace output tensor handle with the given TensorHandle
+void NeonNormalizationFloatWorkload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
+{
+    ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+    this->m_Data.m_Outputs[slot] = tensorHandle;
+    try
+    {
+        Reconfigure();
+    }
+    catch(armnn::UnimplementedException& e)
+    {
+        // Cannot reconfigure, revert the slot back and throw the exception.
+        this->m_Data.m_Outputs[slot] = backupHandle;
+        throw e;
+    }
+}
+
+void NeonNormalizationFloatWorkload::Reconfigure()
+{
+    throw armnn::UnimplementedException("Reconfigure not implemented for this workload");
+}
+
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp
index ed54536..9605ed1 100644
--- a/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp
+++ b/src/backends/neon/workloads/NeonNormalizationFloatWorkload.hpp
@@ -26,9 +26,14 @@
     NeonNormalizationFloatWorkload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info,
                                    std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
     virtual void Execute() const override;
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override;
 private:
     std::unique_ptr<arm_compute::IFunction> m_NormalizationLayer;
+    virtual void Reconfigure();
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/test/RefWorkloadFactoryHelper.hpp b/src/backends/reference/test/RefWorkloadFactoryHelper.hpp
index e413d04..f0a842d 100644
--- a/src/backends/reference/test/RefWorkloadFactoryHelper.hpp
+++ b/src/backends/reference/test/RefWorkloadFactoryHelper.hpp
@@ -7,6 +7,8 @@
 
 #include <backendsCommon/test/WorkloadFactoryHelper.hpp>
 
+#include <armnn/utility/PolymorphicDowncast.hpp>
+
 #include <reference/RefBackend.hpp>
 #include <reference/RefWorkloadFactory.hpp>
 #include "reference/RefTensorHandleFactory.hpp"
@@ -34,7 +36,7 @@
             const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr)
     {
 
-        return armnn::RefTensorHandleFactory(std::static_pointer_cast<armnn::RefMemoryManager>(memoryManager));
+        return armnn::RefTensorHandleFactory(armnn::PolymorphicPointerDowncast<armnn::RefMemoryManager>(memoryManager));
     }
 };
 
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index 60d8255..46c2706 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -68,6 +68,7 @@
     RefActivationWorkload.hpp
     RefArgMinMaxWorkload.cpp
     RefArgMinMaxWorkload.hpp
+    RefBaseWorkload.hpp
     RefBatchNormalizationWorkload.cpp
     RefBatchNormalizationWorkload.hpp
     RefBatchToSpaceNdWorkload.cpp
diff --git a/src/backends/reference/workloads/RefActivationWorkload.hpp b/src/backends/reference/workloads/RefActivationWorkload.hpp
index 9814ac1..8dc2d52 100644
--- a/src/backends/reference/workloads/RefActivationWorkload.hpp
+++ b/src/backends/reference/workloads/RefActivationWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefActivationWorkload : public BaseWorkload<ActivationQueueDescriptor>
+class RefActivationWorkload : public RefBaseWorkload<ActivationQueueDescriptor>
 {
 public:
-    using BaseWorkload<ActivationQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<ActivationQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
diff --git a/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp b/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp
index 2d635bf..d724273 100644
--- a/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp
+++ b/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp
@@ -16,7 +16,7 @@
 RefArgMinMaxWorkload::RefArgMinMaxWorkload(
         const ArgMinMaxQueueDescriptor& descriptor,
         const WorkloadInfo& info)
-        : BaseWorkload<ArgMinMaxQueueDescriptor>(descriptor, info) {}
+        : RefBaseWorkload<ArgMinMaxQueueDescriptor>(descriptor, info) {}
 
 
 void RefArgMinMaxWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefArgMinMaxWorkload.hpp b/src/backends/reference/workloads/RefArgMinMaxWorkload.hpp
index f3c2644..97c4b45 100644
--- a/src/backends/reference/workloads/RefArgMinMaxWorkload.hpp
+++ b/src/backends/reference/workloads/RefArgMinMaxWorkload.hpp
@@ -5,12 +5,12 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
-class RefArgMinMaxWorkload : public BaseWorkload<ArgMinMaxQueueDescriptor>
+class RefArgMinMaxWorkload : public RefBaseWorkload<ArgMinMaxQueueDescriptor>
 {
 public:
     explicit RefArgMinMaxWorkload(const ArgMinMaxQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefBaseWorkload.hpp b/src/backends/reference/workloads/RefBaseWorkload.hpp
new file mode 100644
index 0000000..824b4cc
--- /dev/null
+++ b/src/backends/reference/workloads/RefBaseWorkload.hpp
@@ -0,0 +1,36 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/backends/Workload.hpp>
+
+namespace armnn
+{
+    template <typename QueueDescriptor>
+    class RefBaseWorkload : public BaseWorkload<QueueDescriptor>
+    {
+    public:
+        RefBaseWorkload(const QueueDescriptor& descriptor, const WorkloadInfo& info)
+                : BaseWorkload<QueueDescriptor>(descriptor, info)
+        {}
+
+        virtual bool SupportsTensorHandleReplacement()  const override
+        {
+            return true;
+        }
+        // Replace input tensor handle with the given TensorHandle
+        void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
+        {
+            this->m_Data.m_Inputs[slot] = tensorHandle;
+        }
+
+        // Replace output tensor handle with the given TensorHandle
+        void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
+        {
+            this->m_Data.m_Outputs[slot] = tensorHandle;
+        }
+    };
+} //namespace armnn
\ No newline at end of file
diff --git a/src/backends/reference/workloads/RefBatchNormalizationWorkload.cpp b/src/backends/reference/workloads/RefBatchNormalizationWorkload.cpp
index 282374d..a6bd986 100644
--- a/src/backends/reference/workloads/RefBatchNormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefBatchNormalizationWorkload.cpp
@@ -15,7 +15,7 @@
 
 RefBatchNormalizationWorkload::RefBatchNormalizationWorkload(const BatchNormalizationQueueDescriptor& descriptor,
                                                              const WorkloadInfo& info)
-    : BaseWorkload(descriptor, info)
+    : RefBaseWorkload(descriptor, info)
     , m_Mean    (std::make_unique<ScopedTensorHandle>(*(descriptor.m_Mean)))
     , m_Variance(std::make_unique<ScopedTensorHandle>(*(descriptor.m_Variance)))
     , m_Beta    (std::make_unique<ScopedTensorHandle>(*(descriptor.m_Beta)))
diff --git a/src/backends/reference/workloads/RefBatchNormalizationWorkload.hpp b/src/backends/reference/workloads/RefBatchNormalizationWorkload.hpp
index 305c0ce..60dd2a9 100644
--- a/src/backends/reference/workloads/RefBatchNormalizationWorkload.hpp
+++ b/src/backends/reference/workloads/RefBatchNormalizationWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefBatchNormalizationWorkload : public BaseWorkload<BatchNormalizationQueueDescriptor>
+class RefBatchNormalizationWorkload : public RefBaseWorkload<BatchNormalizationQueueDescriptor>
 {
 public:
     explicit RefBatchNormalizationWorkload(const BatchNormalizationQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp
index 7d18c12..d7ee6fc 100644
--- a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp
+++ b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn {
 
-class RefBatchToSpaceNdWorkload : public BaseWorkload<BatchToSpaceNdQueueDescriptor>
+class RefBatchToSpaceNdWorkload : public RefBaseWorkload<BatchToSpaceNdQueueDescriptor>
 {
 
 public:
-    using BaseWorkload<BatchToSpaceNdQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<BatchToSpaceNdQueueDescriptor>::RefBaseWorkload;
 
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
diff --git a/src/backends/reference/workloads/RefCastWorkload.hpp b/src/backends/reference/workloads/RefCastWorkload.hpp
index ccafaaf..6f7e56a 100644
--- a/src/backends/reference/workloads/RefCastWorkload.hpp
+++ b/src/backends/reference/workloads/RefCastWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "RefWorkloadUtils.hpp"
 
@@ -13,10 +13,10 @@
 {
 
 
-class RefCastWorkload : public BaseWorkload<CastQueueDescriptor>
+class RefCastWorkload : public RefBaseWorkload<CastQueueDescriptor>
 {
 public:
-    using BaseWorkload<CastQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<CastQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefChannelShuffleWorkload.hpp b/src/backends/reference/workloads/RefChannelShuffleWorkload.hpp
index 0c80378..b459b87 100644
--- a/src/backends/reference/workloads/RefChannelShuffleWorkload.hpp
+++ b/src/backends/reference/workloads/RefChannelShuffleWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefChannelShuffleWorkload : public BaseWorkload<ChannelShuffleQueueDescriptor>
+class RefChannelShuffleWorkload : public RefBaseWorkload<ChannelShuffleQueueDescriptor>
 {
 public:
-    using BaseWorkload<ChannelShuffleQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<ChannelShuffleQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
diff --git a/src/backends/reference/workloads/RefComparisonWorkload.cpp b/src/backends/reference/workloads/RefComparisonWorkload.cpp
index 03df7a4..433e3e8 100644
--- a/src/backends/reference/workloads/RefComparisonWorkload.cpp
+++ b/src/backends/reference/workloads/RefComparisonWorkload.cpp
@@ -21,7 +21,7 @@
 
 RefComparisonWorkload::RefComparisonWorkload(const ComparisonQueueDescriptor& desc,
                                              const WorkloadInfo& info)
-    : BaseWorkload<ComparisonQueueDescriptor>(desc, info)
+    : RefBaseWorkload<ComparisonQueueDescriptor>(desc, info)
 {}
 
 void RefComparisonWorkload::PostAllocationConfigure()
diff --git a/src/backends/reference/workloads/RefComparisonWorkload.hpp b/src/backends/reference/workloads/RefComparisonWorkload.hpp
index f2780c7..93cfd1f 100644
--- a/src/backends/reference/workloads/RefComparisonWorkload.hpp
+++ b/src/backends/reference/workloads/RefComparisonWorkload.hpp
@@ -7,16 +7,16 @@
 
 #include "BaseIterator.hpp"
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefComparisonWorkload : public BaseWorkload<ComparisonQueueDescriptor>
+class RefComparisonWorkload : public RefBaseWorkload<ComparisonQueueDescriptor>
 {
 public:
-    using BaseWorkload<ComparisonQueueDescriptor>::m_Data;
+    using RefBaseWorkload<ComparisonQueueDescriptor>::m_Data;
 
     RefComparisonWorkload(const ComparisonQueueDescriptor& descriptor, const WorkloadInfo& info);
     void PostAllocationConfigure() override;
diff --git a/src/backends/reference/workloads/RefConcatWorkload.hpp b/src/backends/reference/workloads/RefConcatWorkload.hpp
index cb1ecf0..11d6d01 100644
--- a/src/backends/reference/workloads/RefConcatWorkload.hpp
+++ b/src/backends/reference/workloads/RefConcatWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefConcatWorkload : public BaseWorkload<ConcatQueueDescriptor>
+class RefConcatWorkload : public RefBaseWorkload<ConcatQueueDescriptor>
 {
 public:
-    using BaseWorkload<ConcatQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<ConcatQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefConstantWorkload.cpp b/src/backends/reference/workloads/RefConstantWorkload.cpp
index 6290237..571dbb2 100644
--- a/src/backends/reference/workloads/RefConstantWorkload.cpp
+++ b/src/backends/reference/workloads/RefConstantWorkload.cpp
@@ -18,7 +18,7 @@
 
 RefConstantWorkload::RefConstantWorkload(
     const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info)
-    : BaseWorkload<ConstantQueueDescriptor>(descriptor, info) {}
+    : RefBaseWorkload<ConstantQueueDescriptor>(descriptor, info) {}
 
 void RefConstantWorkload::Execute() const
 {
diff --git a/src/backends/reference/workloads/RefConstantWorkload.hpp b/src/backends/reference/workloads/RefConstantWorkload.hpp
index c158983..181d79d 100644
--- a/src/backends/reference/workloads/RefConstantWorkload.hpp
+++ b/src/backends/reference/workloads/RefConstantWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include <armnn/Types.hpp>
@@ -14,7 +14,7 @@
 {
 
 // Base class template providing an implementation of the Constant layer common to all data types.
-class RefConstantWorkload : public BaseWorkload<ConstantQueueDescriptor>
+class RefConstantWorkload : public RefBaseWorkload<ConstantQueueDescriptor>
 {
 public:
     RefConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp b/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp
index b3af111..8b5c6d5 100644
--- a/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp
+++ b/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
diff --git a/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.hpp b/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.hpp
index acb1995..feb442e 100644
--- a/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.hpp
+++ b/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
diff --git a/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.hpp b/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.hpp
index 97a138f..cd3cfa4 100644
--- a/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.hpp
+++ b/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
diff --git a/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.hpp b/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.hpp
index 8cc822e..fe137ed 100644
--- a/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.hpp
+++ b/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
diff --git a/src/backends/reference/workloads/RefConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefConvolution2dWorkload.cpp
index 20c5c08..d57040e 100644
--- a/src/backends/reference/workloads/RefConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefConvolution2dWorkload.cpp
@@ -14,7 +14,7 @@
 {
 RefConvolution2dWorkload::RefConvolution2dWorkload(
     const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info)
-    : BaseWorkload<Convolution2dQueueDescriptor>(descriptor, info)
+    : RefBaseWorkload<Convolution2dQueueDescriptor>(descriptor, info)
 {
     WorkloadInfo detailsInfo;
     detailsInfo.m_InputTensorInfos = info.m_InputTensorInfos;
diff --git a/src/backends/reference/workloads/RefConvolution2dWorkload.hpp b/src/backends/reference/workloads/RefConvolution2dWorkload.hpp
index 880547d..3335782 100644
--- a/src/backends/reference/workloads/RefConvolution2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefConvolution2dWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "Decoders.hpp"
 #include "Encoders.hpp"
@@ -13,7 +13,7 @@
 namespace armnn
 {
 
-class RefConvolution2dWorkload : public BaseWorkload<Convolution2dQueueDescriptor>
+class RefConvolution2dWorkload : public RefBaseWorkload<Convolution2dQueueDescriptor>
 {
 public:
     explicit RefConvolution2dWorkload(const Convolution2dQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefConvolution3dWorkload.cpp b/src/backends/reference/workloads/RefConvolution3dWorkload.cpp
index afab88f..5f54280 100644
--- a/src/backends/reference/workloads/RefConvolution3dWorkload.cpp
+++ b/src/backends/reference/workloads/RefConvolution3dWorkload.cpp
@@ -14,7 +14,7 @@
 {
 RefConvolution3dWorkload::RefConvolution3dWorkload(
     const Convolution3dQueueDescriptor& descriptor, const WorkloadInfo& info)
-    : BaseWorkload<Convolution3dQueueDescriptor>(descriptor, info)
+    : RefBaseWorkload<Convolution3dQueueDescriptor>(descriptor, info)
 {
     WorkloadInfo detailsInfo;
     detailsInfo.m_InputTensorInfos = info.m_InputTensorInfos;
diff --git a/src/backends/reference/workloads/RefConvolution3dWorkload.hpp b/src/backends/reference/workloads/RefConvolution3dWorkload.hpp
index 53ce309..6c74675 100644
--- a/src/backends/reference/workloads/RefConvolution3dWorkload.hpp
+++ b/src/backends/reference/workloads/RefConvolution3dWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "Decoders.hpp"
 #include "Encoders.hpp"
@@ -13,7 +13,7 @@
 namespace armnn
 {
 
-class RefConvolution3dWorkload : public BaseWorkload<Convolution3dQueueDescriptor>
+class RefConvolution3dWorkload : public RefBaseWorkload<Convolution3dQueueDescriptor>
 {
 public:
     explicit RefConvolution3dWorkload(const Convolution3dQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefDebugWorkload.hpp b/src/backends/reference/workloads/RefDebugWorkload.hpp
index 66af9a0..a157959 100644
--- a/src/backends/reference/workloads/RefDebugWorkload.hpp
+++ b/src/backends/reference/workloads/RefDebugWorkload.hpp
@@ -7,7 +7,7 @@
 
 #include <armnn/TypesUtils.hpp>
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 namespace armnn
 {
diff --git a/src/backends/reference/workloads/RefDepthToSpaceWorkload.hpp b/src/backends/reference/workloads/RefDepthToSpaceWorkload.hpp
index 854a564..bd179d3 100644
--- a/src/backends/reference/workloads/RefDepthToSpaceWorkload.hpp
+++ b/src/backends/reference/workloads/RefDepthToSpaceWorkload.hpp
@@ -5,15 +5,15 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 namespace armnn
 {
 
-class RefDepthToSpaceWorkload : public BaseWorkload<DepthToSpaceQueueDescriptor>
+class RefDepthToSpaceWorkload : public RefBaseWorkload<DepthToSpaceQueueDescriptor>
 {
 public:
-    using BaseWorkload<DepthToSpaceQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<DepthToSpaceQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
index b447d1a..ad5edde 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
@@ -17,7 +17,7 @@
 
 RefDepthwiseConvolution2dWorkload::RefDepthwiseConvolution2dWorkload(
         const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info)
-        : BaseWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
+        : RefBaseWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
 {
     m_Weight = std::make_unique<ScopedTensorHandle>(*(descriptor.m_Weight));
     const TensorInfo& rFilterInfo = m_Weight->GetTensorInfo();
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp
index ae93d03..5d4b483 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp
@@ -2,7 +2,7 @@
 // Copyright © 2017 Arm Ltd. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "Decoders.hpp"
 #include "Encoders.hpp"
@@ -12,7 +12,7 @@
 namespace armnn
 {
 
-class RefDepthwiseConvolution2dWorkload : public BaseWorkload<DepthwiseConvolution2dQueueDescriptor> {
+class RefDepthwiseConvolution2dWorkload : public RefBaseWorkload<DepthwiseConvolution2dQueueDescriptor> {
 public:
     explicit RefDepthwiseConvolution2dWorkload(const DepthwiseConvolution2dQueueDescriptor &descriptor,
                                                const WorkloadInfo &info);
diff --git a/src/backends/reference/workloads/RefDequantizeWorkload.hpp b/src/backends/reference/workloads/RefDequantizeWorkload.hpp
index 285c649..8fa8951 100644
--- a/src/backends/reference/workloads/RefDequantizeWorkload.hpp
+++ b/src/backends/reference/workloads/RefDequantizeWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 namespace armnn
 {
 
-class RefDequantizeWorkload : public BaseWorkload<DequantizeQueueDescriptor>
+class RefDequantizeWorkload : public RefBaseWorkload<DequantizeQueueDescriptor>
 {
 public:
-    using BaseWorkload<DequantizeQueueDescriptor>::m_Data;
-    using BaseWorkload<DequantizeQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<DequantizeQueueDescriptor>::m_Data;
+    using RefBaseWorkload<DequantizeQueueDescriptor>::RefBaseWorkload;
 
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
index 4bc9eb1..5f01db3 100644
--- a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
+++ b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
@@ -15,7 +15,7 @@
 
 RefDetectionPostProcessWorkload::RefDetectionPostProcessWorkload(
         const DetectionPostProcessQueueDescriptor& descriptor, const WorkloadInfo& info)
-        : BaseWorkload<DetectionPostProcessQueueDescriptor>(descriptor, info),
+        : RefBaseWorkload<DetectionPostProcessQueueDescriptor>(descriptor, info),
           m_Anchors(std::make_unique<ScopedTensorHandle>(*(descriptor.m_Anchors))) {}
 
 void RefDetectionPostProcessWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp
index 4c3ad42..53b2971 100644
--- a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp
+++ b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefDetectionPostProcessWorkload : public BaseWorkload<DetectionPostProcessQueueDescriptor>
+class RefDetectionPostProcessWorkload : public RefBaseWorkload<DetectionPostProcessQueueDescriptor>
 {
 public:
     explicit RefDetectionPostProcessWorkload(const DetectionPostProcessQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefElementwiseUnaryWorkload.cpp b/src/backends/reference/workloads/RefElementwiseUnaryWorkload.cpp
index be15363..3ea51b9 100644
--- a/src/backends/reference/workloads/RefElementwiseUnaryWorkload.cpp
+++ b/src/backends/reference/workloads/RefElementwiseUnaryWorkload.cpp
@@ -27,7 +27,7 @@
 
 RefElementwiseUnaryWorkload::RefElementwiseUnaryWorkload(const ElementwiseUnaryQueueDescriptor& desc,
                                                          const WorkloadInfo& info)
-    : BaseWorkload<ElementwiseUnaryQueueDescriptor>(desc, info)
+    : RefBaseWorkload<ElementwiseUnaryQueueDescriptor>(desc, info)
 {}
 
 void RefElementwiseUnaryWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefElementwiseUnaryWorkload.hpp b/src/backends/reference/workloads/RefElementwiseUnaryWorkload.hpp
index e055fd0..91229b3 100644
--- a/src/backends/reference/workloads/RefElementwiseUnaryWorkload.hpp
+++ b/src/backends/reference/workloads/RefElementwiseUnaryWorkload.hpp
@@ -7,16 +7,16 @@
 
 #include "BaseIterator.hpp"
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefElementwiseUnaryWorkload : public BaseWorkload<ElementwiseUnaryQueueDescriptor>
+class RefElementwiseUnaryWorkload : public RefBaseWorkload<ElementwiseUnaryQueueDescriptor>
 {
 public:
-    using BaseWorkload<ElementwiseUnaryQueueDescriptor>::m_Data;
+    using RefBaseWorkload<ElementwiseUnaryQueueDescriptor>::m_Data;
 
     RefElementwiseUnaryWorkload(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
diff --git a/src/backends/reference/workloads/RefElementwiseWorkload.cpp b/src/backends/reference/workloads/RefElementwiseWorkload.cpp
index dd7d325..d14ce07 100644
--- a/src/backends/reference/workloads/RefElementwiseWorkload.cpp
+++ b/src/backends/reference/workloads/RefElementwiseWorkload.cpp
@@ -21,7 +21,7 @@
 RefElementwiseWorkload<Functor, ParentDescriptor, DebugString>::RefElementwiseWorkload(
     const ParentDescriptor& desc,
     const WorkloadInfo& info)
-    : BaseWorkload<ParentDescriptor>(desc, info)
+    : RefBaseWorkload<ParentDescriptor>(desc, info)
 {
 }
 
diff --git a/src/backends/reference/workloads/RefElementwiseWorkload.hpp b/src/backends/reference/workloads/RefElementwiseWorkload.hpp
index 4b108e4..065a783 100644
--- a/src/backends/reference/workloads/RefElementwiseWorkload.hpp
+++ b/src/backends/reference/workloads/RefElementwiseWorkload.hpp
@@ -6,7 +6,7 @@
 #pragma once
 
 #include <armnn/Types.hpp>
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "BaseIterator.hpp"
 #include "ElementwiseFunction.hpp"
@@ -18,12 +18,12 @@
 {
 
 template <typename Functor, typename ParentDescriptor, typename armnn::StringMapping::Id DebugString>
-class RefElementwiseWorkload : public BaseWorkload<ParentDescriptor>
+class RefElementwiseWorkload : public RefBaseWorkload<ParentDescriptor>
 {
 public:
     using InType = typename ElementwiseBinaryFunction<Functor>::InType;
     using OutType = typename ElementwiseBinaryFunction<Functor>::OutType;
-    using BaseWorkload<ParentDescriptor>::m_Data;
+    using RefBaseWorkload<ParentDescriptor>::m_Data;
 
     RefElementwiseWorkload(const ParentDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
diff --git a/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.hpp b/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.hpp
index 53b3375..85dc6af 100644
--- a/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.hpp
+++ b/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
diff --git a/src/backends/reference/workloads/RefFillWorkload.hpp b/src/backends/reference/workloads/RefFillWorkload.hpp
index 56d44b8..d1e0058 100644
--- a/src/backends/reference/workloads/RefFillWorkload.hpp
+++ b/src/backends/reference/workloads/RefFillWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefFillWorkload : public BaseWorkload<FillQueueDescriptor>
+class RefFillWorkload : public RefBaseWorkload<FillQueueDescriptor>
 {
 public:
-    using BaseWorkload<FillQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<FillQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefFloorWorkload.hpp b/src/backends/reference/workloads/RefFloorWorkload.hpp
index 1a532f7..6237ff0 100644
--- a/src/backends/reference/workloads/RefFloorWorkload.hpp
+++ b/src/backends/reference/workloads/RefFloorWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefFloorWorkload : public BaseWorkload<FloorQueueDescriptor>
+class RefFloorWorkload : public RefBaseWorkload<FloorQueueDescriptor>
 {
 public:
-    using BaseWorkload<FloorQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<FloorQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
index 5a7951e..c6ea147 100644
--- a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
+++ b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
@@ -14,7 +14,7 @@
 {
 RefFullyConnectedWorkload::RefFullyConnectedWorkload(
     const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info)
-        : BaseWorkload<FullyConnectedQueueDescriptor>(descriptor, info)
+        : RefBaseWorkload<FullyConnectedQueueDescriptor>(descriptor, info)
 {
 }
 
diff --git a/src/backends/reference/workloads/RefFullyConnectedWorkload.hpp b/src/backends/reference/workloads/RefFullyConnectedWorkload.hpp
index 3ee4a4a..432a887 100644
--- a/src/backends/reference/workloads/RefFullyConnectedWorkload.hpp
+++ b/src/backends/reference/workloads/RefFullyConnectedWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "BaseIterator.hpp"
 #include "Decoders.hpp"
@@ -15,7 +15,7 @@
 namespace armnn
 {
 
-class RefFullyConnectedWorkload : public BaseWorkload<FullyConnectedQueueDescriptor>
+class RefFullyConnectedWorkload : public RefBaseWorkload<FullyConnectedQueueDescriptor>
 {
 public:
     explicit RefFullyConnectedWorkload(const FullyConnectedQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefGatherWorkload.hpp b/src/backends/reference/workloads/RefGatherWorkload.hpp
index a2698e3..ec880a5 100644
--- a/src/backends/reference/workloads/RefGatherWorkload.hpp
+++ b/src/backends/reference/workloads/RefGatherWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include <armnn/TypesUtils.hpp>
@@ -16,10 +16,10 @@
 namespace armnn
 {
 
-class RefGatherWorkload : public BaseWorkload<GatherQueueDescriptor>
+class RefGatherWorkload : public RefBaseWorkload<GatherQueueDescriptor>
 {
 public:
-    using BaseWorkload<GatherQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<GatherQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp
index e642dc9..c103a6b 100644
--- a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp
@@ -16,7 +16,7 @@
 RefInstanceNormalizationWorkload::RefInstanceNormalizationWorkload(
     const InstanceNormalizationQueueDescriptor& descriptor,
     const WorkloadInfo& info)
-    : BaseWorkload<InstanceNormalizationQueueDescriptor>(descriptor, info) {}
+    : RefBaseWorkload<InstanceNormalizationQueueDescriptor>(descriptor, info) {}
 
 void RefInstanceNormalizationWorkload::Execute() const
 {
diff --git a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.hpp b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.hpp
index 3283c44..a4b2dd3 100644
--- a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.hpp
+++ b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefInstanceNormalizationWorkload : public BaseWorkload<InstanceNormalizationQueueDescriptor>
+class RefInstanceNormalizationWorkload : public RefBaseWorkload<InstanceNormalizationQueueDescriptor>
 {
 public:
     explicit RefInstanceNormalizationWorkload(const InstanceNormalizationQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefL2NormalizationWorkload.cpp b/src/backends/reference/workloads/RefL2NormalizationWorkload.cpp
index ca31503..f6fcff3 100644
--- a/src/backends/reference/workloads/RefL2NormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefL2NormalizationWorkload.cpp
@@ -22,7 +22,7 @@
 RefL2NormalizationWorkload::RefL2NormalizationWorkload(
         const L2NormalizationQueueDescriptor& descriptor,
         const WorkloadInfo& info)
-    : BaseWorkload<L2NormalizationQueueDescriptor>(descriptor, info) {}
+    : RefBaseWorkload<L2NormalizationQueueDescriptor>(descriptor, info) {}
 
 void RefL2NormalizationWorkload::Execute() const
 {
diff --git a/src/backends/reference/workloads/RefL2NormalizationWorkload.hpp b/src/backends/reference/workloads/RefL2NormalizationWorkload.hpp
index dd129c6..c64e2ea 100644
--- a/src/backends/reference/workloads/RefL2NormalizationWorkload.hpp
+++ b/src/backends/reference/workloads/RefL2NormalizationWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefL2NormalizationWorkload : public BaseWorkload<L2NormalizationQueueDescriptor>
+class RefL2NormalizationWorkload : public RefBaseWorkload<L2NormalizationQueueDescriptor>
 {
 public:
     explicit RefL2NormalizationWorkload(const L2NormalizationQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefLogSoftmaxWorkload.hpp b/src/backends/reference/workloads/RefLogSoftmaxWorkload.hpp
index 9f87def..91ad5f6 100644
--- a/src/backends/reference/workloads/RefLogSoftmaxWorkload.hpp
+++ b/src/backends/reference/workloads/RefLogSoftmaxWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefLogSoftmaxWorkload : public BaseWorkload<LogSoftmaxQueueDescriptor>
+class RefLogSoftmaxWorkload : public RefBaseWorkload<LogSoftmaxQueueDescriptor>
 {
 public:
-    using BaseWorkload<LogSoftmaxQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<LogSoftmaxQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefLogicalBinaryWorkload.cpp b/src/backends/reference/workloads/RefLogicalBinaryWorkload.cpp
index f187e0c..f0cb846 100644
--- a/src/backends/reference/workloads/RefLogicalBinaryWorkload.cpp
+++ b/src/backends/reference/workloads/RefLogicalBinaryWorkload.cpp
@@ -19,7 +19,7 @@
 
 RefLogicalBinaryWorkload::RefLogicalBinaryWorkload(const LogicalBinaryQueueDescriptor& desc,
                                                    const WorkloadInfo& info)
-    : BaseWorkload<LogicalBinaryQueueDescriptor>(desc, info)
+    : RefBaseWorkload<LogicalBinaryQueueDescriptor>(desc, info)
 {}
 
 void RefLogicalBinaryWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefLogicalBinaryWorkload.hpp b/src/backends/reference/workloads/RefLogicalBinaryWorkload.hpp
index 053de7d..797d937 100644
--- a/src/backends/reference/workloads/RefLogicalBinaryWorkload.hpp
+++ b/src/backends/reference/workloads/RefLogicalBinaryWorkload.hpp
@@ -7,16 +7,16 @@
 
 #include "BaseIterator.hpp"
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefLogicalBinaryWorkload : public BaseWorkload<LogicalBinaryQueueDescriptor>
+class RefLogicalBinaryWorkload : public RefBaseWorkload<LogicalBinaryQueueDescriptor>
 {
 public:
-    using BaseWorkload<LogicalBinaryQueueDescriptor>::m_Data;
+    using RefBaseWorkload<LogicalBinaryQueueDescriptor>::m_Data;
 
     RefLogicalBinaryWorkload(const LogicalBinaryQueueDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
diff --git a/src/backends/reference/workloads/RefLogicalUnaryWorkload.cpp b/src/backends/reference/workloads/RefLogicalUnaryWorkload.cpp
index bef2bdc..ec0aa0e 100644
--- a/src/backends/reference/workloads/RefLogicalUnaryWorkload.cpp
+++ b/src/backends/reference/workloads/RefLogicalUnaryWorkload.cpp
@@ -19,7 +19,7 @@
 
 RefLogicalUnaryWorkload::RefLogicalUnaryWorkload(const ElementwiseUnaryQueueDescriptor& desc,
                                                  const WorkloadInfo& info)
-    : BaseWorkload<ElementwiseUnaryQueueDescriptor>(desc, info)
+    : RefBaseWorkload<ElementwiseUnaryQueueDescriptor>(desc, info)
 {}
 
 void RefLogicalUnaryWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefLogicalUnaryWorkload.hpp b/src/backends/reference/workloads/RefLogicalUnaryWorkload.hpp
index 008d24f..ebd5826 100644
--- a/src/backends/reference/workloads/RefLogicalUnaryWorkload.hpp
+++ b/src/backends/reference/workloads/RefLogicalUnaryWorkload.hpp
@@ -7,16 +7,16 @@
 
 #include "BaseIterator.hpp"
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefLogicalUnaryWorkload : public BaseWorkload<ElementwiseUnaryQueueDescriptor>
+class RefLogicalUnaryWorkload : public RefBaseWorkload<ElementwiseUnaryQueueDescriptor>
 {
 public:
-    using BaseWorkload<ElementwiseUnaryQueueDescriptor>::m_Data;
+    using RefBaseWorkload<ElementwiseUnaryQueueDescriptor>::m_Data;
 
     RefLogicalUnaryWorkload(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
diff --git a/src/backends/reference/workloads/RefLstmWorkload.cpp b/src/backends/reference/workloads/RefLstmWorkload.cpp
index 1ff6f50..8609811 100644
--- a/src/backends/reference/workloads/RefLstmWorkload.cpp
+++ b/src/backends/reference/workloads/RefLstmWorkload.cpp
@@ -15,7 +15,7 @@
 {
 
 RefLstmWorkload::RefLstmWorkload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info)
-    : BaseWorkload<LstmQueueDescriptor>(descriptor, info)
+    : RefBaseWorkload<LstmQueueDescriptor>(descriptor, info)
     , m_InputToInputWeightsTensor     (AssignScopedTensorHandle(descriptor.m_InputToInputWeights))
     , m_InputToForgetWeightsTensor    (AssignScopedTensorHandle(descriptor.m_InputToForgetWeights))
     , m_InputToCellWeightsTensor      (AssignScopedTensorHandle(descriptor.m_InputToCellWeights))
diff --git a/src/backends/reference/workloads/RefLstmWorkload.hpp b/src/backends/reference/workloads/RefLstmWorkload.hpp
index 72f6360..57526c9 100644
--- a/src/backends/reference/workloads/RefLstmWorkload.hpp
+++ b/src/backends/reference/workloads/RefLstmWorkload.hpp
@@ -7,13 +7,13 @@
 
 #include <armnn/TypesUtils.hpp>
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefLstmWorkload : public BaseWorkload<LstmQueueDescriptor>
+class RefLstmWorkload : public RefBaseWorkload<LstmQueueDescriptor>
 {
 public:
     explicit RefLstmWorkload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/backends/reference/workloads/RefMeanWorkload.cpp b/src/backends/reference/workloads/RefMeanWorkload.cpp
index 7941ce2..23abaf8 100644
--- a/src/backends/reference/workloads/RefMeanWorkload.cpp
+++ b/src/backends/reference/workloads/RefMeanWorkload.cpp
@@ -16,7 +16,7 @@
 {
 
 RefMeanWorkload::RefMeanWorkload(const MeanQueueDescriptor& descriptor, const WorkloadInfo& info)
-  :BaseWorkload<MeanQueueDescriptor>(descriptor, info) {}
+  :RefBaseWorkload<MeanQueueDescriptor>(descriptor, info) {}
 
 void RefMeanWorkload::Execute() const
 {
diff --git a/src/backends/reference/workloads/RefMeanWorkload.hpp b/src/backends/reference/workloads/RefMeanWorkload.hpp
index 2825d66..c4c6a12 100644
--- a/src/backends/reference/workloads/RefMeanWorkload.hpp
+++ b/src/backends/reference/workloads/RefMeanWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include "Decoders.hpp"
@@ -14,7 +14,7 @@
 namespace armnn
 {
 
-class RefMeanWorkload : public BaseWorkload<MeanQueueDescriptor>
+class RefMeanWorkload : public RefBaseWorkload<MeanQueueDescriptor>
 {
 public:
     explicit RefMeanWorkload (const MeanQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/backends/reference/workloads/RefNormalizationWorkload.cpp b/src/backends/reference/workloads/RefNormalizationWorkload.cpp
index 36828ac..613868d 100644
--- a/src/backends/reference/workloads/RefNormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefNormalizationWorkload.cpp
@@ -158,7 +158,7 @@
 
 RefNormalizationWorkload::RefNormalizationWorkload(const NormalizationQueueDescriptor& descriptor,
                                                    const WorkloadInfo& info)
-    : BaseWorkload(descriptor, info)
+    : RefBaseWorkload(descriptor, info)
 {}
 
 void RefNormalizationWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefNormalizationWorkload.hpp b/src/backends/reference/workloads/RefNormalizationWorkload.hpp
index b152072..5218e1e 100644
--- a/src/backends/reference/workloads/RefNormalizationWorkload.hpp
+++ b/src/backends/reference/workloads/RefNormalizationWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefNormalizationWorkload : public BaseWorkload<NormalizationQueueDescriptor>
+class RefNormalizationWorkload : public RefBaseWorkload<NormalizationQueueDescriptor>
 {
 public:
     explicit RefNormalizationWorkload(const NormalizationQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefPadWorkload.hpp b/src/backends/reference/workloads/RefPadWorkload.hpp
index 18c406a..c587105 100644
--- a/src/backends/reference/workloads/RefPadWorkload.hpp
+++ b/src/backends/reference/workloads/RefPadWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefPadWorkload : public BaseWorkload<PadQueueDescriptor>
+class RefPadWorkload : public RefBaseWorkload<PadQueueDescriptor>
 {
 public:
-    using BaseWorkload<PadQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<PadQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefPermuteWorkload.hpp b/src/backends/reference/workloads/RefPermuteWorkload.hpp
index 9424441..d1e4452 100644
--- a/src/backends/reference/workloads/RefPermuteWorkload.hpp
+++ b/src/backends/reference/workloads/RefPermuteWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 #include <armnn/TypesUtils.hpp>
 
diff --git a/src/backends/reference/workloads/RefPooling2dWorkload.hpp b/src/backends/reference/workloads/RefPooling2dWorkload.hpp
index 125fea8..a073e39 100644
--- a/src/backends/reference/workloads/RefPooling2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefPooling2dWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include "Decoders.hpp"
@@ -13,10 +13,10 @@
 
 namespace armnn
 {
-class RefPooling2dWorkload : public BaseWorkload<Pooling2dQueueDescriptor>
+class RefPooling2dWorkload : public RefBaseWorkload<Pooling2dQueueDescriptor>
 {
 public:
-    using BaseWorkload<Pooling2dQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<Pooling2dQueueDescriptor>::RefBaseWorkload;
 
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
diff --git a/src/backends/reference/workloads/RefPooling3dWorkload.hpp b/src/backends/reference/workloads/RefPooling3dWorkload.hpp
index 911c438..92bc476 100644
--- a/src/backends/reference/workloads/RefPooling3dWorkload.hpp
+++ b/src/backends/reference/workloads/RefPooling3dWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include "Decoders.hpp"
@@ -13,10 +13,10 @@
 
 namespace armnn
 {
-class RefPooling3dWorkload : public BaseWorkload<Pooling3dQueueDescriptor>
+class RefPooling3dWorkload : public RefBaseWorkload<Pooling3dQueueDescriptor>
 {
 public:
-    using BaseWorkload<Pooling3dQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<Pooling3dQueueDescriptor>::RefBaseWorkload;
 
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
diff --git a/src/backends/reference/workloads/RefPreluWorkload.cpp b/src/backends/reference/workloads/RefPreluWorkload.cpp
index c1d8de2..94eeea1 100644
--- a/src/backends/reference/workloads/RefPreluWorkload.cpp
+++ b/src/backends/reference/workloads/RefPreluWorkload.cpp
@@ -15,7 +15,7 @@
 
 RefPreluWorkload::RefPreluWorkload(const PreluQueueDescriptor& descriptor,
                                    const WorkloadInfo& info)
-    : BaseWorkload(descriptor, info)
+    : RefBaseWorkload(descriptor, info)
 {}
 
 void RefPreluWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefPreluWorkload.hpp b/src/backends/reference/workloads/RefPreluWorkload.hpp
index b5c97df..51ba2c1 100644
--- a/src/backends/reference/workloads/RefPreluWorkload.hpp
+++ b/src/backends/reference/workloads/RefPreluWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefPreluWorkload : public BaseWorkload<PreluQueueDescriptor>
+class RefPreluWorkload : public RefBaseWorkload<PreluQueueDescriptor>
 {
 public:
     explicit RefPreluWorkload(const PreluQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefQLstmWorkload.cpp b/src/backends/reference/workloads/RefQLstmWorkload.cpp
index dc29d0b..74f5f1e 100644
--- a/src/backends/reference/workloads/RefQLstmWorkload.cpp
+++ b/src/backends/reference/workloads/RefQLstmWorkload.cpp
@@ -14,7 +14,7 @@
 {
 
 RefQLstmWorkload::RefQLstmWorkload(const QLstmQueueDescriptor &descriptor, const WorkloadInfo &info)
-        : BaseWorkload<QLstmQueueDescriptor>(descriptor, info)
+        : RefBaseWorkload<QLstmQueueDescriptor>(descriptor, info)
         , m_InputToInputWeightsTensor     (AssignScopedTensorHandle(descriptor.m_InputToInputWeights))
         , m_InputToForgetWeightsTensor    (AssignScopedTensorHandle(descriptor.m_InputToForgetWeights))
         , m_InputToCellWeightsTensor      (AssignScopedTensorHandle(descriptor.m_InputToCellWeights))
diff --git a/src/backends/reference/workloads/RefQLstmWorkload.hpp b/src/backends/reference/workloads/RefQLstmWorkload.hpp
index 093cfd1..0e64a38 100644
--- a/src/backends/reference/workloads/RefQLstmWorkload.hpp
+++ b/src/backends/reference/workloads/RefQLstmWorkload.hpp
@@ -7,13 +7,13 @@
 
 #include <armnn/TypesUtils.hpp>
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefQLstmWorkload : public BaseWorkload<QLstmQueueDescriptor>
+class RefQLstmWorkload : public RefBaseWorkload<QLstmQueueDescriptor>
 {
 public:
     explicit RefQLstmWorkload(const QLstmQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/backends/reference/workloads/RefQuantizeWorkload.cpp b/src/backends/reference/workloads/RefQuantizeWorkload.cpp
index 35791e6..10ef0e5 100644
--- a/src/backends/reference/workloads/RefQuantizeWorkload.cpp
+++ b/src/backends/reference/workloads/RefQuantizeWorkload.cpp
@@ -29,7 +29,7 @@
 } //namespace
 
 RefQuantizeWorkload::RefQuantizeWorkload(const QuantizeQueueDescriptor& descriptor, const WorkloadInfo &info)
-    : BaseWorkload(descriptor, info)
+    : RefBaseWorkload(descriptor, info)
     , m_NumElements(info.m_InputTensorInfos[0].GetNumElements())
 {
 }
diff --git a/src/backends/reference/workloads/RefQuantizeWorkload.hpp b/src/backends/reference/workloads/RefQuantizeWorkload.hpp
index a32efa7..e382410 100644
--- a/src/backends/reference/workloads/RefQuantizeWorkload.hpp
+++ b/src/backends/reference/workloads/RefQuantizeWorkload.hpp
@@ -5,14 +5,14 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "Decoders.hpp"
 #include "Encoders.hpp"
 
 namespace armnn {
 
-class RefQuantizeWorkload : public BaseWorkload<QuantizeQueueDescriptor>
+class RefQuantizeWorkload : public RefBaseWorkload<QuantizeQueueDescriptor>
 {
 public:
     RefQuantizeWorkload(const QuantizeQueueDescriptor& descriptor, const WorkloadInfo &info);
diff --git a/src/backends/reference/workloads/RefRankWorkload.hpp b/src/backends/reference/workloads/RefRankWorkload.hpp
index e1f30c5..000828f 100644
--- a/src/backends/reference/workloads/RefRankWorkload.hpp
+++ b/src/backends/reference/workloads/RefRankWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include "RefWorkloadUtils.hpp"
@@ -13,10 +13,10 @@
 namespace armnn
 {
 
-struct RefRankWorkload : public BaseWorkload<RankQueueDescriptor>
+struct RefRankWorkload : public RefBaseWorkload<RankQueueDescriptor>
 {
 public:
-    using BaseWorkload<RankQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<RankQueueDescriptor>::RefBaseWorkload;
     virtual void Execute() const override
     {
         Execute(m_Data.m_Inputs, m_Data.m_Outputs);
diff --git a/src/backends/reference/workloads/RefReduceWorkload.cpp b/src/backends/reference/workloads/RefReduceWorkload.cpp
index 821e828..62881da 100644
--- a/src/backends/reference/workloads/RefReduceWorkload.cpp
+++ b/src/backends/reference/workloads/RefReduceWorkload.cpp
@@ -16,7 +16,7 @@
 RefReduceWorkload::RefReduceWorkload(
     const ReduceQueueDescriptor& descriptor,
     const WorkloadInfo& info)
-    : BaseWorkload<ReduceQueueDescriptor>(descriptor, info) {}
+    : RefBaseWorkload<ReduceQueueDescriptor>(descriptor, info) {}
 
 void RefReduceWorkload::Execute() const
 {
diff --git a/src/backends/reference/workloads/RefReduceWorkload.hpp b/src/backends/reference/workloads/RefReduceWorkload.hpp
index d2280cc..d759bc2 100644
--- a/src/backends/reference/workloads/RefReduceWorkload.hpp
+++ b/src/backends/reference/workloads/RefReduceWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefReduceWorkload : public BaseWorkload<ReduceQueueDescriptor>
+class RefReduceWorkload : public RefBaseWorkload<ReduceQueueDescriptor>
 {
 public:
     explicit RefReduceWorkload(const ReduceQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefReshapeWorkload.hpp b/src/backends/reference/workloads/RefReshapeWorkload.hpp
index 26a86c1..7596685 100644
--- a/src/backends/reference/workloads/RefReshapeWorkload.hpp
+++ b/src/backends/reference/workloads/RefReshapeWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefReshapeWorkload : public BaseWorkload<ReshapeQueueDescriptor>
+class RefReshapeWorkload : public RefBaseWorkload<ReshapeQueueDescriptor>
 {
 public:
-    using BaseWorkload<ReshapeQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<ReshapeQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefResizeWorkload.hpp b/src/backends/reference/workloads/RefResizeWorkload.hpp
index 82949ed..f774719 100644
--- a/src/backends/reference/workloads/RefResizeWorkload.hpp
+++ b/src/backends/reference/workloads/RefResizeWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefResizeWorkload : public BaseWorkload<ResizeQueueDescriptor>
+class RefResizeWorkload : public RefBaseWorkload<ResizeQueueDescriptor>
 {
 public:
-    using BaseWorkload<ResizeQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<ResizeQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefShapeWorkload.hpp b/src/backends/reference/workloads/RefShapeWorkload.hpp
index 209cccd..b7ed761 100644
--- a/src/backends/reference/workloads/RefShapeWorkload.hpp
+++ b/src/backends/reference/workloads/RefShapeWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include "RefWorkloadUtils.hpp"
@@ -13,10 +13,10 @@
 namespace armnn
 {
 
-struct RefShapeWorkload : public BaseWorkload<ShapeQueueDescriptor>
+struct RefShapeWorkload : public RefBaseWorkload<ShapeQueueDescriptor>
 {
 public:
-    using BaseWorkload<ShapeQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<ShapeQueueDescriptor>::RefBaseWorkload;
     virtual void Execute() const override
     {
         Execute(m_Data.m_Inputs, m_Data.m_Outputs);
diff --git a/src/backends/reference/workloads/RefSliceWorkload.hpp b/src/backends/reference/workloads/RefSliceWorkload.hpp
index 69dae5a..b9dca86 100644
--- a/src/backends/reference/workloads/RefSliceWorkload.hpp
+++ b/src/backends/reference/workloads/RefSliceWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefSliceWorkload : public BaseWorkload<SliceQueueDescriptor>
+class RefSliceWorkload : public RefBaseWorkload<SliceQueueDescriptor>
 {
 public:
-    using BaseWorkload<SliceQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<SliceQueueDescriptor>::RefBaseWorkload;
 
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
diff --git a/src/backends/reference/workloads/RefSoftmaxWorkload.hpp b/src/backends/reference/workloads/RefSoftmaxWorkload.hpp
index 42dbb53..cac102a 100644
--- a/src/backends/reference/workloads/RefSoftmaxWorkload.hpp
+++ b/src/backends/reference/workloads/RefSoftmaxWorkload.hpp
@@ -5,16 +5,16 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefSoftmaxWorkload : public BaseWorkload<SoftmaxQueueDescriptor>
+class RefSoftmaxWorkload : public RefBaseWorkload<SoftmaxQueueDescriptor>
 {
 public:
-    using BaseWorkload<SoftmaxQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<SoftmaxQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
index ec764c7..eb2d93f 100644
--- a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
+++ b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
@@ -4,17 +4,17 @@
 //
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 #include <armnn/TypesUtils.hpp>
 
 namespace armnn
 {
 
-class RefSpaceToBatchNdWorkload : public BaseWorkload<SpaceToBatchNdQueueDescriptor>
+class RefSpaceToBatchNdWorkload : public RefBaseWorkload<SpaceToBatchNdQueueDescriptor>
 {
 public:
-    using BaseWorkload<SpaceToBatchNdQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<SpaceToBatchNdQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefSpaceToDepthWorkload.hpp b/src/backends/reference/workloads/RefSpaceToDepthWorkload.hpp
index bc71fde..17f8d2f 100644
--- a/src/backends/reference/workloads/RefSpaceToDepthWorkload.hpp
+++ b/src/backends/reference/workloads/RefSpaceToDepthWorkload.hpp
@@ -4,17 +4,17 @@
 //
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 #include <armnn/TypesUtils.hpp>
 
 namespace armnn
 {
 
-class RefSpaceToDepthWorkload : public BaseWorkload<SpaceToDepthQueueDescriptor>
+class RefSpaceToDepthWorkload : public RefBaseWorkload<SpaceToDepthQueueDescriptor>
 {
 public:
-    using BaseWorkload<SpaceToDepthQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<SpaceToDepthQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefSplitterWorkload.hpp b/src/backends/reference/workloads/RefSplitterWorkload.hpp
index 28dc83d..0b72bb9 100644
--- a/src/backends/reference/workloads/RefSplitterWorkload.hpp
+++ b/src/backends/reference/workloads/RefSplitterWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 #include "Decoders.hpp"
 #include "Encoders.hpp"
@@ -13,10 +13,10 @@
 namespace armnn
 {
 
-class RefSplitterWorkload : public BaseWorkload<SplitterQueueDescriptor>
+class RefSplitterWorkload : public RefBaseWorkload<SplitterQueueDescriptor>
 {
 public:
-    using BaseWorkload<SplitterQueueDescriptor>::BaseWorkload;
+    using RefBaseWorkload<SplitterQueueDescriptor>::RefBaseWorkload;
     void Execute() const override;
     void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 private:
diff --git a/src/backends/reference/workloads/RefStackWorkload.cpp b/src/backends/reference/workloads/RefStackWorkload.cpp
index 3f7fd7b..f57e6e0 100644
--- a/src/backends/reference/workloads/RefStackWorkload.cpp
+++ b/src/backends/reference/workloads/RefStackWorkload.cpp
@@ -15,7 +15,7 @@
 
 RefStackWorkload::RefStackWorkload(const StackQueueDescriptor& descriptor,
                                    const WorkloadInfo& info)
-    : BaseWorkload(descriptor, info)
+    : RefBaseWorkload(descriptor, info)
 {}
 
 void RefStackWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefStackWorkload.hpp b/src/backends/reference/workloads/RefStackWorkload.hpp
index fbca11b..19f4a7b 100644
--- a/src/backends/reference/workloads/RefStackWorkload.hpp
+++ b/src/backends/reference/workloads/RefStackWorkload.hpp
@@ -5,13 +5,13 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 namespace armnn
 {
 
-class RefStackWorkload : public BaseWorkload<StackQueueDescriptor>
+class RefStackWorkload : public RefBaseWorkload<StackQueueDescriptor>
 {
 public:
     explicit RefStackWorkload(const StackQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefStridedSliceWorkload.cpp b/src/backends/reference/workloads/RefStridedSliceWorkload.cpp
index 336a687..41fe4c3 100644
--- a/src/backends/reference/workloads/RefStridedSliceWorkload.cpp
+++ b/src/backends/reference/workloads/RefStridedSliceWorkload.cpp
@@ -12,7 +12,7 @@
 
 RefStridedSliceWorkload::RefStridedSliceWorkload(const StridedSliceQueueDescriptor& descriptor,
                                                  const WorkloadInfo& info)
-    : BaseWorkload(descriptor, info)
+    : RefBaseWorkload(descriptor, info)
 {}
 
 void RefStridedSliceWorkload::Execute() const
diff --git a/src/backends/reference/workloads/RefStridedSliceWorkload.hpp b/src/backends/reference/workloads/RefStridedSliceWorkload.hpp
index d2ffca7..ea443cf 100644
--- a/src/backends/reference/workloads/RefStridedSliceWorkload.hpp
+++ b/src/backends/reference/workloads/RefStridedSliceWorkload.hpp
@@ -5,12 +5,12 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 namespace armnn
 {
 
-class RefStridedSliceWorkload : public BaseWorkload<StridedSliceQueueDescriptor>
+class RefStridedSliceWorkload : public RefBaseWorkload<StridedSliceQueueDescriptor>
 {
 public:
     RefStridedSliceWorkload(const StridedSliceQueueDescriptor& descriptor, const WorkloadInfo& info);
diff --git a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp
index 8665648..64a2d4c 100644
--- a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp
@@ -15,7 +15,7 @@
 
 RefTransposeConvolution2dWorkload::RefTransposeConvolution2dWorkload(
     const TransposeConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) :
-    BaseWorkload<TransposeConvolution2dQueueDescriptor>(descriptor, info)
+    RefBaseWorkload<TransposeConvolution2dQueueDescriptor>(descriptor, info)
 {
     // set up weights decoder
     m_Weights = std::make_unique<ScopedTensorHandle>(*(descriptor.m_Weight));
diff --git a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.hpp b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.hpp
index aa2546f..6bcee9a 100644
--- a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.hpp
@@ -9,12 +9,12 @@
 #include "Encoders.hpp"
 
 #include <armnn/backends/TensorHandle.hpp>
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 namespace armnn
 {
 
-class RefTransposeConvolution2dWorkload : public BaseWorkload<TransposeConvolution2dQueueDescriptor>
+class RefTransposeConvolution2dWorkload : public RefBaseWorkload<TransposeConvolution2dQueueDescriptor>
 {
 public:
     RefTransposeConvolution2dWorkload(const TransposeConvolution2dQueueDescriptor& descriptor,
diff --git a/src/backends/reference/workloads/RefTransposeWorkload.hpp b/src/backends/reference/workloads/RefTransposeWorkload.hpp
index bf59de7..b8c3649 100644
--- a/src/backends/reference/workloads/RefTransposeWorkload.hpp
+++ b/src/backends/reference/workloads/RefTransposeWorkload.hpp
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 
 #include <armnn/TypesUtils.hpp>
 
diff --git a/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp b/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp
index 311fa18..d447a46 100644
--- a/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp
+++ b/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.cpp
@@ -19,7 +19,7 @@
 RefUnidirectionalSequenceLstmWorkload::RefUnidirectionalSequenceLstmWorkload(
     const UnidirectionalSequenceLstmQueueDescriptor& descriptor,
     const WorkloadInfo& info)
-    : BaseWorkload<UnidirectionalSequenceLstmQueueDescriptor>(descriptor, info)
+    : RefBaseWorkload<UnidirectionalSequenceLstmQueueDescriptor>(descriptor, info)
     , m_InputToInputWeightsTensor     (AssignScopedTensorHandle(descriptor.m_InputToInputWeights))
     , m_InputToForgetWeightsTensor    (AssignScopedTensorHandle(descriptor.m_InputToForgetWeights))
     , m_InputToCellWeightsTensor      (AssignScopedTensorHandle(descriptor.m_InputToCellWeights))
diff --git a/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.hpp b/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.hpp
index d0c000f..7a91cee 100644
--- a/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.hpp
+++ b/src/backends/reference/workloads/RefUnidirectionalSequenceLstmWorkload.hpp
@@ -7,7 +7,7 @@
 
 #include <armnn/TypesUtils.hpp>
 
-#include <armnn/backends/Workload.hpp>
+#include "RefBaseWorkload.hpp"
 #include <armnn/backends/WorkloadData.hpp>
 
 #include "Encoders.hpp"
@@ -16,7 +16,7 @@
 namespace armnn
 {
 
-class RefUnidirectionalSequenceLstmWorkload : public BaseWorkload<UnidirectionalSequenceLstmQueueDescriptor>
+class RefUnidirectionalSequenceLstmWorkload : public RefBaseWorkload<UnidirectionalSequenceLstmQueueDescriptor>
 {
 public:
     explicit RefUnidirectionalSequenceLstmWorkload(const UnidirectionalSequenceLstmQueueDescriptor& descriptor,