IVGCVSW-5787 Add/Update Execute() implementations in RefActivationWorkload

 * Added multithreaded StridedSliceEndToEndTest
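 * Refactoring pattern applied across the Ref workloads (illustrative
   sketch only, condensed from the RefActivationWorkload hunks below and
   not part of the patch itself): the public Execute() keeps its
   signature and forwards the queue descriptor's tensors, the new
   ExecuteAsync() forwards the per-call tensors from its
   WorkingMemDescriptor, and both delegate to a private
   Execute(inputs, outputs) overload that does the actual work.

       void RefActivationWorkload::Execute() const
       {
           Execute(m_Data.m_Inputs, m_Data.m_Outputs);
       }

       void RefActivationWorkload::ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)
       {
           Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
       }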

Signed-off-by: Finn Williams <Finn.Williams@arm.com>
Change-Id: I4579db7b5959e0a22256f1bda00238c22e611dec
diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp
index 521854b..0839c1c 100644
--- a/src/backends/reference/test/RefEndToEndTests.cpp
+++ b/src/backends/reference/test/RefEndToEndTests.cpp
@@ -1341,6 +1341,11 @@
 {
     armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends);
 }
+
+BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedMultiThreadedEndToEndTest)
+{
+    armnn::experimental::StridedSlicedMultiThreadedEndToEndTest<armnn::DataType::Float32>(defaultBackends);
+}
 #endif
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/reference/workloads/Concatenate.cpp b/src/backends/reference/workloads/Concatenate.cpp
index a85e34e..a0e0abf 100644
--- a/src/backends/reference/workloads/Concatenate.cpp
+++ b/src/backends/reference/workloads/Concatenate.cpp
@@ -11,11 +11,13 @@
 namespace armnn
 {
 
-void Concatenate(const ConcatQueueDescriptor &data)
+void Concatenate(const ConcatQueueDescriptor &data,
+                 std::vector<ITensorHandle*> inputs,
+                 std::vector<ITensorHandle*> outputs)
 {
-    const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);
+    const TensorInfo& outputInfo0 = GetTensorInfo(outputs[0]);
 
-    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputInfo0, data.m_Outputs[0]->Map());
+    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputInfo0, outputs[0]->Map());
     Encoder<float>& encoder = *encoderPtr;
 
     for (unsigned int index = 0 ; index < outputInfo0.GetNumElements(); ++index)
@@ -37,7 +39,7 @@
             ConcatQueueDescriptor::ViewOrigin const& view = data.m_ViewOrigins[viewIdx];
 
             //Split view extents are defined by the size of (the corresponding) input tensor.
-            const TensorInfo& inputInfo = GetTensorInfo(data.m_Inputs[viewIdx]);
+            const TensorInfo& inputInfo = GetTensorInfo(inputs[viewIdx]);
             ARMNN_ASSERT(inputInfo.GetNumDimensions() == outputInfo0.GetNumDimensions());
 
             // Check all dimensions to see if this element is inside the given input view.
@@ -57,7 +59,7 @@
             if (insideView)
             {
                 std::unique_ptr<Decoder<float>> decoderPtr =
-                    MakeDecoder<float>(inputInfo, data.m_Inputs[viewIdx]->Map());
+                    MakeDecoder<float>(inputInfo, inputs[viewIdx]->Map());
                 Decoder<float>& decoder = *decoderPtr;
                 unsigned int inIndex = 0;
                 unsigned int dimensionStride = 1;
diff --git a/src/backends/reference/workloads/Concatenate.hpp b/src/backends/reference/workloads/Concatenate.hpp
index 75e5f8c..e0264b0 100644
--- a/src/backends/reference/workloads/Concatenate.hpp
+++ b/src/backends/reference/workloads/Concatenate.hpp
@@ -10,5 +10,7 @@
 
 namespace armnn
 {
-void Concatenate(const ConcatQueueDescriptor &data);
+void Concatenate(const ConcatQueueDescriptor &data,
+                 std::vector<ITensorHandle*> inputs,
+                 std::vector<ITensorHandle*> outputs);
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefActivationWorkload.cpp b/src/backends/reference/workloads/RefActivationWorkload.cpp
index a26a639..7795867 100644
--- a/src/backends/reference/workloads/RefActivationWorkload.cpp
+++ b/src/backends/reference/workloads/RefActivationWorkload.cpp
@@ -17,17 +17,28 @@
 
 void RefActivationWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefActivationWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefActivationWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefActivationWorkload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
-    Activation(*MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map()),
-               *MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map()),
+    Activation(*MakeDecoder<float>(inputInfo, inputs[0]->Map()),
+               *MakeEncoder<float>(outputInfo, outputs[0]->Map()),
                inputInfo,
                m_Data.m_Parameters.m_Function,
                m_Data.m_Parameters.m_A,
                m_Data.m_Parameters.m_B);
 }
 
+
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefActivationWorkload.hpp b/src/backends/reference/workloads/RefActivationWorkload.hpp
index 5b2377e..429fb60 100644
--- a/src/backends/reference/workloads/RefActivationWorkload.hpp
+++ b/src/backends/reference/workloads/RefActivationWorkload.hpp
@@ -15,7 +15,11 @@
 {
 public:
     using BaseWorkload<ActivationQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp b/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp
index bf8649f..77167a8 100644
--- a/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp
+++ b/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp
@@ -18,16 +18,27 @@
         const WorkloadInfo& info)
         : BaseWorkload<ArgMinMaxQueueDescriptor>(descriptor, info) {}
 
+
 void RefArgMinMaxWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefArgMinMaxWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefArgMinMaxWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefArgMinMaxWorkload_Execute");
 
-    const TensorInfo &inputTensorInfo = GetTensorInfo(m_Data.m_Inputs[0]);
+    const TensorInfo &inputTensorInfo = GetTensorInfo(inputs[0]);
 
-    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputTensorInfo, m_Data.m_Inputs[0]->Map());
+    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputTensorInfo, inputs[0]->Map());
     Decoder<float> &decoder = *decoderPtr;
 
-    const TensorInfo &outputTensorInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo &outputTensorInfo = GetTensorInfo(outputs[0]);
 
     if (outputTensorInfo.GetDataType() == armnn::DataType::Signed32) {
         int32_t *output = GetOutputTensorData<int32_t>(0, m_Data);
diff --git a/src/backends/reference/workloads/RefArgMinMaxWorkload.hpp b/src/backends/reference/workloads/RefArgMinMaxWorkload.hpp
index 97b7077..df9ebca 100644
--- a/src/backends/reference/workloads/RefArgMinMaxWorkload.hpp
+++ b/src/backends/reference/workloads/RefArgMinMaxWorkload.hpp
@@ -16,6 +16,10 @@
     explicit RefArgMinMaxWorkload(const ArgMinMaxQueueDescriptor& descriptor,
                                   const WorkloadInfo& info);
 
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 } //namespace armnn
\ No newline at end of file
diff --git a/src/backends/reference/workloads/RefBatchNormalizationWorkload.cpp b/src/backends/reference/workloads/RefBatchNormalizationWorkload.cpp
index 21fcdab..e106889 100644
--- a/src/backends/reference/workloads/RefBatchNormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefBatchNormalizationWorkload.cpp
@@ -24,6 +24,17 @@
 
 void RefBatchNormalizationWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefBatchNormalizationWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefBatchNormalizationWorkload::Execute(std::vector<ITensorHandle*> inputs,
+                                            std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefBatchNormalizationWorkload_Execute");
 
     std::unique_ptr<Decoder<float>> meanDecoder     = MakeDecoder<float>(m_Mean->GetTensorInfo(),
@@ -34,10 +45,10 @@
                                                                          m_Gamma->Map(true));
     std::unique_ptr<Decoder<float>> betaDecoder     = MakeDecoder<float>(m_Beta->GetTensorInfo(),
                                                                          m_Beta->Map(true));
-    std::unique_ptr<Decoder<float>> inputDecoder    = MakeDecoder<float>(GetTensorInfo(m_Data.m_Inputs[0]),
-                                                                         m_Data.m_Inputs[0]->Map());
-    std::unique_ptr<Encoder<float>> outputEncoder   = MakeEncoder<float>(GetTensorInfo(m_Data.m_Outputs[0]),
-                                                                         m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Decoder<float>> inputDecoder    = MakeDecoder<float>(GetTensorInfo(inputs[0]),
+                                                                         inputs[0]->Map());
+    std::unique_ptr<Encoder<float>> outputEncoder   = MakeEncoder<float>(GetTensorInfo(outputs[0]),
+                                                                         outputs[0]->Map());
 
     BatchNormImpl(m_Data, *meanDecoder, *varianceDecoder, *betaDecoder, *gammaDecoder, *inputDecoder, *outputEncoder);
 }
diff --git a/src/backends/reference/workloads/RefBatchNormalizationWorkload.hpp b/src/backends/reference/workloads/RefBatchNormalizationWorkload.hpp
index 53d01f6..a8a72ef 100644
--- a/src/backends/reference/workloads/RefBatchNormalizationWorkload.hpp
+++ b/src/backends/reference/workloads/RefBatchNormalizationWorkload.hpp
@@ -16,9 +16,11 @@
 public:
     explicit RefBatchNormalizationWorkload(const BatchNormalizationQueueDescriptor& descriptor,
                                            const WorkloadInfo& info);
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
 private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
     std::unique_ptr<ScopedCpuTensorHandle> m_Mean;
     std::unique_ptr<ScopedCpuTensorHandle> m_Variance;
     std::unique_ptr<ScopedCpuTensorHandle> m_Beta;
diff --git a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.cpp b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.cpp
index c21ef76..441d2ba 100644
--- a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.cpp
+++ b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.cpp
@@ -13,13 +13,23 @@
 
 void RefBatchToSpaceNdWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefBatchToSpaceNdWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefBatchToSpaceNdWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefBatchToSpaceNdWorkload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
-    std::unique_ptr<Decoder<float>> inputDecoder  = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map());
-    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Decoder<float>> inputDecoder  = MakeDecoder<float>(inputInfo, inputs[0]->Map());
+    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
 
     BatchToSpaceNd(m_Data.m_Parameters.m_DataLayout, inputInfo, outputInfo, m_Data.m_Parameters.m_BlockShape,
                    m_Data.m_Parameters.m_Crops, *inputDecoder, *outputEncoder);
diff --git a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp
index 60577ba..07c800d 100644
--- a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp
+++ b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp
@@ -16,7 +16,11 @@
 public:
     using BaseWorkload<BatchToSpaceNdQueueDescriptor>::BaseWorkload;
 
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } // namespace armnn
\ No newline at end of file
diff --git a/src/backends/reference/workloads/RefComparisonWorkload.cpp b/src/backends/reference/workloads/RefComparisonWorkload.cpp
index 52ad9a2..03df7a4 100644
--- a/src/backends/reference/workloads/RefComparisonWorkload.cpp
+++ b/src/backends/reference/workloads/RefComparisonWorkload.cpp
@@ -26,9 +26,15 @@
 
 void RefComparisonWorkload::PostAllocationConfigure()
 {
-    const TensorInfo& inputInfo0 = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& inputInfo1 = GetTensorInfo(m_Data.m_Inputs[1]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    PostAllocationConfigure(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefComparisonWorkload::PostAllocationConfigure(std::vector<ITensorHandle*> inputs,
+                                                    std::vector<ITensorHandle*> outputs)
+{
+    const TensorInfo& inputInfo0 = GetTensorInfo(inputs[0]);
+    const TensorInfo& inputInfo1 = GetTensorInfo(inputs[1]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
     m_Input0 = MakeDecoder<InType>(inputInfo0);
     m_Input1 = MakeDecoder<InType>(inputInfo1);
@@ -38,19 +44,31 @@
 
 void RefComparisonWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefComparisonWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    PostAllocationConfigure(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefComparisonWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefComparisonWorkload_Execute");
 
-    const TensorInfo& inputInfo0 = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& inputInfo1 = GetTensorInfo(m_Data.m_Inputs[1]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo0 = GetTensorInfo(inputs[0]);
+    const TensorInfo& inputInfo1 = GetTensorInfo(inputs[1]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
     const TensorShape& inShape0 = inputInfo0.GetShape();
     const TensorShape& inShape1 = inputInfo1.GetShape();
     const TensorShape& outShape = outputInfo.GetShape();
 
-    m_Input0->Reset(m_Data.m_Inputs[0]->Map());
-    m_Input1->Reset(m_Data.m_Inputs[1]->Map());
-    m_Output->Reset(m_Data.m_Outputs[0]->Map());
+    m_Input0->Reset(inputs[0]->Map());
+    m_Input1->Reset(inputs[1]->Map());
+    m_Output->Reset(outputs[0]->Map());
 
     using EqualFunction          = ElementwiseBinaryFunction<std::equal_to<InType>>;
     using GreaterFunction        = ElementwiseBinaryFunction<std::greater<InType>>;
diff --git a/src/backends/reference/workloads/RefComparisonWorkload.hpp b/src/backends/reference/workloads/RefComparisonWorkload.hpp
index a19e4a0..de0144c 100644
--- a/src/backends/reference/workloads/RefComparisonWorkload.hpp
+++ b/src/backends/reference/workloads/RefComparisonWorkload.hpp
@@ -21,8 +21,11 @@
     RefComparisonWorkload(const ComparisonQueueDescriptor& descriptor, const WorkloadInfo& info);
     void PostAllocationConfigure() override;
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
 private:
+    void PostAllocationConfigure(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs);
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
     using InType  = float;
     using OutType = bool;
 
diff --git a/src/backends/reference/workloads/RefConcatWorkload.cpp b/src/backends/reference/workloads/RefConcatWorkload.cpp
index e606649..c04c053 100644
--- a/src/backends/reference/workloads/RefConcatWorkload.cpp
+++ b/src/backends/reference/workloads/RefConcatWorkload.cpp
@@ -14,8 +14,18 @@
 
 void RefConcatWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefConcatWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefConcatWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConcatWorkload_Execute");
-    Concatenate(m_Data);
+    Concatenate(m_Data, inputs, outputs);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefConcatWorkload.hpp b/src/backends/reference/workloads/RefConcatWorkload.hpp
index 0be28bb..f4e1aa8 100644
--- a/src/backends/reference/workloads/RefConcatWorkload.hpp
+++ b/src/backends/reference/workloads/RefConcatWorkload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using BaseWorkload<ConcatQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefConstantWorkload.cpp b/src/backends/reference/workloads/RefConstantWorkload.cpp
index d3e65e6..6290237 100644
--- a/src/backends/reference/workloads/RefConstantWorkload.cpp
+++ b/src/backends/reference/workloads/RefConstantWorkload.cpp
@@ -20,21 +20,20 @@
     const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info)
     : BaseWorkload<ConstantQueueDescriptor>(descriptor, info) {}
 
-void RefConstantWorkload::PostAllocationConfigure()
-{
-    const ConstantQueueDescriptor& data = this->m_Data;
-
-    ARMNN_ASSERT(data.m_LayerOutput != nullptr);
-
-    const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[0]);
-    ARMNN_ASSERT(data.m_LayerOutput->GetTensorInfo().GetNumBytes() == outputInfo.GetNumBytes());
-
-    memcpy(GetOutputTensorData<void>(0, data), data.m_LayerOutput->GetConstTensor<void>(),
-        outputInfo.GetNumBytes());
-}
-
 void RefConstantWorkload::Execute() const
 {
+    Execute(m_Data.m_Outputs);
+}
+
+void RefConstantWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Outputs);
+}
+
+void RefConstantWorkload::Execute(std::vector<ITensorHandle*> outputs) const
+{
+    memcpy(outputs[0]->Map(), m_Data.m_LayerOutput->GetConstTensor<void>(), GetTensorInfo(outputs[0]).GetNumBytes());
+
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConstantWorkload_Execute");
 }
 
diff --git a/src/backends/reference/workloads/RefConstantWorkload.hpp b/src/backends/reference/workloads/RefConstantWorkload.hpp
index ada488a..9af5903 100644
--- a/src/backends/reference/workloads/RefConstantWorkload.hpp
+++ b/src/backends/reference/workloads/RefConstantWorkload.hpp
@@ -19,8 +19,10 @@
 public:
     RefConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info);
 
-    void PostAllocationConfigure() override;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.cpp b/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.cpp
index c4b5416..70e377d 100644
--- a/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.cpp
+++ b/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.cpp
@@ -15,12 +15,23 @@
 
 void RefConvertBf16ToFp32Workload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefConvertBf16ToFp32Workload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefConvertBf16ToFp32Workload::Execute(std::vector<ITensorHandle*> inputs,
+                                           std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvertBf16ToFp32Workload_Execute");
 
-    const BFloat16* const input = GetInputTensorDataBFloat16(0, m_Data);
-    float* const output = GetOutputTensorDataFloat(0, m_Data);
+    const BFloat16* const input = reinterpret_cast<const BFloat16*>(inputs[0]->Map());
+    float* const output = reinterpret_cast<float*>(outputs[0]->Map());
 
-    unsigned int numElements = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements();
+    unsigned int numElements = GetTensorInfo(inputs[0]).GetNumElements();
     armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(input, numElements, output);
 }
 
diff --git a/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp b/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp
index 87cdc3e..9061362 100644
--- a/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp
+++ b/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using BFloat16ToFloat32Workload<ConvertBf16ToFp32QueueDescriptor>::BFloat16ToFloat32Workload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.cpp b/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.cpp
index ef813eb..347132d 100644
--- a/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.cpp
+++ b/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.cpp
@@ -15,12 +15,23 @@
 
 void RefConvertFp16ToFp32Workload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefConvertFp16ToFp32Workload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefConvertFp16ToFp32Workload::Execute(std::vector<ITensorHandle*> inputs,
+                                           std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvertFp16ToFp32Workload_Execute");
 
-    const Half* const input = GetInputTensorDataHalf(0, m_Data);
-    float* const output = GetOutputTensorDataFloat(0, m_Data);
+    const Half* const input = reinterpret_cast<const Half*>(inputs[0]->Map());
+    float* const output = reinterpret_cast<float*>(outputs[0]->Map());
 
-    unsigned int numElements = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements();
+    unsigned int numElements = GetTensorInfo(inputs[0]).GetNumElements();
     armnnUtils::FloatingPointConverter::ConvertFloat16To32(input, numElements, output);
 }
 
diff --git a/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.hpp b/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.hpp
index 7c58e9f..99ab9e9 100644
--- a/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.hpp
+++ b/src/backends/reference/workloads/RefConvertFp16ToFp32Workload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>::Float16ToFloat32Workload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.cpp b/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.cpp
index 181b236..7fe302a 100644
--- a/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.cpp
+++ b/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.cpp
@@ -15,12 +15,23 @@
 
 void RefConvertFp32ToBf16Workload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefConvertFp32ToBf16Workload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefConvertFp32ToBf16Workload::Execute(std::vector<ITensorHandle*> inputs,
+                                           std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvertFp32ToBf16Workload_Execute");
 
-    const float* const input = GetInputTensorDataFloat(0, m_Data);
-    BFloat16* const output = GetOutputTensorDataBFloat16(0, m_Data);
+    const float* const input = reinterpret_cast<const float*>(inputs[0]->Map());
+    BFloat16* const output = reinterpret_cast<BFloat16*>(outputs[0]->Map());
 
-    unsigned int numElements = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements();
+    unsigned int numElements = GetTensorInfo(inputs[0]).GetNumElements();
     armnnUtils::FloatingPointConverter::ConvertFloat32ToBFloat16(input, numElements, output);
 }
 
diff --git a/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.hpp b/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.hpp
index 409603b..694032c 100644
--- a/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.hpp
+++ b/src/backends/reference/workloads/RefConvertFp32ToBf16Workload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using Float32ToBFloat16Workload<ConvertFp32ToBf16QueueDescriptor>::Float32ToBFloat16Workload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.cpp b/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.cpp
index c68960f..be13458 100644
--- a/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.cpp
+++ b/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.cpp
@@ -16,13 +16,24 @@
 
 void RefConvertFp32ToFp16Workload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefConvertFp32ToFp16Workload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefConvertFp32ToFp16Workload::Execute(std::vector<ITensorHandle*> inputs,
+                                           std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvertFp32ToFp16Workload_Execute");
 
-    const float* const input = GetInputTensorDataFloat(0, m_Data);
-    Half*  const output = GetOutputTensorDataHalf(0, m_Data);
+    const float* const input = reinterpret_cast<const float*>(inputs[0]->Map());
+    Half*  const output = reinterpret_cast<Half*>(outputs[0]->Map());
 
     // convert Fp32 input to Fp16 output
-    unsigned int numElements = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements();
+    unsigned int numElements = GetTensorInfo(inputs[0]).GetNumElements();
     armnnUtils::FloatingPointConverter::ConvertFloat32To16(input, numElements, output);
 }
 
diff --git a/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.hpp b/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.hpp
index e1fd875..f1daa54 100644
--- a/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.hpp
+++ b/src/backends/reference/workloads/RefConvertFp32ToFp16Workload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>::Float32ToFloat16Workload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefConvolution2dWorkload.cpp
index dad9936..6d0ab41 100644
--- a/src/backends/reference/workloads/RefConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefConvolution2dWorkload.cpp
@@ -30,24 +30,26 @@
     }
 }
 
-void RefConvolution2dWorkload::PostAllocationConfigure()
+void RefConvolution2dWorkload::Execute() const
 {
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    m_InputShape = inputInfo.GetShape();
-    m_InputDecoder = MakeDecoder<float>(inputInfo);
-
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-    m_OutputShape = outputInfo.GetShape();
-    m_OutputEncoder = MakeEncoder<float>(outputInfo);
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
 }
 
-void RefConvolution2dWorkload::Execute() const {
+void RefConvolution2dWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefConvolution2dWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvolution2dWorkload_Execute");
 
-    m_InputDecoder->Reset(m_Data.m_Inputs[0]->Map());
-    m_OutputEncoder->Reset(m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Decoder<float>> inputDecoder = MakeDecoder<float>(GetTensorInfo(inputs[0]), inputs[0]->Map());
+    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(outputs[0]), outputs[0]->Map());
 
-    Convolve(m_InputShape, *m_InputDecoder, m_OutputShape, *m_OutputEncoder, m_FilterShape,
+    const TensorShape& inputShape = GetTensorInfo(inputs[0]).GetShape();
+    const TensorShape& outputShape = GetTensorInfo(outputs[0]).GetShape();
+
+    Convolve(inputShape, *inputDecoder, outputShape, *outputEncoder, m_FilterShape,
              *m_FilterDecoder, m_Data.m_Parameters.m_BiasEnabled, m_BiasDecoder.get(),
              m_Data.m_Parameters.m_DataLayout, m_Data.m_Parameters.m_PadTop, m_Data.m_Parameters.m_PadLeft,
              m_Data.m_Parameters.m_StrideX, m_Data.m_Parameters.m_StrideY,
diff --git a/src/backends/reference/workloads/RefConvolution2dWorkload.hpp b/src/backends/reference/workloads/RefConvolution2dWorkload.hpp
index b6bdf23..57df3ce6 100644
--- a/src/backends/reference/workloads/RefConvolution2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefConvolution2dWorkload.hpp
@@ -19,21 +19,18 @@
     explicit RefConvolution2dWorkload(const Convolution2dQueueDescriptor& descriptor,
                                       const WorkloadInfo& info);
 
-    void PostAllocationConfigure() override;
 
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
 private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
     std::unique_ptr<ScopedCpuTensorHandle> m_Weight;
     std::unique_ptr<ScopedCpuTensorHandle> m_Bias;
 
-    std::unique_ptr<Decoder<float>> m_InputDecoder;
-    std::unique_ptr<Encoder<float>> m_OutputEncoder;
     std::unique_ptr<Decoder<float>> m_FilterDecoder;
     std::unique_ptr<Decoder<float>> m_BiasDecoder;
 
-    TensorShape m_InputShape;
-    TensorShape m_OutputShape;
     TensorShape m_FilterShape;
 };
 
diff --git a/src/backends/reference/workloads/RefDebugWorkload.cpp b/src/backends/reference/workloads/RefDebugWorkload.cpp
index f9950c8..b0e19c5 100644
--- a/src/backends/reference/workloads/RefDebugWorkload.cpp
+++ b/src/backends/reference/workloads/RefDebugWorkload.cpp
@@ -17,18 +17,30 @@
 template<armnn::DataType DataType>
 void RefDebugWorkload<DataType>::Execute() const
 {
+    Execute(m_Data.m_Inputs);
+}
+
+template<armnn::DataType DataType>
+void RefDebugWorkload<DataType>::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs);
+}
+
+template<armnn::DataType DataType>
+void RefDebugWorkload<DataType>::Execute(std::vector<ITensorHandle*> inputs) const
+{
     using T = ResolveType<DataType>;
 
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, GetName() + "_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
 
     const T* inputData = GetInputTensorData<T>(0, m_Data);
     T* outputData = GetOutputTensorData<T>(0, m_Data);
 
     if (m_Callback)
     {
-        m_Callback(m_Data.m_Guid, m_Data.m_SlotIndex, m_Data.m_Inputs[0]);
+        m_Callback(m_Data.m_Guid, m_Data.m_SlotIndex, inputs[0]);
     }
     else
     {
diff --git a/src/backends/reference/workloads/RefDebugWorkload.hpp b/src/backends/reference/workloads/RefDebugWorkload.hpp
index d7e3cd9..d0c47dd 100644
--- a/src/backends/reference/workloads/RefDebugWorkload.hpp
+++ b/src/backends/reference/workloads/RefDebugWorkload.hpp
@@ -30,10 +30,12 @@
     using TypedWorkload<DebugQueueDescriptor, DataType>::TypedWorkload;
 
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
     void RegisterDebugCallback(const DebugCallbackFunction& func) override;
 
 private:
+    void Execute(std::vector<ITensorHandle*> inputs) const;
     DebugCallbackFunction m_Callback;
 };
 
diff --git a/src/backends/reference/workloads/RefDepthToSpaceWorkload.cpp b/src/backends/reference/workloads/RefDepthToSpaceWorkload.cpp
index 93c1120..22e35f0 100644
--- a/src/backends/reference/workloads/RefDepthToSpaceWorkload.cpp
+++ b/src/backends/reference/workloads/RefDepthToSpaceWorkload.cpp
@@ -13,14 +13,24 @@
 
 void RefDepthToSpaceWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefDepthToSpaceWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefDepthToSpaceWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDepthToSpaceWorkload_Execute");
 
-    const TensorInfo inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
+    const TensorInfo inputInfo = GetTensorInfo(inputs[0]);
 
     DepthToSpace(inputInfo,
                  m_Data.m_Parameters,
-                 m_Data.m_Inputs[0]->Map(),
-                 m_Data.m_Outputs[0]->Map(),
+                 inputs[0]->Map(),
+                 outputs[0]->Map(),
                  GetDataTypeSize(inputInfo.GetDataType()));
 }
 
diff --git a/src/backends/reference/workloads/RefDepthToSpaceWorkload.hpp b/src/backends/reference/workloads/RefDepthToSpaceWorkload.hpp
index a30fadc..ec260a9 100644
--- a/src/backends/reference/workloads/RefDepthToSpaceWorkload.hpp
+++ b/src/backends/reference/workloads/RefDepthToSpaceWorkload.hpp
@@ -14,7 +14,10 @@
 {
 public:
     using BaseWorkload<DepthToSpaceQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
index cfc81ce..8fe5dec 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
@@ -32,26 +32,29 @@
     }
 }
 
-void RefDepthwiseConvolution2dWorkload::PostAllocationConfigure()
+void RefDepthwiseConvolution2dWorkload::Execute() const
 {
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    m_InputShape = inputInfo.GetShape();
-    m_InputDecoder = MakeDecoder<float>(inputInfo);
-
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-    m_OutputShape = outputInfo.GetShape();
-    m_OutputEncoder = MakeEncoder<float>(outputInfo);
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
 }
 
-void RefDepthwiseConvolution2dWorkload::Execute() const
+void RefDepthwiseConvolution2dWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefDepthwiseConvolution2dWorkload::Execute(std::vector<ITensorHandle*> inputs,
+                                                std::vector<ITensorHandle*> outputs) const
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDepthwiseConvolution2dWorkload_Execute");
     std::unique_ptr<Decoder<float>> pBiasDecoder{};
 
-    m_InputDecoder->Reset(m_Data.m_Inputs[0]->Map());
-    m_OutputEncoder->Reset(m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Decoder<float>> inputDecoder = MakeDecoder<float>(GetTensorInfo(inputs[0]), inputs[0]->Map());
+    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(outputs[0]), outputs[0]->Map());
 
-    Convolve(m_InputShape, *m_InputDecoder, m_OutputShape, *m_OutputEncoder,
+    const TensorShape& inputShape = GetTensorInfo(inputs[0]).GetShape();
+    const TensorShape& outputShape = GetTensorInfo(outputs[0]).GetShape();
+
+    Convolve(inputShape, *inputDecoder, outputShape, *outputEncoder,
              m_FilterShape, *m_FilterDecoder, m_Data.m_Parameters.m_BiasEnabled, m_BiasDecoder.get(),
              m_Data.m_Parameters.m_DataLayout, m_Data.m_Parameters.m_PadTop, m_Data.m_Parameters.m_PadLeft,
              m_Data.m_Parameters.m_StrideX, m_Data.m_Parameters.m_StrideY,
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp
index 6d7037f..65a8fd7 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.hpp
@@ -17,22 +17,19 @@
     explicit RefDepthwiseConvolution2dWorkload(const DepthwiseConvolution2dQueueDescriptor &descriptor,
                                                const WorkloadInfo &info);
 
-    void PostAllocationConfigure() override;
 
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
 private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 
     std::unique_ptr <ScopedCpuTensorHandle> m_Weight;
     std::unique_ptr <ScopedCpuTensorHandle> m_Bias;
 
-    std::unique_ptr <Decoder<float>> m_InputDecoder;
-    std::unique_ptr <Encoder<float>> m_OutputEncoder;
     std::unique_ptr <Decoder<float>> m_FilterDecoder;
     std::unique_ptr <Decoder<float>> m_BiasDecoder;
 
-    TensorShape m_InputShape;
-    TensorShape m_OutputShape;
     TensorShape m_FilterShape;
 };
 
diff --git a/src/backends/reference/workloads/RefDequantizeWorkload.cpp b/src/backends/reference/workloads/RefDequantizeWorkload.cpp
index d6e4964..f9d8007 100644
--- a/src/backends/reference/workloads/RefDequantizeWorkload.cpp
+++ b/src/backends/reference/workloads/RefDequantizeWorkload.cpp
@@ -14,13 +14,23 @@
 
 void RefDequantizeWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefDequantizeWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefDequantizeWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDequantizeWorkload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
-    auto inputDecoder  = MakeDecoder<float>(inputInfo,  m_Data.m_Inputs[0]->Map());
-    auto outputEncoder = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    auto inputDecoder  = MakeDecoder<float>(inputInfo,  inputs[0]->Map());
+    auto outputEncoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
 
     Dequantize(*inputDecoder, *outputEncoder, inputInfo, outputInfo);
 }
diff --git a/src/backends/reference/workloads/RefDequantizeWorkload.hpp b/src/backends/reference/workloads/RefDequantizeWorkload.hpp
index 691f713..922d57c 100644
--- a/src/backends/reference/workloads/RefDequantizeWorkload.hpp
+++ b/src/backends/reference/workloads/RefDequantizeWorkload.hpp
@@ -17,6 +17,9 @@
     using BaseWorkload<DequantizeQueueDescriptor>::BaseWorkload;
 
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
index b9817ba..25c326a 100644
--- a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
+++ b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
@@ -20,19 +20,30 @@
 
 void RefDetectionPostProcessWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefDetectionPostProcessWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefDetectionPostProcessWorkload::Execute(std::vector<ITensorHandle*> inputs,
+                                              std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDetectionPostProcessWorkload_Execute");
 
-    const TensorInfo& boxEncodingsInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& scoresInfo       = GetTensorInfo(m_Data.m_Inputs[1]);
+    const TensorInfo& boxEncodingsInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& scoresInfo       = GetTensorInfo(inputs[1]);
     const TensorInfo& anchorsInfo      = m_Anchors->GetTensorInfo();
 
-    const TensorInfo& detectionBoxesInfo   = GetTensorInfo(m_Data.m_Outputs[0]);
-    const TensorInfo& detectionClassesInfo = GetTensorInfo(m_Data.m_Outputs[1]);
-    const TensorInfo& detectionScoresInfo  = GetTensorInfo(m_Data.m_Outputs[2]);
-    const TensorInfo& numDetectionsInfo    = GetTensorInfo(m_Data.m_Outputs[3]);
+    const TensorInfo& detectionBoxesInfo   = GetTensorInfo(outputs[0]);
+    const TensorInfo& detectionClassesInfo = GetTensorInfo(outputs[1]);
+    const TensorInfo& detectionScoresInfo  = GetTensorInfo(outputs[2]);
+    const TensorInfo& numDetectionsInfo    = GetTensorInfo(outputs[3]);
 
-    auto boxEncodings = MakeDecoder<float>(boxEncodingsInfo, m_Data.m_Inputs[0]->Map());
-    auto scores       = MakeDecoder<float>(scoresInfo, m_Data.m_Inputs[1]->Map());
+    auto boxEncodings = MakeDecoder<float>(boxEncodingsInfo, inputs[0]->Map());
+    auto scores       = MakeDecoder<float>(scoresInfo, inputs[1]->Map());
     auto anchors      = MakeDecoder<float>(anchorsInfo, m_Anchors->Map(false));
 
     float* detectionBoxes   = GetOutputTensorData<float>(0, m_Data);
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp
index 799d0c6..007dcea 100644
--- a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp
+++ b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.hpp
@@ -16,9 +16,11 @@
 public:
     explicit RefDetectionPostProcessWorkload(const DetectionPostProcessQueueDescriptor& descriptor,
                                              const WorkloadInfo& info);
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
 private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
     std::unique_ptr<ScopedCpuTensorHandle> m_Anchors;
 };
 
diff --git a/src/backends/reference/workloads/RefElementwiseUnaryWorkload.cpp b/src/backends/reference/workloads/RefElementwiseUnaryWorkload.cpp
index 4fbb0d1..b442f25 100644
--- a/src/backends/reference/workloads/RefElementwiseUnaryWorkload.cpp
+++ b/src/backends/reference/workloads/RefElementwiseUnaryWorkload.cpp
@@ -28,28 +28,29 @@
     : BaseWorkload<ElementwiseUnaryQueueDescriptor>(desc, info)
 {}
 
-void RefElementwiseUnaryWorkload::PostAllocationConfigure()
+void RefElementwiseUnaryWorkload::Execute() const
 {
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-
-    m_Input = MakeDecoder<InType>(inputInfo);
-
-    m_Output = MakeEncoder<OutType>(outputInfo);
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
 }
 
-void RefElementwiseUnaryWorkload::Execute() const
+void RefElementwiseUnaryWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefElementwiseUnaryWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefElementwiseUnaryWorkload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
     const TensorShape& inShape = inputInfo.GetShape();
     const TensorShape& outShape = outputInfo.GetShape();
 
-    m_Input->Reset(m_Data.m_Inputs[0]->Map());
-    m_Output->Reset(m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Decoder<InType>>  input = MakeDecoder<InType>(inputInfo, inputs[0]->Map());
+    std::unique_ptr<Encoder<OutType>> output= MakeEncoder<OutType>(outputInfo, outputs[0]->Map());
 
     using AbsFunction   = ElementwiseUnaryFunction<abs<InType>>;
     using ExpFunction   = ElementwiseUnaryFunction<exp<InType>>;
@@ -61,27 +62,27 @@
     {
         case UnaryOperation::Abs:
         {
-            AbsFunction(inShape, outShape, *m_Input, *m_Output);
+            AbsFunction(inShape, outShape, *input, *output);
             break;
         }
         case UnaryOperation::Exp:
         {
-            ExpFunction(inShape, outShape, *m_Input, *m_Output);
+            ExpFunction(inShape, outShape, *input, *output);
             break;
         }
         case UnaryOperation::Neg:
         {
-            NegFunction(inShape, outShape, *m_Input, *m_Output);
+            NegFunction(inShape, outShape, *input, *output);
             break;
         }
         case UnaryOperation::Rsqrt:
         {
-            RsqrtFunction(inShape, outShape, *m_Input, *m_Output);
+            RsqrtFunction(inShape, outShape, *input, *output);
             break;
         }
         case UnaryOperation::Sqrt:
         {
-            SqrtFunction(inShape, outShape, *m_Input, *m_Output);
+            SqrtFunction(inShape, outShape, *input, *output);
             break;
         }
         default:
diff --git a/src/backends/reference/workloads/RefElementwiseUnaryWorkload.hpp b/src/backends/reference/workloads/RefElementwiseUnaryWorkload.hpp
index efb2865..d05347b 100644
--- a/src/backends/reference/workloads/RefElementwiseUnaryWorkload.hpp
+++ b/src/backends/reference/workloads/RefElementwiseUnaryWorkload.hpp
@@ -19,15 +19,13 @@
     using BaseWorkload<ElementwiseUnaryQueueDescriptor>::m_Data;
 
     RefElementwiseUnaryWorkload(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info);
-    void PostAllocationConfigure() override;
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
 private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
     using InType  = float;
     using OutType = float;
-
-    std::unique_ptr<Decoder<InType>>  m_Input;
-    std::unique_ptr<Encoder<OutType>> m_Output;
 };
 
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefElementwiseWorkload.cpp b/src/backends/reference/workloads/RefElementwiseWorkload.cpp
index 60acbd6..dd7d325 100644
--- a/src/backends/reference/workloads/RefElementwiseWorkload.cpp
+++ b/src/backends/reference/workloads/RefElementwiseWorkload.cpp
@@ -26,39 +26,41 @@
 }
 
 template <typename Functor, typename ParentDescriptor, typename armnn::StringMapping::Id DebugString>
-void RefElementwiseWorkload<Functor, ParentDescriptor, DebugString>::PostAllocationConfigure()
+void RefElementwiseWorkload<Functor, ParentDescriptor, DebugString>::Execute() const
 {
-    const TensorInfo& inputInfo0 = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& inputInfo1 = GetTensorInfo(m_Data.m_Inputs[1]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-
-    m_Input0 = MakeDecoder<InType>(inputInfo0);
-    m_Input1 = MakeDecoder<InType>(inputInfo1);
-    m_Output = MakeEncoder<OutType>(outputInfo);
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
 }
 
 template <typename Functor, typename ParentDescriptor, typename armnn::StringMapping::Id DebugString>
-void RefElementwiseWorkload<Functor, ParentDescriptor, DebugString>::Execute() const
+void RefElementwiseWorkload<Functor, ParentDescriptor, DebugString>::ExecuteAsync(
+        WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+template <typename Functor, typename ParentDescriptor, typename armnn::StringMapping::Id DebugString>
+void RefElementwiseWorkload<Functor, ParentDescriptor, DebugString>::Execute(
+        std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, StringMapping::Instance().Get(DebugString));
-    const TensorInfo& inputInfo0 = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& inputInfo1 = GetTensorInfo(m_Data.m_Inputs[1]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo0 = GetTensorInfo(inputs[0]);
+    const TensorInfo& inputInfo1 = GetTensorInfo(inputs[1]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
     const TensorShape& inShape0 = inputInfo0.GetShape();
     const TensorShape& inShape1 = inputInfo1.GetShape();
     const TensorShape& outShape = outputInfo.GetShape();
 
-    m_Input0->Reset(m_Data.m_Inputs[0]->Map());
-    m_Input1->Reset(m_Data.m_Inputs[1]->Map());
-    m_Output->Reset(m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Decoder<InType>> input0 = MakeDecoder<InType>(inputInfo0, inputs[0]->Map());
+    std::unique_ptr<Decoder<InType>> input1 = MakeDecoder<InType>(inputInfo1, inputs[1]->Map());
+    std::unique_ptr<Encoder<OutType>> output= MakeEncoder<OutType>(outputInfo, outputs[0]->Map());
 
     ElementwiseBinaryFunction<Functor>(inShape0,
                                        inShape1,
                                        outShape,
-                                       *m_Input0,
-                                       *m_Input1,
-                                       *m_Output);
+                                       *input0,
+                                       *input1,
+                                       *output);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefElementwiseWorkload.hpp b/src/backends/reference/workloads/RefElementwiseWorkload.hpp
index 03683b1..4dc4b5b 100644
--- a/src/backends/reference/workloads/RefElementwiseWorkload.hpp
+++ b/src/backends/reference/workloads/RefElementwiseWorkload.hpp
@@ -26,13 +26,11 @@
     using BaseWorkload<ParentDescriptor>::m_Data;
 
     RefElementwiseWorkload(const ParentDescriptor& descriptor, const WorkloadInfo& info);
-    void PostAllocationConfigure() override;
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
 private:
-    std::unique_ptr<Decoder<InType>> m_Input0;
-    std::unique_ptr<Decoder<InType>> m_Input1;
-    std::unique_ptr<Encoder<OutType>> m_Output;
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 template <typename DataType = float>
diff --git a/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.cpp b/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.cpp
index cf355d3..b30811b 100644
--- a/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.cpp
+++ b/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.cpp
@@ -28,12 +28,23 @@
 
 void RefFakeQuantizationFloat32Workload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefFakeQuantizationFloat32Workload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefFakeQuantizationFloat32Workload::Execute(std::vector<ITensorHandle*> inputs,
+                                                 std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefFakeQuantizationFloat32Workload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
 
-    const float* inputData = GetInputTensorDataFloat(0, m_Data);
-    float* outputData = GetOutputTensorDataFloat(0, m_Data);
+    const float* inputData = reinterpret_cast<const float*>(inputs[0]->Map());
+    float* outputData = reinterpret_cast<float*>(outputs[0]->Map());
     FakeQuantization(inputData, outputData, inputInfo.GetNumElements(),
                      m_Data.m_Parameters.m_Min,
                      m_Data.m_Parameters.m_Max);
diff --git a/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.hpp b/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.hpp
index 269ca08..8f6cabb 100644
--- a/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.hpp
+++ b/src/backends/reference/workloads/RefFakeQuantizationFloat32Workload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using Float32Workload<FakeQuantizationQueueDescriptor>::Float32Workload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefFillWorkload.cpp b/src/backends/reference/workloads/RefFillWorkload.cpp
index 991ab45..ea1ca87 100644
--- a/src/backends/reference/workloads/RefFillWorkload.cpp
+++ b/src/backends/reference/workloads/RefFillWorkload.cpp
@@ -16,11 +16,21 @@
 
 void RefFillWorkload::Execute() const
 {
+    Execute(m_Data.m_Outputs);
+}
+
+void RefFillWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Outputs);
+}
+
+void RefFillWorkload::Execute(std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefFillWorkload_Execute");
 
-    const TensorInfo &outputTensorInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo &outputTensorInfo = GetTensorInfo(outputs[0]);
 
-    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputTensorInfo, m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputTensorInfo, outputs[0]->Map());
     Encoder<float> &encoder = *encoderPtr;
 
     Fill(encoder, outputTensorInfo.GetShape(), m_Data.m_Parameters.m_Value);
diff --git a/src/backends/reference/workloads/RefFillWorkload.hpp b/src/backends/reference/workloads/RefFillWorkload.hpp
index 9be773c..e92514d 100644
--- a/src/backends/reference/workloads/RefFillWorkload.hpp
+++ b/src/backends/reference/workloads/RefFillWorkload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using BaseWorkload<FillQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefFloorWorkload.cpp b/src/backends/reference/workloads/RefFloorWorkload.cpp
index 0c61386..e7bd50d 100644
--- a/src/backends/reference/workloads/RefFloorWorkload.cpp
+++ b/src/backends/reference/workloads/RefFloorWorkload.cpp
@@ -15,17 +15,27 @@
 
 void RefFloorWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefFloorWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefFloorWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefFloorFloat32Workload_Execute");
 
-    const TensorInfo &inputTensorInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputTensorInfo, m_Data.m_Inputs[0]->Map());
+    const TensorInfo &inputTensorInfo = GetTensorInfo(inputs[0]);
+    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputTensorInfo, inputs[0]->Map());
     Decoder<float> &decoder = *decoderPtr;
 
-    const TensorInfo &outputTensorInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputTensorInfo, m_Data.m_Outputs[0]->Map());
+    const TensorInfo &outputTensorInfo = GetTensorInfo(outputs[0]);
+    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputTensorInfo, outputs[0]->Map());
     Encoder<float> &encoder = *encoderPtr;
 
-    unsigned int numElements = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements();
+    unsigned int numElements = GetTensorInfo(inputs[0]).GetNumElements();
 
     for (unsigned int i = 0; i < numElements; ++i)
     {
diff --git a/src/backends/reference/workloads/RefFloorWorkload.hpp b/src/backends/reference/workloads/RefFloorWorkload.hpp
index 5636402..28b2695 100644
--- a/src/backends/reference/workloads/RefFloorWorkload.hpp
+++ b/src/backends/reference/workloads/RefFloorWorkload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using BaseWorkload<FloorQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
index 49e105f..deb56d4 100644
--- a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
+++ b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
@@ -34,28 +34,32 @@
 
 void RefFullyConnectedWorkload::PostAllocationConfigure()
 {
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
+    PostAllocationConfigure(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefFullyConnectedWorkload::PostAllocationConfigure(std::vector<ITensorHandle*> inputs,
+                                                        std::vector<ITensorHandle*> outputs)
+{
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
     ARMNN_ASSERT(inputInfo.GetNumDimensions() > 1);
     m_InputShape = inputInfo.GetShape();
-    m_InputDecoder = MakeDecoder<float>(inputInfo);
 
     if (!m_Data.m_Parameters.m_ConstantWeights)
     {
-        const TensorInfo& rWeightInfo = GetTensorInfo(m_Data.m_Inputs[1]);
+        const TensorInfo& rWeightInfo = GetTensorInfo(inputs[1]);
         ARMNN_ASSERT(inputInfo.GetNumDimensions() > 1);
         m_WeightShape = rWeightInfo.GetShape();
         m_WeightDecoder = MakeDecoder<float>(rWeightInfo);
 
         if (m_Data.m_Parameters.m_BiasEnabled)
         {
-            const TensorInfo& biasInfo = GetTensorInfo(m_Data.m_Inputs[2]);
+            const TensorInfo& biasInfo = GetTensorInfo(inputs[2]);
             m_BiasDecoder = MakeDecoder<float>(biasInfo);
         }
     }
 
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
     m_OutputShape = outputInfo.GetShape();
-    m_OutputEncoder = MakeEncoder<float>(outputInfo);
 
     m_NumActivations = 1; // Total number of activations in the input.
     for (unsigned int i = 1; i < inputInfo.GetNumDimensions(); i++)
@@ -66,23 +70,36 @@
 
 void RefFullyConnectedWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefFullyConnectedWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    PostAllocationConfigure(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefFullyConnectedWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefFullyConnectedWorkload_Execute");
 
-    m_InputDecoder->Reset(m_Data.m_Inputs[0]->Map());
+    std::unique_ptr<Decoder<float>> inputDecoder = MakeDecoder<float>(GetTensorInfo(inputs[0]), inputs[0]->Map());
+    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(outputs[0]), outputs[0]->Map());
+
     if (!m_Data.m_Parameters.m_ConstantWeights)
     {
-        m_WeightDecoder->Reset(m_Data.m_Inputs[1]->Map());
+        m_WeightDecoder->Reset(inputs[1]->Map());
         if (m_Data.m_Parameters.m_BiasEnabled)
         {
-            m_BiasDecoder->Reset(m_Data.m_Inputs[2]->Map());
+            m_BiasDecoder->Reset(inputs[2]->Map());
         }
     }
-    m_OutputEncoder->Reset(m_Data.m_Outputs[0]->Map());
 
     FullyConnected(m_InputShape,
-                   *m_InputDecoder,
+                   *inputDecoder,
                    m_OutputShape,
-                   *m_OutputEncoder,
+                   *outputEncoder,
                    m_WeightShape,
                    *m_WeightDecoder,
                    *m_BiasDecoder,
diff --git a/src/backends/reference/workloads/RefFullyConnectedWorkload.hpp b/src/backends/reference/workloads/RefFullyConnectedWorkload.hpp
index a8f0756..5c0f67e 100644
--- a/src/backends/reference/workloads/RefFullyConnectedWorkload.hpp
+++ b/src/backends/reference/workloads/RefFullyConnectedWorkload.hpp
@@ -23,14 +23,15 @@
 
     void PostAllocationConfigure() override;
 
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 
 private:
+    void PostAllocationConfigure(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs);
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
     std::unique_ptr<ScopedCpuTensorHandle> m_Weight;
     std::unique_ptr<ScopedCpuTensorHandle> m_Bias;
 
-    std::unique_ptr<Decoder<float>> m_InputDecoder;
-    std::unique_ptr<Encoder<float>> m_OutputEncoder;
     std::unique_ptr<Decoder<float>> m_WeightDecoder;
     std::unique_ptr<Decoder<float>> m_BiasDecoder;
 
diff --git a/src/backends/reference/workloads/RefGatherWorkload.cpp b/src/backends/reference/workloads/RefGatherWorkload.cpp
index eaeed61..020c067 100644
--- a/src/backends/reference/workloads/RefGatherWorkload.cpp
+++ b/src/backends/reference/workloads/RefGatherWorkload.cpp
@@ -15,18 +15,28 @@
 
 void RefGatherWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefGatherWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefGatherWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefGatherWorkload_Execute");
 
-    const TensorInfo& inputInfo0 = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& inputInfo1 = GetTensorInfo(m_Data.m_Inputs[1]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo0 = GetTensorInfo(inputs[0]);
+    const TensorInfo& inputInfo1 = GetTensorInfo(inputs[1]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
-    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputInfo0, m_Data.m_Inputs[0]->Map());
+    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputInfo0, inputs[0]->Map());
     Decoder<float>& decoder = *decoderPtr;
 
     const int32_t* indicesData = GetInputTensorData<int32_t>(1, m_Data);
 
-    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputInfo, outputs[0]->Map());
     Encoder<float>& encoder = *encoderPtr;
 
     Gather(inputInfo0, inputInfo1, outputInfo, decoder, indicesData, encoder, m_Data.m_Parameters.m_Axis);
diff --git a/src/backends/reference/workloads/RefGatherWorkload.hpp b/src/backends/reference/workloads/RefGatherWorkload.hpp
index 30019a8..1664e16 100644
--- a/src/backends/reference/workloads/RefGatherWorkload.hpp
+++ b/src/backends/reference/workloads/RefGatherWorkload.hpp
@@ -21,6 +21,9 @@
 public:
     using BaseWorkload<GatherQueueDescriptor>::BaseWorkload;
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp
index 150f0cb..daee97a 100644
--- a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp
@@ -20,12 +20,23 @@
 
 void RefInstanceNormalizationWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefInstanceNormalizationWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefInstanceNormalizationWorkload::Execute(std::vector<ITensorHandle*> inputs,
+                                               std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefInstanceNormalizationWorkload_Execute");
 
-    std::unique_ptr<Decoder<float>> inputDecoder  = MakeDecoder<float>(GetTensorInfo(m_Data.m_Inputs[0]),
-                                                                       m_Data.m_Inputs[0]->Map());
-    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(m_Data.m_Outputs[0]),
-                                                                       m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Decoder<float>> inputDecoder  = MakeDecoder<float>(GetTensorInfo(inputs[0]),
+                                                                       inputs[0]->Map());
+    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(outputs[0]),
+                                                                       outputs[0]->Map());
 
     InstanceNorm(m_Data, *inputDecoder, *outputEncoder);
 }
diff --git a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.hpp b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.hpp
index 620779f..e366ddb 100644
--- a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.hpp
+++ b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.hpp
@@ -16,7 +16,10 @@
 public:
     explicit RefInstanceNormalizationWorkload(const InstanceNormalizationQueueDescriptor& descriptor,
                                               const WorkloadInfo& info);
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefL2NormalizationWorkload.cpp b/src/backends/reference/workloads/RefL2NormalizationWorkload.cpp
index f80901e..ca31503 100644
--- a/src/backends/reference/workloads/RefL2NormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefL2NormalizationWorkload.cpp
@@ -26,13 +26,23 @@
 
 void RefL2NormalizationWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefL2NormalizationWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefL2NormalizationWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefL2NormalizationWorkload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
-    auto inputDecoder  = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map());
-    auto outputEncoder = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    auto inputDecoder  = MakeDecoder<float>(inputInfo, inputs[0]->Map());
+    auto outputEncoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
 
     DataLayoutIndexed dataLayout(m_Data.m_Parameters.m_DataLayout);
 
diff --git a/src/backends/reference/workloads/RefL2NormalizationWorkload.hpp b/src/backends/reference/workloads/RefL2NormalizationWorkload.hpp
index 4beedc9..c17767b 100644
--- a/src/backends/reference/workloads/RefL2NormalizationWorkload.hpp
+++ b/src/backends/reference/workloads/RefL2NormalizationWorkload.hpp
@@ -18,6 +18,9 @@
                                         const WorkloadInfo& info);
 
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefLogSoftmaxWorkload.cpp b/src/backends/reference/workloads/RefLogSoftmaxWorkload.cpp
index a2ace13..ebe1b1e 100644
--- a/src/backends/reference/workloads/RefLogSoftmaxWorkload.cpp
+++ b/src/backends/reference/workloads/RefLogSoftmaxWorkload.cpp
@@ -19,13 +19,23 @@
 
 void RefLogSoftmaxWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefLogSoftmaxWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefLogSoftmaxWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefLogSoftmaxWorkload_Execute");
 
-    const TensorInfo& inputInfo  = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo  = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
-    std::unique_ptr<Decoder<float>> decoder = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map());
-    std::unique_ptr<Encoder<float>> encoder = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Decoder<float>> decoder = MakeDecoder<float>(inputInfo, inputs[0]->Map());
+    std::unique_ptr<Encoder<float>> encoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
 
     ARMNN_ASSERT(decoder != nullptr);
     ARMNN_ASSERT(encoder != nullptr);
diff --git a/src/backends/reference/workloads/RefLogSoftmaxWorkload.hpp b/src/backends/reference/workloads/RefLogSoftmaxWorkload.hpp
index f5048d9..c5d5d5b 100644
--- a/src/backends/reference/workloads/RefLogSoftmaxWorkload.hpp
+++ b/src/backends/reference/workloads/RefLogSoftmaxWorkload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using BaseWorkload<LogSoftmaxQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefLogicalBinaryWorkload.cpp b/src/backends/reference/workloads/RefLogicalBinaryWorkload.cpp
index 1b4e8f9..f187e0c 100644
--- a/src/backends/reference/workloads/RefLogicalBinaryWorkload.cpp
+++ b/src/backends/reference/workloads/RefLogicalBinaryWorkload.cpp
@@ -22,32 +22,31 @@
     : BaseWorkload<LogicalBinaryQueueDescriptor>(desc, info)
 {}
 
-void RefLogicalBinaryWorkload::PostAllocationConfigure()
+void RefLogicalBinaryWorkload::Execute() const
 {
-    const TensorInfo& inputInfo0 = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& inputInfo1 = GetTensorInfo(m_Data.m_Inputs[1]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-
-    m_Input0 = MakeDecoder<InType>(inputInfo0);
-    m_Input1 = MakeDecoder<InType>(inputInfo1);
-    m_Output = MakeEncoder<OutType>(outputInfo);
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
 }
 
-void RefLogicalBinaryWorkload::Execute() const
+void RefLogicalBinaryWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefLogicalBinaryWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefLogicalBinaryWorkload_Execute");
 
-    const TensorInfo& inputInfo0 = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& inputInfo1 = GetTensorInfo(m_Data.m_Inputs[1]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo0 = GetTensorInfo(inputs[0]);
+    const TensorInfo& inputInfo1 = GetTensorInfo(inputs[1]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
     const TensorShape& inShape0 = inputInfo0.GetShape();
     const TensorShape& inShape1 = inputInfo1.GetShape();
     const TensorShape& outShape = outputInfo.GetShape();
 
-    m_Input0->Reset(m_Data.m_Inputs[0]->Map());
-    m_Input1->Reset(m_Data.m_Inputs[1]->Map());
-    m_Output->Reset(m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Decoder<InType>>  input0 = MakeDecoder<InType>(inputInfo0, inputs[0]->Map());
+    std::unique_ptr<Decoder<InType>>  input1 = MakeDecoder<InType>(inputInfo1, inputs[1]->Map());
+    std::unique_ptr<Encoder<OutType>> output = MakeEncoder<OutType>(outputInfo, outputs[0]->Map());
 
     using AndFunction = LogicalBinaryFunction<std::logical_and<bool>>;
     using OrFunction  = LogicalBinaryFunction<std::logical_or<bool>>;
@@ -56,12 +55,12 @@
     {
         case LogicalBinaryOperation::LogicalAnd:
         {
-            AndFunction(inShape0, inShape1, outShape, *m_Input0, *m_Input1, *m_Output);
+            AndFunction(inShape0, inShape1, outShape, *input0, *input1, *output);
             break;
         }
         case LogicalBinaryOperation::LogicalOr:
         {
-            OrFunction(inShape0, inShape1, outShape, *m_Input0, *m_Input1, *m_Output);
+            OrFunction(inShape0, inShape1, outShape, *input0, *input1, *output);
             break;
         }
         default:
diff --git a/src/backends/reference/workloads/RefLogicalBinaryWorkload.hpp b/src/backends/reference/workloads/RefLogicalBinaryWorkload.hpp
index 4d6baf5..d79a303 100644
--- a/src/backends/reference/workloads/RefLogicalBinaryWorkload.hpp
+++ b/src/backends/reference/workloads/RefLogicalBinaryWorkload.hpp
@@ -19,16 +19,13 @@
     using BaseWorkload<LogicalBinaryQueueDescriptor>::m_Data;
 
     RefLogicalBinaryWorkload(const LogicalBinaryQueueDescriptor& descriptor, const WorkloadInfo& info);
-    void PostAllocationConfigure() override;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 
 private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
     using InType  = bool;
     using OutType = bool;
-
-    std::unique_ptr<Decoder<InType>>  m_Input0;
-    std::unique_ptr<Decoder<InType>>  m_Input1;
-    std::unique_ptr<Encoder<OutType>> m_Output;
 };
 
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefLogicalUnaryWorkload.cpp b/src/backends/reference/workloads/RefLogicalUnaryWorkload.cpp
index 76eb5ac..bef2bdc 100644
--- a/src/backends/reference/workloads/RefLogicalUnaryWorkload.cpp
+++ b/src/backends/reference/workloads/RefLogicalUnaryWorkload.cpp
@@ -22,27 +22,28 @@
     : BaseWorkload<ElementwiseUnaryQueueDescriptor>(desc, info)
 {}
 
-void RefLogicalUnaryWorkload::PostAllocationConfigure()
+void RefLogicalUnaryWorkload::Execute() const
 {
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-
-    m_Input = MakeDecoder<InType>(inputInfo);
-    m_Output = MakeEncoder<OutType>(outputInfo);
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
 }
 
-void RefLogicalUnaryWorkload::Execute() const
+void RefLogicalUnaryWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefLogicalUnaryWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefLogicalUnaryWorkload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
     const TensorShape& inShape = inputInfo.GetShape();
     const TensorShape& outShape = outputInfo.GetShape();
 
-    m_Input->Reset(m_Data.m_Inputs[0]->Map());
-    m_Output->Reset(m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Decoder<InType>>  input = MakeDecoder<InType>(inputInfo, inputs[0]->Map());
+    std::unique_ptr<Encoder<OutType>> output = MakeEncoder<OutType>(outputInfo, outputs[0]->Map());
 
     using NotFunction = LogicalUnaryFunction<std::logical_not<bool>>;
 
@@ -50,7 +51,7 @@
     {
         case UnaryOperation::LogicalNot:
         {
-            NotFunction(inShape, outShape, *m_Input, *m_Output);
+            NotFunction(inShape, outShape, *input, *output);
             break;
         }
         default:
diff --git a/src/backends/reference/workloads/RefLogicalUnaryWorkload.hpp b/src/backends/reference/workloads/RefLogicalUnaryWorkload.hpp
index 0d8b354..117f168 100644
--- a/src/backends/reference/workloads/RefLogicalUnaryWorkload.hpp
+++ b/src/backends/reference/workloads/RefLogicalUnaryWorkload.hpp
@@ -19,15 +19,13 @@
     using BaseWorkload<ElementwiseUnaryQueueDescriptor>::m_Data;
 
     RefLogicalUnaryWorkload(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info);
-    void PostAllocationConfigure() override;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 
 private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
     using InType  = bool;
     using OutType = bool;
-
-    std::unique_ptr<Decoder<InType>>  m_Input;
-    std::unique_ptr<Encoder<OutType>> m_Output;
 };
 
 } // namespace armnn
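Because the per-call decoders and encoders above replace the members that PostAllocationConfigure() used to cache, a workload no longer needs to mutate member state while executing. A hedged caller-side sketch (inputHandle/outputHandle are assumed ITensorHandle* owned by the caller; WorkingMemDescriptor is taken to expose only the m_Inputs/m_Outputs vectors used in these hunks): each thread supplies its own descriptor rather than relying on m_Data.

    // Hypothetical usage; handle names are assumptions, not part of this change.
    WorkingMemDescriptor workingMem;
    workingMem.m_Inputs  = { inputHandle };
    workingMem.m_Outputs = { outputHandle };
    workload->ExecuteAsync(workingMem);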
diff --git a/src/backends/reference/workloads/RefLstmWorkload.cpp b/src/backends/reference/workloads/RefLstmWorkload.cpp
index 7c37301..0942354 100644
--- a/src/backends/reference/workloads/RefLstmWorkload.cpp
+++ b/src/backends/reference/workloads/RefLstmWorkload.cpp
@@ -40,25 +40,35 @@
 
 void RefLstmWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefLstmWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefLstmWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     // This is a porting of the LSTM::Eval() method in the Android code base
     // Refer to: android/frameworks/ml/nn/common/operations/LSTM.cpp
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
     const TensorShape& inputShape = inputInfo.GetShape();
     const DataType& outputType = outputInfo.GetDataType();
 
-    std::unique_ptr<Encoder<float>> outputStateOut = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[1]->Map());
-    std::unique_ptr<Encoder<float>> cellStateOut   = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[2]->Map());
-    std::unique_ptr<Encoder<float>> output         = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[3]->Map());
+    std::unique_ptr<Encoder<float>> outputStateOut = MakeEncoder<float>(outputInfo, outputs[1]->Map());
+    std::unique_ptr<Encoder<float>> cellStateOut   = MakeEncoder<float>(outputInfo, outputs[2]->Map());
+    std::unique_ptr<Encoder<float>> output         = MakeEncoder<float>(outputInfo, outputs[3]->Map());
 
-    std::unique_ptr<Decoder<float>> cellStateOutDecoder = MakeDecoder<float>(outputInfo, m_Data.m_Outputs[2]->Map());
-    std::unique_ptr<Decoder<float>> outputDecoder       = MakeDecoder<float>(outputInfo, m_Data.m_Outputs[3]->Map());
+    std::unique_ptr<Decoder<float>> cellStateOutDecoder = MakeDecoder<float>(outputInfo, outputs[2]->Map());
+    std::unique_ptr<Decoder<float>> outputDecoder       = MakeDecoder<float>(outputInfo, outputs[3]->Map());
 
-    std::unique_ptr<Decoder<float>> inputData     = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map());
-    std::unique_ptr<Decoder<float>> outputStateIn = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[1]->Map());
-    std::unique_ptr<Decoder<float>> cellStateIn   = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[2]->Map());
+    std::unique_ptr<Decoder<float>> inputData     = MakeDecoder<float>(inputInfo, inputs[0]->Map());
+    std::unique_ptr<Decoder<float>> outputStateIn = MakeDecoder<float>(inputInfo, inputs[1]->Map());
+    std::unique_ptr<Decoder<float>> cellStateIn   = MakeDecoder<float>(inputInfo, inputs[2]->Map());
 
     const uint32_t nBatch = inputShape[0];
     const uint32_t nInput = inputShape[1];
@@ -71,19 +81,19 @@
     const bool useLayerNorm = m_Data.m_Parameters.m_LayerNormEnabled;
 
     // Index the scratch buffers pointers to the global scratch buffer.
-    std::unique_ptr<Encoder<float>> inputGateScratch  = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
-    std::unique_ptr<Encoder<float>> cellScratch       = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
-    std::unique_ptr<Encoder<float>> forgetGateScratch = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
-    std::unique_ptr<Encoder<float>> outputGateScratch = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Encoder<float>> inputGateScratch  = MakeEncoder<float>(outputInfo, outputs[0]->Map());
+    std::unique_ptr<Encoder<float>> cellScratch       = MakeEncoder<float>(outputInfo, outputs[0]->Map());
+    std::unique_ptr<Encoder<float>> forgetGateScratch = MakeEncoder<float>(outputInfo, outputs[0]->Map());
+    std::unique_ptr<Encoder<float>> outputGateScratch = MakeEncoder<float>(outputInfo, outputs[0]->Map());
 
     std::unique_ptr<Decoder<float>> inputGateScratchDecoder =
-        MakeDecoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+        MakeDecoder<float>(outputInfo, outputs[0]->Map());
     std::unique_ptr<Decoder<float>> cellScratchDecoder =
-        MakeDecoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+        MakeDecoder<float>(outputInfo, outputs[0]->Map());
     std::unique_ptr<Decoder<float>> forgetGateScratchDecoder =
-        MakeDecoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+        MakeDecoder<float>(outputInfo, outputs[0]->Map());
     std::unique_ptr<Decoder<float>> outputGateScratchDecoder =
-        MakeDecoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+        MakeDecoder<float>(outputInfo, outputs[0]->Map());
 
     if (useCifg)
     {
diff --git a/src/backends/reference/workloads/RefLstmWorkload.hpp b/src/backends/reference/workloads/RefLstmWorkload.hpp
index ce5a775..b55a1f9 100644
--- a/src/backends/reference/workloads/RefLstmWorkload.hpp
+++ b/src/backends/reference/workloads/RefLstmWorkload.hpp
@@ -18,9 +18,11 @@
 public:
     explicit RefLstmWorkload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
 
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 
 private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
     std::unique_ptr<ScopedCpuTensorHandle> m_InputToInputWeightsTensor;
     std::unique_ptr<ScopedCpuTensorHandle> m_InputToForgetWeightsTensor;
     std::unique_ptr<ScopedCpuTensorHandle> m_InputToCellWeightsTensor;
diff --git a/src/backends/reference/workloads/RefMeanWorkload.cpp b/src/backends/reference/workloads/RefMeanWorkload.cpp
index 00e59bc..7941ce2 100644
--- a/src/backends/reference/workloads/RefMeanWorkload.cpp
+++ b/src/backends/reference/workloads/RefMeanWorkload.cpp
@@ -20,13 +20,23 @@
 
 void RefMeanWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefMeanWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefMeanWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefMeanWorkload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
-    auto inputDecoder  = MakeDecoder<float>(inputInfo,  m_Data.m_Inputs[0]->Map());
-    auto outputEncoder = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    auto inputDecoder  = MakeDecoder<float>(inputInfo,  inputs[0]->Map());
+    auto outputEncoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
 
     Reduce(inputInfo,
            outputInfo,
diff --git a/src/backends/reference/workloads/RefMeanWorkload.hpp b/src/backends/reference/workloads/RefMeanWorkload.hpp
index c673f94..b5a9ed8 100644
--- a/src/backends/reference/workloads/RefMeanWorkload.hpp
+++ b/src/backends/reference/workloads/RefMeanWorkload.hpp
@@ -18,7 +18,10 @@
 {
 public:
     explicit RefMeanWorkload (const MeanQueueDescriptor& descriptor, const WorkloadInfo& info);
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefNormalizationWorkload.cpp b/src/backends/reference/workloads/RefNormalizationWorkload.cpp
index d5d2104..36828ac 100644
--- a/src/backends/reference/workloads/RefNormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefNormalizationWorkload.cpp
@@ -163,12 +163,22 @@
 
 void RefNormalizationWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefNormalizationWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefNormalizationWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefNormalizationWorkload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
 
-    auto inputDecoder  = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map());
-    auto outputEncoder = MakeEncoder<float>(inputInfo, m_Data.m_Outputs[0]->Map());
+    auto inputDecoder  = MakeDecoder<float>(inputInfo, inputs[0]->Map());
+    auto outputEncoder = MakeEncoder<float>(inputInfo, outputs[0]->Map());
 
     if (NormalizationAlgorithmMethod::LocalBrightness == m_Data.m_Parameters.m_NormMethodType)
     {
diff --git a/src/backends/reference/workloads/RefNormalizationWorkload.hpp b/src/backends/reference/workloads/RefNormalizationWorkload.hpp
index 9d68ffd..59170b8 100644
--- a/src/backends/reference/workloads/RefNormalizationWorkload.hpp
+++ b/src/backends/reference/workloads/RefNormalizationWorkload.hpp
@@ -17,7 +17,10 @@
     explicit RefNormalizationWorkload(const NormalizationQueueDescriptor& descriptor,
                                       const WorkloadInfo& info);
 
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefPadWorkload.cpp b/src/backends/reference/workloads/RefPadWorkload.cpp
index af22c31..ea515ca 100644
--- a/src/backends/reference/workloads/RefPadWorkload.cpp
+++ b/src/backends/reference/workloads/RefPadWorkload.cpp
@@ -14,10 +14,20 @@
 
 void RefPadWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefPadWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefPadWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefPadWorkload_Execute");
 
-    const TensorInfo& inputInfo  = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo  = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
     armnn::Pad(inputInfo,
                outputInfo,
diff --git a/src/backends/reference/workloads/RefPadWorkload.hpp b/src/backends/reference/workloads/RefPadWorkload.hpp
index 0b8379a..afc6203 100644
--- a/src/backends/reference/workloads/RefPadWorkload.hpp
+++ b/src/backends/reference/workloads/RefPadWorkload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using BaseWorkload<PadQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefPermuteWorkload.cpp b/src/backends/reference/workloads/RefPermuteWorkload.cpp
index 1fb1421..f6af208 100644
--- a/src/backends/reference/workloads/RefPermuteWorkload.cpp
+++ b/src/backends/reference/workloads/RefPermuteWorkload.cpp
@@ -16,12 +16,25 @@
 template <armnn::DataType DataType>
 void RefPermuteWorkload<DataType>::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+template <armnn::DataType DataType>
+void RefPermuteWorkload<DataType>::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+template <armnn::DataType DataType>
+void RefPermuteWorkload<DataType>::Execute(std::vector<ITensorHandle*> inputs,
+                                           std::vector<ITensorHandle*> outputs) const
+{
     using T = ResolveType<DataType>;
 
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, GetName() + "_Execute");
 
-    const ITensorHandle*     src      = m_Data.m_Inputs[0];
-    ITensorHandle*           dst      = m_Data.m_Outputs[0];
+    const ITensorHandle*     src      = inputs[0];
+    ITensorHandle*           dst      = outputs[0];
     const PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
 
     armnnUtils::Permute(GetTensorInfo(dst).GetShape(), mappings,
diff --git a/src/backends/reference/workloads/RefPermuteWorkload.hpp b/src/backends/reference/workloads/RefPermuteWorkload.hpp
index 62a1456..94f6334 100644
--- a/src/backends/reference/workloads/RefPermuteWorkload.hpp
+++ b/src/backends/reference/workloads/RefPermuteWorkload.hpp
@@ -25,6 +25,9 @@
     using TypedWorkload<PermuteQueueDescriptor, DataType>::m_Data;
     using TypedWorkload<PermuteQueueDescriptor, DataType>::TypedWorkload;
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 using RefPermuteBFloat16Workload = RefPermuteWorkload<DataType::BFloat16>;
diff --git a/src/backends/reference/workloads/RefPooling2dWorkload.cpp b/src/backends/reference/workloads/RefPooling2dWorkload.cpp
index 40b8147..d337278 100644
--- a/src/backends/reference/workloads/RefPooling2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefPooling2dWorkload.cpp
@@ -15,13 +15,23 @@
 {
 void RefPooling2dWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefPooling2dWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefPooling2dWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefPooling2dWorkload_Execute");
 
-    const TensorInfo& inputInfo  = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo  = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
-    auto inputDecoder  = MakeDecoder<float>(inputInfo,  m_Data.m_Inputs[0] ->Map());
-    auto outputEncoder = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    auto inputDecoder  = MakeDecoder<float>(inputInfo,  inputs[0] ->Map());
+    auto outputEncoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
 
     Pooling2d(*inputDecoder,
               *outputEncoder,
diff --git a/src/backends/reference/workloads/RefPooling2dWorkload.hpp b/src/backends/reference/workloads/RefPooling2dWorkload.hpp
index 24386b7..3495d6b 100644
--- a/src/backends/reference/workloads/RefPooling2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefPooling2dWorkload.hpp
@@ -18,6 +18,9 @@
 public:
     using BaseWorkload<Pooling2dQueueDescriptor>::BaseWorkload;
 
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefPreluWorkload.cpp b/src/backends/reference/workloads/RefPreluWorkload.cpp
index cdc0a63..b298874 100644
--- a/src/backends/reference/workloads/RefPreluWorkload.cpp
+++ b/src/backends/reference/workloads/RefPreluWorkload.cpp
@@ -20,14 +20,24 @@
 
 void RefPreluWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefPreluWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefPreluWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefPreluWorkload_Execute");
 
-    std::unique_ptr<Decoder<float>> inputDecoder = MakeDecoder<float>(GetTensorInfo(m_Data.m_Inputs[0]),
-                                                                      m_Data.m_Inputs[0]->Map());
-    std::unique_ptr<Decoder<float>> alphaDecoder = MakeDecoder<float>(GetTensorInfo(m_Data.m_Inputs[1]),
-                                                                      m_Data.m_Inputs[1]->Map());
-    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(m_Data.m_Outputs[0]),
-                                                                       m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Decoder<float>> inputDecoder = MakeDecoder<float>(GetTensorInfo(inputs[0]),
+                                                                      inputs[0]->Map());
+    std::unique_ptr<Decoder<float>> alphaDecoder = MakeDecoder<float>(GetTensorInfo(inputs[1]),
+                                                                      inputs[1]->Map());
+    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(outputs[0]),
+                                                                       outputs[0]->Map());
 
     PreluImpl(m_Data, *inputDecoder, *alphaDecoder, *outputEncoder);
 }
diff --git a/src/backends/reference/workloads/RefPreluWorkload.hpp b/src/backends/reference/workloads/RefPreluWorkload.hpp
index 72839e6..4fe5704 100644
--- a/src/backends/reference/workloads/RefPreluWorkload.hpp
+++ b/src/backends/reference/workloads/RefPreluWorkload.hpp
@@ -16,7 +16,10 @@
 public:
     explicit RefPreluWorkload(const PreluQueueDescriptor& descriptor,
                               const WorkloadInfo& info);
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefQLstmWorkload.cpp b/src/backends/reference/workloads/RefQLstmWorkload.cpp
index bcd6a62..7b7961c 100644
--- a/src/backends/reference/workloads/RefQLstmWorkload.cpp
+++ b/src/backends/reference/workloads/RefQLstmWorkload.cpp
@@ -45,19 +45,30 @@
 
 void RefQLstmWorkload::Execute() const
 {
-    // This is a porting of the QLSTM::Execute() method in the Android code base
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefQLstmWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefQLstmWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
+    // This is a porting of the QLSTM::Execute() method
+    // in the Android code base
     // Note: this implementation wraps the arithmetic functions of the LSTM cell in Quantize/Dequantize ops, so all
     // computation is done in the floating point domain. Arithmetic functions are found in LstmUtils.cpp.
     // Refer to: android/frameworks/ml/nn/common/operations/QLSTM.cpp
     const DataType& internalType = armnn::DataType::QSymmS16;
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputStateInInfo = GetTensorInfo(m_Data.m_Inputs[1]);
-    const TensorInfo& cellStateInInfo = GetTensorInfo(m_Data.m_Inputs[2]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputStateInInfo = GetTensorInfo(inputs[1]);
+    const TensorInfo& cellStateInInfo = GetTensorInfo(inputs[2]);
 
-    const TensorInfo& outputStateOutInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-    const TensorInfo& cellStateOutInfo = GetTensorInfo(m_Data.m_Outputs[1]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[2]);
+    const TensorInfo& outputStateOutInfo = GetTensorInfo(outputs[0]);
+    const TensorInfo& cellStateOutInfo = GetTensorInfo(outputs[1]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[2]);
 
     const TensorShape& inputShape = inputInfo.GetShape();
     const TensorShape& outputStateInShape = outputStateInInfo.GetShape();
@@ -77,27 +88,27 @@
 
     // Input decoders
     std::unique_ptr<Decoder<float>> inputDecoder =
-            MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map());
+            MakeDecoder<float>(inputInfo, inputs[0]->Map());
     std::unique_ptr<Decoder<float>> outputStateInDecoder =
-            MakeDecoder<float>(outputStateInInfo, m_Data.m_Inputs[1]->Map());
+            MakeDecoder<float>(outputStateInInfo, inputs[1]->Map());
     std::unique_ptr<Decoder<float>> cellStateInDecoder =
-            MakeDecoder<float>(cellStateInInfo, m_Data.m_Inputs[2]->Map());
+            MakeDecoder<float>(cellStateInInfo, inputs[2]->Map());
 
     // Output decoders
     std::unique_ptr<Decoder<float>> outputStateOutDecoder =
-            MakeDecoder<float>(outputStateOutInfo, m_Data.m_Outputs[0]->Map());
+            MakeDecoder<float>(outputStateOutInfo, outputs[0]->Map());
     std::unique_ptr<Decoder<float>> cellStateOutDecoder =
-            MakeDecoder<float>(cellStateOutInfo, m_Data.m_Outputs[1]->Map());
+            MakeDecoder<float>(cellStateOutInfo, outputs[1]->Map());
     std::unique_ptr<Decoder<float>> outputDecoder =
-            MakeDecoder<float>(outputInfo, m_Data.m_Outputs[2]->Map());
+            MakeDecoder<float>(outputInfo, outputs[2]->Map());
 
     // Output encoders
     std::unique_ptr<Encoder<float>> outputStateOutEncoder =
-            MakeEncoder<float>(outputStateOutInfo, m_Data.m_Outputs[0]->Map());
+            MakeEncoder<float>(outputStateOutInfo, outputs[0]->Map());
     std::unique_ptr<Encoder<float>> cellStateOutEncoder =
-            MakeEncoder<float>(cellStateOutInfo, m_Data.m_Outputs[1]->Map());
+            MakeEncoder<float>(cellStateOutInfo, outputs[1]->Map());
     std::unique_ptr<Encoder<float>> outputEncoder =
-            MakeEncoder<float>(outputInfo, m_Data.m_Outputs[2]->Map());
+            MakeEncoder<float>(outputInfo, outputs[2]->Map());
 
     // Weights decoders
     std::unique_ptr<Decoder<float>> inputToForgetWeightsDecoder = MakeDecoder<float>(
diff --git a/src/backends/reference/workloads/RefQLstmWorkload.hpp b/src/backends/reference/workloads/RefQLstmWorkload.hpp
index 19d3a2a..f4242ec 100644
--- a/src/backends/reference/workloads/RefQLstmWorkload.hpp
+++ b/src/backends/reference/workloads/RefQLstmWorkload.hpp
@@ -18,9 +18,11 @@
 public:
     explicit RefQLstmWorkload(const QLstmQueueDescriptor& descriptor, const WorkloadInfo& info);
 
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 
 private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
     std::unique_ptr<ScopedCpuTensorHandle> m_InputToInputWeightsTensor;
     std::unique_ptr<ScopedCpuTensorHandle> m_InputToForgetWeightsTensor;
     std::unique_ptr<ScopedCpuTensorHandle> m_InputToCellWeightsTensor;
diff --git a/src/backends/reference/workloads/RefQuantizeWorkload.cpp b/src/backends/reference/workloads/RefQuantizeWorkload.cpp
index 2eef5f3..35791e6 100644
--- a/src/backends/reference/workloads/RefQuantizeWorkload.cpp
+++ b/src/backends/reference/workloads/RefQuantizeWorkload.cpp
@@ -34,21 +34,22 @@
 {
 }
 
-void RefQuantizeWorkload::PostAllocationConfigure()
-{
-    const TensorInfo& inputInfo = armnn::GetTensorInfo(m_Data.m_Inputs[0]);
-    m_InputDecoder = MakeDecoder<float>(inputInfo);
-
-    const TensorInfo& outputInfo = armnn::GetTensorInfo(m_Data.m_Outputs[0]);
-    m_OutputEncoder = MakeEncoder<float>(outputInfo);
-}
-
 void RefQuantizeWorkload::Execute() const
 {
-    m_InputDecoder->Reset(m_Data.m_Inputs[0]->Map());
-    m_OutputEncoder->Reset(m_Data.m_Outputs[0]->Map());
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
 
-    QuantizeImpl(*m_InputDecoder, *m_OutputEncoder, m_NumElements);
+void RefQuantizeWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefQuantizeWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
+    std::unique_ptr<Decoder<float>> inputDecoder = MakeDecoder<float>(GetTensorInfo(inputs[0]), inputs[0]->Map());
+    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(outputs[0]), outputs[0]->Map());
+
+    QuantizeImpl(*inputDecoder, *outputEncoder, m_NumElements);
 }
 
 } //namespace armnn
\ No newline at end of file
diff --git a/src/backends/reference/workloads/RefQuantizeWorkload.hpp b/src/backends/reference/workloads/RefQuantizeWorkload.hpp
index 9ae1076..48116e7 100644
--- a/src/backends/reference/workloads/RefQuantizeWorkload.hpp
+++ b/src/backends/reference/workloads/RefQuantizeWorkload.hpp
@@ -16,13 +16,11 @@
 {
 public:
     RefQuantizeWorkload(const QuantizeQueueDescriptor& descriptor, const WorkloadInfo &info);
-    void PostAllocationConfigure() override;
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
 
 private:
-
-    std::unique_ptr<Decoder<float>> m_InputDecoder;
-    std::unique_ptr<Encoder<float>> m_OutputEncoder;
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 
     size_t m_NumElements;
 };
diff --git a/src/backends/reference/workloads/RefRankWorkload.hpp b/src/backends/reference/workloads/RefRankWorkload.hpp
index 660db6b..237ae99 100644
--- a/src/backends/reference/workloads/RefRankWorkload.hpp
+++ b/src/backends/reference/workloads/RefRankWorkload.hpp
@@ -19,10 +19,21 @@
     using BaseWorkload<RankQueueDescriptor>::BaseWorkload;
     virtual void Execute() const override
     {
-        const int32_t rank = static_cast<int32_t>(GetTensorInfo(m_Data.m_Inputs[0]).GetNumDimensions());
+        Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+    }
+
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override
+    {
+        Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+    }
+
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+    {
+        const int32_t rank = static_cast<int32_t>(GetTensorInfo(inputs[0]).GetNumDimensions());
 
         std::memcpy(GetOutputTensorData<void>(0, m_Data), &rank, sizeof(int32_t));
-        m_Data.m_Outputs[0]->Unmap();
+        outputs[0]->Unmap();
     }
 };
 
diff --git a/src/backends/reference/workloads/RefReduceWorkload.cpp b/src/backends/reference/workloads/RefReduceWorkload.cpp
index 7a46ff9..821e828 100644
--- a/src/backends/reference/workloads/RefReduceWorkload.cpp
+++ b/src/backends/reference/workloads/RefReduceWorkload.cpp
@@ -20,15 +20,25 @@
 
 void RefReduceWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefReduceWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefReduceWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefReduceWorkload_Execute");
 
-    const TensorInfo& inputInfo  = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo  = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
-    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map());
+    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputInfo, inputs[0]->Map());
     Decoder<float>& decoder = *decoderPtr;
 
-    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputInfo, outputs[0]->Map());
     Encoder<float>& encoder = *encoderPtr;
 
     Reduce(inputInfo,
diff --git a/src/backends/reference/workloads/RefReduceWorkload.hpp b/src/backends/reference/workloads/RefReduceWorkload.hpp
index 1d551ac..d45161c 100644
--- a/src/backends/reference/workloads/RefReduceWorkload.hpp
+++ b/src/backends/reference/workloads/RefReduceWorkload.hpp
@@ -17,7 +17,10 @@
     explicit RefReduceWorkload(const ReduceQueueDescriptor& descriptor,
                                const WorkloadInfo& info);
 
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefReshapeWorkload.cpp b/src/backends/reference/workloads/RefReshapeWorkload.cpp
index 6d29781..960d591 100644
--- a/src/backends/reference/workloads/RefReshapeWorkload.cpp
+++ b/src/backends/reference/workloads/RefReshapeWorkload.cpp
@@ -14,11 +14,21 @@
 
 void RefReshapeWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefReshapeWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefReshapeWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefReshapeWorkload_Execute");
 
-    void* output = GetOutputTensorData<void>(0, m_Data);
-    const void* input = GetInputTensorData<void>(0, m_Data);
-    unsigned int numBytes = GetTensorInfo(m_Data.m_Inputs[0]).GetNumBytes();
+    void* output = outputs[0]->Map();
+    const void* input = inputs[0]->Map();
+    unsigned int numBytes = GetTensorInfo(inputs[0]).GetNumBytes();
     memcpy(output, input, numBytes);
 }
 
diff --git a/src/backends/reference/workloads/RefReshapeWorkload.hpp b/src/backends/reference/workloads/RefReshapeWorkload.hpp
index 7359ff9..2b6cf43 100644
--- a/src/backends/reference/workloads/RefReshapeWorkload.hpp
+++ b/src/backends/reference/workloads/RefReshapeWorkload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using BaseWorkload<ReshapeQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefResizeBilinearWorkload.cpp b/src/backends/reference/workloads/RefResizeBilinearWorkload.cpp
index a23caf9..2cf5888 100644
--- a/src/backends/reference/workloads/RefResizeBilinearWorkload.cpp
+++ b/src/backends/reference/workloads/RefResizeBilinearWorkload.cpp
@@ -19,14 +19,24 @@
 
 void RefResizeBilinearWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefResizeBilinearWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefResizeBilinearWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefResizeBilinearWorkload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
-    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map());
+    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputInfo, inputs[0]->Map());
     Decoder<float> &decoder = *decoderPtr;
-    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputInfo, outputs[0]->Map());
     Encoder<float> &encoder = *encoderPtr;
 
     Resize(decoder, inputInfo, encoder, outputInfo, m_Data.m_Parameters.m_DataLayout, armnn::ResizeMethod::Bilinear);
diff --git a/src/backends/reference/workloads/RefResizeBilinearWorkload.hpp b/src/backends/reference/workloads/RefResizeBilinearWorkload.hpp
index a0e33fa..5ada3d1 100644
--- a/src/backends/reference/workloads/RefResizeBilinearWorkload.hpp
+++ b/src/backends/reference/workloads/RefResizeBilinearWorkload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using BaseWorkload<ResizeBilinearQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefResizeWorkload.cpp b/src/backends/reference/workloads/RefResizeWorkload.cpp
index 21ff852..d7a82b8 100644
--- a/src/backends/reference/workloads/RefResizeWorkload.cpp
+++ b/src/backends/reference/workloads/RefResizeWorkload.cpp
@@ -19,14 +19,24 @@
 
 void RefResizeWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefResizeWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefResizeWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefResizeWorkload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
-    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map());
+    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputInfo, inputs[0]->Map());
     Decoder<float> &decoder = *decoderPtr;
-    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputInfo, outputs[0]->Map());
     Encoder<float> &encoder = *encoderPtr;
 
     Resize(decoder,
diff --git a/src/backends/reference/workloads/RefResizeWorkload.hpp b/src/backends/reference/workloads/RefResizeWorkload.hpp
index e72271a..f58eadc 100644
--- a/src/backends/reference/workloads/RefResizeWorkload.hpp
+++ b/src/backends/reference/workloads/RefResizeWorkload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using BaseWorkload<ResizeQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefSliceWorkload.cpp b/src/backends/reference/workloads/RefSliceWorkload.cpp
index 2e44845..f94a83e 100644
--- a/src/backends/reference/workloads/RefSliceWorkload.cpp
+++ b/src/backends/reference/workloads/RefSliceWorkload.cpp
@@ -15,14 +15,24 @@
 
 void RefSliceWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefSliceWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefSliceWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefSliceWorkload_Execute");
 
-    const TensorInfo& inputInfo  = GetTensorInfo(m_Data.m_Inputs[0]);
+    const TensorInfo& inputInfo  = GetTensorInfo(inputs[0]);
 
     Slice(inputInfo,
           m_Data.m_Parameters,
-          m_Data.m_Inputs[0]->Map(),
-          m_Data.m_Outputs[0]->Map(),
+          inputs[0]->Map(),
+          outputs[0]->Map(),
           GetDataTypeSize(inputInfo.GetDataType()));
 }
 
diff --git a/src/backends/reference/workloads/RefSliceWorkload.hpp b/src/backends/reference/workloads/RefSliceWorkload.hpp
index 006c7b7..8a1db8e 100644
--- a/src/backends/reference/workloads/RefSliceWorkload.hpp
+++ b/src/backends/reference/workloads/RefSliceWorkload.hpp
@@ -16,7 +16,10 @@
 public:
     using BaseWorkload<SliceQueueDescriptor>::BaseWorkload;
 
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefSoftmaxWorkload.cpp b/src/backends/reference/workloads/RefSoftmaxWorkload.cpp
index 2e4d811..9733cbc 100644
--- a/src/backends/reference/workloads/RefSoftmaxWorkload.cpp
+++ b/src/backends/reference/workloads/RefSoftmaxWorkload.cpp
@@ -19,16 +19,26 @@
 
 void RefSoftmaxWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefSoftmaxWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefSoftmaxWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefSoftmaxWorkload_Execute");
 
-    const TensorInfo &inputTensorInfo = GetTensorInfo(m_Data.m_Inputs[0]);
+    const TensorInfo &inputTensorInfo = GetTensorInfo(inputs[0]);
 
-    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputTensorInfo, m_Data.m_Inputs[0]->Map());
+    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputTensorInfo, inputs[0]->Map());
     Decoder<float> &decoder = *decoderPtr;
 
-    const TensorInfo &outputTensorInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo &outputTensorInfo = GetTensorInfo(outputs[0]);
 
-    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputTensorInfo, m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputTensorInfo, outputs[0]->Map());
     Encoder<float> &encoder = *encoderPtr;
 
     Softmax(decoder,
diff --git a/src/backends/reference/workloads/RefSoftmaxWorkload.hpp b/src/backends/reference/workloads/RefSoftmaxWorkload.hpp
index 3d00c6f..6e62369 100644
--- a/src/backends/reference/workloads/RefSoftmaxWorkload.hpp
+++ b/src/backends/reference/workloads/RefSoftmaxWorkload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using BaseWorkload<SoftmaxQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.cpp b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.cpp
index c65d4c1..e35632d 100644
--- a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.cpp
+++ b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.cpp
@@ -14,13 +14,23 @@
 
 void RefSpaceToBatchNdWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefSpaceToBatchNdWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefSpaceToBatchNdWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefSpaceToBatchNdWorkload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    std::unique_ptr<Decoder<float>> decoder = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map());
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    std::unique_ptr<Decoder<float>> decoder = MakeDecoder<float>(inputInfo, inputs[0]->Map());
 
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-    std::unique_ptr<Encoder<float>> encoder = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
+    std::unique_ptr<Encoder<float>> encoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
 
     SpaceToBatchNd(inputInfo, outputInfo, m_Data.m_Parameters, *decoder, *encoder);
 }
diff --git a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
index caf2648..82ddb32 100644
--- a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
+++ b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
@@ -16,6 +16,9 @@
 public:
     using BaseWorkload<SpaceToBatchNdQueueDescriptor>::BaseWorkload;
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefSpaceToDepthWorkload.cpp b/src/backends/reference/workloads/RefSpaceToDepthWorkload.cpp
index 1b12272..88faf7a 100644
--- a/src/backends/reference/workloads/RefSpaceToDepthWorkload.cpp
+++ b/src/backends/reference/workloads/RefSpaceToDepthWorkload.cpp
@@ -14,13 +14,23 @@
 
 void RefSpaceToDepthWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefSpaceToDepthWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefSpaceToDepthWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefSpaceToDepthWorkload_Execute");
 
-    const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
-    std::unique_ptr<Decoder<float>> decoder = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map());
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    std::unique_ptr<Decoder<float>> decoder = MakeDecoder<float>(inputInfo, inputs[0]->Map());
 
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-    std::unique_ptr<Encoder<float>> encoder = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
+    std::unique_ptr<Encoder<float>> encoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
 
     SpaceToDepth(inputInfo, outputInfo, m_Data.m_Parameters, *decoder, *encoder);
 }
diff --git a/src/backends/reference/workloads/RefSpaceToDepthWorkload.hpp b/src/backends/reference/workloads/RefSpaceToDepthWorkload.hpp
index 89e5585..d8f44b7 100644
--- a/src/backends/reference/workloads/RefSpaceToDepthWorkload.hpp
+++ b/src/backends/reference/workloads/RefSpaceToDepthWorkload.hpp
@@ -15,7 +15,10 @@
 {
 public:
     using BaseWorkload<SpaceToDepthQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefSplitterWorkload.cpp b/src/backends/reference/workloads/RefSplitterWorkload.cpp
index 5207423..076aefe 100644
--- a/src/backends/reference/workloads/RefSplitterWorkload.cpp
+++ b/src/backends/reference/workloads/RefSplitterWorkload.cpp
@@ -13,8 +13,18 @@
 
 void RefSplitterWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefSplitterWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefSplitterWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefSplitterWorkload_Execute");
-    Split(m_Data);
+    Split(m_Data, inputs, outputs);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefSplitterWorkload.hpp b/src/backends/reference/workloads/RefSplitterWorkload.hpp
index c491e1e..99b5ff6 100644
--- a/src/backends/reference/workloads/RefSplitterWorkload.hpp
+++ b/src/backends/reference/workloads/RefSplitterWorkload.hpp
@@ -17,7 +17,10 @@
 {
 public:
     using BaseWorkload<SplitterQueueDescriptor>::BaseWorkload;
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefStackWorkload.cpp b/src/backends/reference/workloads/RefStackWorkload.cpp
index fc85950..20cf3b3 100644
--- a/src/backends/reference/workloads/RefStackWorkload.cpp
+++ b/src/backends/reference/workloads/RefStackWorkload.cpp
@@ -20,6 +20,16 @@
 
 void RefStackWorkload::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+void RefStackWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefStackWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefStackWorkload_Execute");
 
     // Can perform a simple concatenation when axis == 0
@@ -29,7 +39,7 @@
         ARMNN_ASSERT(output != nullptr);
 
         unsigned int numInputs = m_Data.m_Parameters.m_NumInputs;
-        unsigned int inputLength = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements();
+        unsigned int inputLength = GetTensorInfo(inputs[0]).GetNumElements();
 
         for (unsigned int inputIdx=0; inputIdx<numInputs; ++inputIdx)
         {
@@ -43,13 +53,13 @@
     }
 
     std::vector<std::unique_ptr<Decoder<float>>> inputDecoders;
-    for (unsigned int i=0; i<m_Data.m_Inputs.size(); ++i)
+    for (unsigned int i=0; i<inputs.size(); ++i)
     {
-        inputDecoders.push_back(MakeDecoder<float>(GetTensorInfo(m_Data.m_Inputs[i]),
-                                                   m_Data.m_Inputs[i]->Map()));
+        inputDecoders.push_back(MakeDecoder<float>(GetTensorInfo(inputs[i]),
+                                                   inputs[i]->Map()));
     }
-    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(m_Data.m_Outputs[0]),
-                                                                       m_Data.m_Outputs[0]->Map());
+    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(outputs[0]),
+                                                                       outputs[0]->Map());
 
     Stack(m_Data, inputDecoders, *outputEncoder);
 }
diff --git a/src/backends/reference/workloads/RefStackWorkload.hpp b/src/backends/reference/workloads/RefStackWorkload.hpp
index ceb27d9..4276339 100644
--- a/src/backends/reference/workloads/RefStackWorkload.hpp
+++ b/src/backends/reference/workloads/RefStackWorkload.hpp
@@ -16,7 +16,10 @@
 public:
     explicit RefStackWorkload(const StackQueueDescriptor& descriptor,
                               const WorkloadInfo& info);
-    virtual void Execute() const override;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefStridedSliceWorkload.cpp b/src/backends/reference/workloads/RefStridedSliceWorkload.cpp
index ce807ee..336a687 100644
--- a/src/backends/reference/workloads/RefStridedSliceWorkload.cpp
+++ b/src/backends/reference/workloads/RefStridedSliceWorkload.cpp
@@ -17,30 +17,20 @@
 
 void RefStridedSliceWorkload::Execute() const
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefStridedSliceWorkload_Execute");
-
-    const TensorInfo& inputInfo  = GetTensorInfo(m_Data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
-
-    DataType inputDataType  = inputInfo.GetDataType();
-    DataType outputDataType = outputInfo.GetDataType();
-
-    ARMNN_ASSERT(inputDataType == outputDataType);
-    IgnoreUnused(outputDataType);
-
-    StridedSlice(inputInfo,
-                 m_Data.m_Parameters,
-                 m_Data.m_Inputs[0]->Map(),
-                 m_Data.m_Outputs[0]->Map(),
-                 GetDataTypeSize(inputDataType));
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
 }
 
-void RefStridedSliceWorkload::ExecuteAsync(WorkingMemDescriptor& descriptor)
+void RefStridedSliceWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefStridedSliceWorkload_Execute_WorkingMemDescriptor");
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
 
-    const TensorInfo& inputInfo  = GetTensorInfo(descriptor.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(descriptor.m_Outputs[0]);
+void RefStridedSliceWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefStridedSliceWorkload_Execute");
+
+    const TensorInfo& inputInfo  = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
 
     DataType inputDataType  = inputInfo.GetDataType();
     DataType outputDataType = outputInfo.GetDataType();
@@ -50,8 +40,8 @@
 
     StridedSlice(inputInfo,
                  m_Data.m_Parameters,
-                 descriptor.m_Inputs[0]->Map(),
-                 descriptor.m_Outputs[0]->Map(),
+                 inputs[0]->Map(),
+                 outputs[0]->Map(),
                  GetDataTypeSize(inputDataType));
 }
 
diff --git a/src/backends/reference/workloads/RefStridedSliceWorkload.hpp b/src/backends/reference/workloads/RefStridedSliceWorkload.hpp
index 3e253ed..38613e2 100644
--- a/src/backends/reference/workloads/RefStridedSliceWorkload.hpp
+++ b/src/backends/reference/workloads/RefStridedSliceWorkload.hpp
@@ -15,7 +15,9 @@
 public:
     RefStridedSliceWorkload(const StridedSliceQueueDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
-    void ExecuteAsync(WorkingMemDescriptor& descriptor) override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp
index 2ab7604..6341228 100644
--- a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp
@@ -33,35 +33,32 @@
     }
 }
 
-void RefTransposeConvolution2dWorkload::PostAllocationConfigure()
+void RefTransposeConvolution2dWorkload::Execute() const
 {
-    // set up input decoder
-    const ITensorHandle* input  = m_Data.m_Inputs[0];
-    const TensorInfo& inputInfo = GetTensorInfo(input);
-
-    m_InputShape   = inputInfo.GetShape();
-    m_InputDecoder = MakeDecoder<float>(inputInfo);
-
-    // set up output encoder
-    ITensorHandle* output        = m_Data.m_Outputs[0];
-    const TensorInfo& outputInfo = GetTensorInfo(output);
-
-    m_OutputShape   = outputInfo.GetShape();
-    m_OutputEncoder = MakeEncoder<float>(outputInfo);
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
 }
 
-void RefTransposeConvolution2dWorkload::Execute() const
+void RefTransposeConvolution2dWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+void RefTransposeConvolution2dWorkload::Execute(std::vector<ITensorHandle*> inputs,
+                                                std::vector<ITensorHandle*> outputs) const
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefTransposeConvolution2dWorkload_Execute");
 
-    m_InputDecoder->Reset(m_Data.m_Inputs[0]->Map());
-    m_OutputEncoder->Reset(m_Data.m_Outputs[0]->Map());
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
+
+    std::unique_ptr<Decoder<float>> inputDecoder = MakeDecoder<float>(inputInfo, inputs[0]->Map());
+    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
 
     TransposeConvolution2dImpl(m_Data.m_Parameters,
-                               m_InputShape,
-                               *m_InputDecoder,
-                               m_OutputShape,
-                               *m_OutputEncoder,
+                               inputInfo.GetShape(),
+                               *inputDecoder,
+                               outputInfo.GetShape(),
+                               *outputEncoder,
                                m_WeightsShape,
                                *m_WeightsDecoder,
                                m_BiasesDecoder.get());
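PostAllocationConfigure() and the cached m_InputDecoder/m_OutputEncoder go away here because state cached on the workload cannot be reset safely by concurrent ExecuteAsync() calls; the input decoder and output encoder are now built from whichever handles the current call supplies, while the weights and bias decoders stay cached since they wrap the workload's own m_Weights/m_Biases tensors. A stripped-down sketch of that distinction, using hypothetical names unrelated to Arm NN:

// Hypothetical illustration of the thread-safety reasoning: resetting a member from the
// current call's buffer races when two threads execute at once; a per-call local does not.
struct FakeDecoder { const float* m_Ptr; };

class CachedStateKernel            // the pattern this patch removes
{
public:
    void Run(const float* input) { m_Decoder.m_Ptr = input; }   // unsafe when called concurrently
private:
    FakeDecoder m_Decoder{};
};

class PerCallStateKernel           // the pattern this patch introduces
{
public:
    void Run(const float* input) const
    {
        FakeDecoder decoder{input};                             // local state, safe from any thread
    }
};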
diff --git a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.hpp b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.hpp
index 9ded8c9..7c18f10 100644
--- a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.hpp
+++ b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.hpp
@@ -21,22 +21,17 @@
                                       const WorkloadInfo& info);
     ~RefTransposeConvolution2dWorkload() = default;
 
-    void PostAllocationConfigure() override;
-
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
 
 private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
     std::unique_ptr<ScopedCpuTensorHandle> m_Weights;
     std::unique_ptr<ScopedCpuTensorHandle> m_Biases;
 
-    std::unique_ptr<Decoder<float>> m_InputDecoder;
-    std::unique_ptr<Encoder<float>> m_OutputEncoder;
-
     std::unique_ptr<Decoder<float>> m_WeightsDecoder;
     std::unique_ptr<Decoder<float>> m_BiasesDecoder;
 
-    TensorShape m_InputShape;
-    TensorShape m_OutputShape;
     TensorShape m_WeightsShape;
 };
 
diff --git a/src/backends/reference/workloads/RefTransposeWorkload.cpp b/src/backends/reference/workloads/RefTransposeWorkload.cpp
index cc7a555..828badd 100644
--- a/src/backends/reference/workloads/RefTransposeWorkload.cpp
+++ b/src/backends/reference/workloads/RefTransposeWorkload.cpp
@@ -16,12 +16,25 @@
 template <armnn::DataType DataType>
 void RefTransposeWorkload<DataType>::Execute() const
 {
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+template <armnn::DataType DataType>
+void RefTransposeWorkload<DataType>::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+template <armnn::DataType DataType>
+void RefTransposeWorkload<DataType>::Execute(std::vector<ITensorHandle*> inputs,
+                                             std::vector<ITensorHandle*> outputs) const
+{
     using T = ResolveType<DataType>;
 
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, GetName() + "_Execute");
 
-    const ITensorHandle*     src      = m_Data.m_Inputs[0];
-    ITensorHandle*           dst      = m_Data.m_Outputs[0];
+    const ITensorHandle*     src      = inputs[0];
+    ITensorHandle*           dst      = outputs[0];
     const PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
 
     armnnUtils::Transpose(GetTensorInfo(src).GetShape(), mappings, src->Map(), dst->Map(), sizeof(T));
diff --git a/src/backends/reference/workloads/RefTransposeWorkload.hpp b/src/backends/reference/workloads/RefTransposeWorkload.hpp
index 1e03f2e..08ba74f 100644
--- a/src/backends/reference/workloads/RefTransposeWorkload.hpp
+++ b/src/backends/reference/workloads/RefTransposeWorkload.hpp
@@ -25,6 +25,9 @@
     using TypedWorkload<TransposeQueueDescriptor, DataType>::m_Data;
     using TypedWorkload<TransposeQueueDescriptor, DataType>::TypedWorkload;
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor) override;
+private:
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
 
 using RefTransposeBFloat16Workload = RefTransposeWorkload<DataType::BFloat16>;
diff --git a/src/backends/reference/workloads/Splitter.cpp b/src/backends/reference/workloads/Splitter.cpp
index 09edc5e..ed6d2b8 100644
--- a/src/backends/reference/workloads/Splitter.cpp
+++ b/src/backends/reference/workloads/Splitter.cpp
@@ -18,12 +18,14 @@
 namespace armnn
 {
 
-void Split(const SplitterQueueDescriptor& data)
+void Split(const SplitterQueueDescriptor& data,
+           std::vector<ITensorHandle*> inputs,
+           std::vector<ITensorHandle*> outputs)
 {
-    const TensorInfo& inputInfo = GetTensorInfo(data.m_Inputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
 
     std::unique_ptr<Decoder<float>> decoderPtr =
-        MakeDecoder<float>(inputInfo, data.m_Inputs[0]->Map());
+        MakeDecoder<float>(inputInfo, inputs[0]->Map());
     Decoder<float>& decoder = *decoderPtr;
 
     for (unsigned int index = 0; index < inputInfo.GetNumElements(); ++index)
@@ -45,7 +47,7 @@
             SplitterQueueDescriptor::ViewOrigin const& view = data.m_ViewOrigins[viewIdx];
 
             //Split view extents are defined by the size of (the corresponding) input tensor.
-            const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[viewIdx]);
+            const TensorInfo& outputInfo = GetTensorInfo(outputs[viewIdx]);
             ARMNN_ASSERT(outputInfo.GetNumDimensions() == inputInfo.GetNumDimensions());
 
             // Check all dimensions to see if this element is inside the given input view.
@@ -65,7 +67,7 @@
             if (insideView)
             {
                 std::unique_ptr<Encoder<float>> encoderPtr =
-                    MakeEncoder<float>(outputInfo, data.m_Outputs[viewIdx]->Map());
+                    MakeEncoder<float>(outputInfo, outputs[viewIdx]->Map());
                 Encoder<float>& encoder = *encoderPtr;
 
                 unsigned int outIndex = 0;
diff --git a/src/backends/reference/workloads/Splitter.hpp b/src/backends/reference/workloads/Splitter.hpp
index aff4bca..e38a054 100644
--- a/src/backends/reference/workloads/Splitter.hpp
+++ b/src/backends/reference/workloads/Splitter.hpp
@@ -14,9 +14,11 @@
 {
 
 template <typename DataType>
-void Splitter(const SplitterQueueDescriptor& data)
+void Splitter(const SplitterQueueDescriptor& data,
+              std::vector<ITensorHandle*> inputs,
+              std::vector<ITensorHandle*> outputs)
 {
-    const TensorInfo& inputInfo0 = GetTensorInfo(data.m_Inputs[0]);
+    const TensorInfo& inputInfo0 = GetTensorInfo(inputs[0]);
 
     for (unsigned int index = 0; index < inputInfo0.GetNumElements(); ++index)
     {
@@ -37,7 +39,7 @@
             SplitterQueueDescriptor::ViewOrigin const& view = data.m_ViewOrigins[viewIdx];
 
             //Split view extents are defined by the size of (the corresponding) input tensor.
-            const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[viewIdx]);
+            const TensorInfo& outputInfo = GetTensorInfo(outputs[viewIdx]);
             ARMNN_ASSERT(outputInfo.GetNumDimensions() == inputInfo0.GetNumDimensions());
 
             // Check all dimensions to see if this element is inside the given input view.
@@ -78,5 +80,7 @@
     }
 }
 
-void Split(const SplitterQueueDescriptor& data);
+void Split(const SplitterQueueDescriptor& data,
+           std::vector<ITensorHandle*> inputs,
+           std::vector<ITensorHandle*> outputs);
 } //namespace armnn
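Split now takes the tensor handles explicitly: the descriptor still supplies the parameters and view origins, while the handle vectors come from the caller, so one implementation serves both entry points. In compact form (a sketch; MyWorkload is a placeholder, not an Arm NN class):

void MyWorkload::Execute() const
{
    Split(m_Data, m_Data.m_Inputs, m_Data.m_Outputs);                              // synchronous path
}

void MyWorkload::ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)
{
    Split(m_Data, workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);  // asynchronous path
}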