IVGCVSW-5482 'Add a ClCompileContext parameter to each ClWorkload Constructor'

* Injected a CLCompileContext object into each CL workload.

Signed-off-by: Sadik Armagan <sadik.armagan@arm.com>
Change-Id: I4837dbd3d5b56cf743b3b89c944e3cdf8b11a42a
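
The same pattern repeats through every hunk below: the factory builds one
arm_compute::CLCompileContext from the CLKernelLibrary's default context and
device, forwards it to each workload constructor, and the constructor passes
it to the ACL layer's configure() overload so kernels are compiled against
that shared context. A minimal standalone sketch of that flow, assuming an
available OpenCL device and Arm Compute Library headers (the tensor shape and
RELU activation are illustrative only, not taken from this patch):

    #include <arm_compute/core/CL/CLCompileContext.h>
    #include <arm_compute/core/CL/CLKernelLibrary.h>
    #include <arm_compute/core/TensorInfo.h>
    #include <arm_compute/core/TensorShape.h>
    #include <arm_compute/core/Types.h>
    #include <arm_compute/runtime/CL/CLScheduler.h>
    #include <arm_compute/runtime/CL/CLTensor.h>
    #include <arm_compute/runtime/CL/functions/CLActivationLayer.h>

    int main()
    {
        // Set up the default OpenCL context/device (also initialises CLKernelLibrary).
        arm_compute::CLScheduler::get().default_init();

        // Same initialisation as ClWorkloadFactory::InitializeCLCompileContext().
        auto context = arm_compute::CLKernelLibrary::get().context();
        auto device  = arm_compute::CLKernelLibrary::get().get_device();
        arm_compute::CLCompileContext clCompileContext(context, device);

        // Illustrative tensors; a ClWorkload would get these from its queue descriptor.
        arm_compute::CLTensor input;
        arm_compute::CLTensor output;
        const arm_compute::TensorInfo info(arm_compute::TensorShape(16U), 1, arm_compute::DataType::F32);
        input.allocator()->init(info);
        output.allocator()->init(info);

        // The compile-context overload is what each Cl*Workload constructor now calls.
        arm_compute::CLActivationLayer activation;
        activation.configure(clCompileContext, &input, &output,
                             arm_compute::ActivationLayerInfo(
                                 arm_compute::ActivationLayerInfo::ActivationFunction::RELU));

        input.allocator()->allocate();
        output.allocator()->allocate();
        activation.run();
        arm_compute::CLScheduler::get().sync();
        return 0;
    }
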
diff --git a/src/backends/cl/ClWorkloadFactory.cpp b/src/backends/cl/ClWorkloadFactory.cpp
index 41b779f..35186f2 100644
--- a/src/backends/cl/ClWorkloadFactory.cpp
+++ b/src/backends/cl/ClWorkloadFactory.cpp
@@ -107,9 +107,8 @@
 void ClWorkloadFactory::InitializeCLCompileContext()
 {
     // Initialize our m_CLCompileContext using default device and context
-    cl::Device device = cl::Device::getDefault();
-    cl::Context context = cl::Context(device);
-
+    auto context = arm_compute::CLKernelLibrary::get().context();
+    auto device  = arm_compute::CLKernelLibrary::get().get_device();
     m_CLCompileContext = arm_compute::CLCompileContext(context, device);
 
     if (m_ModelContextPtr)
@@ -200,64 +199,64 @@
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor,
                                                                const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClActivationWorkload>(descriptor, info);
+    return MakeWorkload<ClActivationWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor,
                                                              const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClAdditionWorkload>(descriptor, info);
+    return MakeWorkload<ClAdditionWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateArgMinMax(const ArgMinMaxQueueDescriptor& descriptor,
                                                               const WorkloadInfo& info) const
 {
-    return std::make_unique<ClArgMinMaxWorkload>(descriptor, info);
+    return std::make_unique<ClArgMinMaxWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateBatchNormalization(
     const BatchNormalizationQueueDescriptor& descriptor,
     const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClBatchNormalizationFloatWorkload, NullWorkload>(descriptor, info);
+    return MakeWorkload<ClBatchNormalizationFloatWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateBatchToSpaceNd(const BatchToSpaceNdQueueDescriptor& descriptor,
                                                                    const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClBatchToSpaceNdWorkload>(descriptor, info);
+    return MakeWorkload<ClBatchToSpaceNdWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateComparison(const ComparisonQueueDescriptor& descriptor,
                                                                const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClComparisonWorkload>(descriptor, info);
+    return MakeWorkload<ClComparisonWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConcat(const ConcatQueueDescriptor& descriptor,
                                                            const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClConcatWorkload>(descriptor, info);
+    return MakeWorkload<ClConcatWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConstant(const ConstantQueueDescriptor& descriptor,
                                                              const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClConstantWorkload>(descriptor, info);
+    return MakeWorkload<ClConstantWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp16ToFp32(
     const ConvertFp16ToFp32QueueDescriptor& descriptor,
     const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClConvertFp16ToFp32Workload>(descriptor, info);
+    return MakeWorkload<ClConvertFp16ToFp32Workload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp32ToFp16(
     const ConvertFp32ToFp16QueueDescriptor& descriptor,
     const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClConvertFp32ToFp16Workload>(descriptor, info);
+    return MakeWorkload<ClConvertFp32ToFp16Workload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvolution2d(const Convolution2dQueueDescriptor& descriptor,
@@ -278,45 +277,46 @@
     return MakeWorkload<ClConvolution2dWorkload>(descriptor,
                                                  info,
                                                  m_MemoryManager->GetIntraLayerManager(),
+                                                 m_CLCompileContext,
                                                  isFastMathEnabled);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateDebug(const DebugQueueDescriptor& descriptor,
                                                           const WorkloadInfo& info) const
 {
-    return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+    return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateDepthToSpace(const DepthToSpaceQueueDescriptor& descriptor,
                                                                  const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClDepthToSpaceWorkload>(descriptor, info);
+    return MakeWorkload<ClDepthToSpaceWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateDepthwiseConvolution2d(
     const DepthwiseConvolution2dQueueDescriptor& descriptor,
     const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClDepthwiseConvolutionWorkload>(descriptor, info);
+    return MakeWorkload<ClDepthwiseConvolutionWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateDequantize(const DequantizeQueueDescriptor& descriptor,
                                                                const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClDequantizeWorkload>(descriptor, info);
+    return MakeWorkload<ClDequantizeWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateDetectionPostProcess(
     const DetectionPostProcessQueueDescriptor& descriptor,
     const WorkloadInfo& info) const
 {
-    return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+    return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateDivision(const DivisionQueueDescriptor& descriptor,
                                                              const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClDivisionFloatWorkload, NullWorkload>(descriptor, info);
+    return MakeWorkload<ClDivisionFloatWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateElementwiseUnary(const ElementwiseUnaryQueueDescriptor& descriptor,
@@ -330,22 +330,22 @@
             absQueueDescriptor.m_Inputs  = descriptor.m_Inputs;
             absQueueDescriptor.m_Outputs = descriptor.m_Outputs;
 
-            return  std::make_unique<ClAbsWorkload>(absQueueDescriptor, info);
+            return  std::make_unique<ClAbsWorkload>(absQueueDescriptor, info, m_CLCompileContext);
         }
         case UnaryOperation::Exp:
-            return std::make_unique<ClExpWorkload>(descriptor, info);
+            return std::make_unique<ClExpWorkload>(descriptor, info, m_CLCompileContext);
         case UnaryOperation::Neg:
-            return std::make_unique<ClNegWorkload>(descriptor, info);
+            return std::make_unique<ClNegWorkload>(descriptor, info, m_CLCompileContext);
         case UnaryOperation::Rsqrt:
         {
             RsqrtQueueDescriptor rsqrtQueueDescriptor;
             rsqrtQueueDescriptor.m_Inputs  = descriptor.m_Inputs;
             rsqrtQueueDescriptor.m_Outputs = descriptor.m_Outputs;
 
-            return std::make_unique<ClRsqrtWorkload>(rsqrtQueueDescriptor, info);
+            return std::make_unique<ClRsqrtWorkload>(rsqrtQueueDescriptor, info, m_CLCompileContext);
         }
         case UnaryOperation::LogicalNot:
-            return std::make_unique<ClLogicalNotWorkload>(descriptor, info);
+            return std::make_unique<ClLogicalNotWorkload>(descriptor, info, m_CLCompileContext);
         default:
             return nullptr;
     }
@@ -365,25 +365,28 @@
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFill(const FillQueueDescriptor& descriptor,
                                                          const WorkloadInfo& info) const
 {
-    return std::make_unique<ClFillWorkload>(descriptor, info);
+    return std::make_unique<ClFillWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFloor(const FloorQueueDescriptor& descriptor,
                                                           const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClFloorFloatWorkload, NullWorkload>(descriptor, info);
+    return MakeWorkload<ClFloorFloatWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFullyConnected(const FullyConnectedQueueDescriptor& descriptor,
                                                                    const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClFullyConnectedWorkload>(descriptor, info, m_MemoryManager->GetIntraLayerManager());
+    return MakeWorkload<ClFullyConnectedWorkload>(descriptor,
+                                                  info,
+                                                  m_MemoryManager->GetIntraLayerManager(),
+                                                  m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateGather(const GatherQueueDescriptor& descriptor,
                                                            const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClGatherWorkload>(descriptor, info);
+    return MakeWorkload<ClGatherWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateGreater(const GreaterQueueDescriptor& descriptor,
@@ -407,13 +410,13 @@
     const InstanceNormalizationQueueDescriptor& descriptor,
     const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClInstanceNormalizationWorkload>(descriptor, info);
+    return MakeWorkload<ClInstanceNormalizationWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor,
                                                                     const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClL2NormalizationFloatWorkload, NullWorkload>(descriptor, info);
+    return MakeWorkload<ClL2NormalizationFloatWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateLogicalBinary(const LogicalBinaryQueueDescriptor& descriptor,
@@ -422,9 +425,9 @@
     switch(descriptor.m_Parameters.m_Operation)
     {
         case LogicalBinaryOperation::LogicalAnd:
-            return std::make_unique<ClLogicalAndWorkload>(descriptor, info);
+            return std::make_unique<ClLogicalAndWorkload>(descriptor, info, m_CLCompileContext);
         case LogicalBinaryOperation::LogicalOr:
-            return std::make_unique<ClLogicalOrWorkload>(descriptor, info);
+            return std::make_unique<ClLogicalOrWorkload>(descriptor, info, m_CLCompileContext);
         default:
             return nullptr;
     }
@@ -433,25 +436,28 @@
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateLogSoftmax(const LogSoftmaxQueueDescriptor& descriptor,
                                                                const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClLogSoftmaxWorkload>(descriptor, info, m_MemoryManager->GetIntraLayerManager());
+    return MakeWorkload<ClLogSoftmaxWorkload>(descriptor,
+                                              info,
+                                              m_MemoryManager->GetIntraLayerManager(),
+                                              m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor,
                                                          const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClLstmFloatWorkload, NullWorkload>(descriptor, info);
+    return MakeWorkload<ClLstmFloatWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateMaximum(const MaximumQueueDescriptor& descriptor,
                                                             const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClMaximumWorkload>(descriptor, info);
+    return MakeWorkload<ClMaximumWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateMean(const MeanQueueDescriptor& descriptor,
                                                          const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClMeanWorkload>(descriptor, info);
+    return MakeWorkload<ClMeanWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateMemCopy(const MemCopyQueueDescriptor& descriptor,
@@ -485,19 +491,19 @@
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateMinimum(const MinimumQueueDescriptor& descriptor,
                                                             const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClMinimumWorkload>(descriptor, info);
+    return MakeWorkload<ClMinimumWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateMultiplication(const MultiplicationQueueDescriptor& descriptor,
                                                                    const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClMultiplicationWorkload>(descriptor, info);
+    return MakeWorkload<ClMultiplicationWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateNormalization(const NormalizationQueueDescriptor& descriptor,
                                                                   const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClNormalizationFloatWorkload, NullWorkload>(descriptor, info);
+    return MakeWorkload<ClNormalizationFloatWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor,
@@ -509,61 +515,61 @@
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreatePad(const PadQueueDescriptor& descriptor,
                                                         const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClPadWorkload>(descriptor, info);
+    return MakeWorkload<ClPadWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor,
                                                             const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClPermuteWorkload>(descriptor, info);
+    return MakeWorkload<ClPermuteWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
                                                               const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClPooling2dWorkload>(descriptor, info);
+    return MakeWorkload<ClPooling2dWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreatePreCompiled(const PreCompiledQueueDescriptor& descriptor,
                                                                 const WorkloadInfo& info) const
 {
-    return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+    return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreatePrelu(const PreluQueueDescriptor &descriptor,
                                                           const WorkloadInfo &info) const
 {
-    return MakeWorkload<ClPreluWorkload>(descriptor, info);
+    return MakeWorkload<ClPreluWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateQLstm(const QLstmQueueDescriptor& descriptor,
                                                           const WorkloadInfo& info) const
 {
-    return std::make_unique<ClQLstmWorkload>(descriptor, info);
+    return std::make_unique<ClQLstmWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateQuantize(const QuantizeQueueDescriptor& descriptor,
                                                              const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClQuantizeWorkload>(descriptor, info);
+    return MakeWorkload<ClQuantizeWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateQuantizedLstm(const QuantizedLstmQueueDescriptor& descriptor,
                                                                   const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClQuantizedLstmWorkload>(descriptor, info);
+    return MakeWorkload<ClQuantizedLstmWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateReshape(const ReshapeQueueDescriptor& descriptor,
                                                             const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClReshapeWorkload>(descriptor, info);
+    return MakeWorkload<ClReshapeWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateResize(const ResizeQueueDescriptor& descriptor,
                                                            const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClResizeWorkload>(descriptor, info);
+    return MakeWorkload<ClResizeWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateResizeBilinear(const ResizeBilinearQueueDescriptor& descriptor,
@@ -595,62 +601,68 @@
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSlice(const SliceQueueDescriptor& descriptor,
                                                           const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClSliceWorkload>(descriptor, info);
+    return MakeWorkload<ClSliceWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSoftmax(const SoftmaxQueueDescriptor& descriptor,
                                                             const WorkloadInfo& info) const
 {
-    return std::make_unique<ClSoftmaxWorkload>(descriptor, info, m_MemoryManager->GetIntraLayerManager());
+    return std::make_unique<ClSoftmaxWorkload>(descriptor,
+                                               info,
+                                               m_MemoryManager->GetIntraLayerManager(),
+                                               m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSpaceToBatchNd(const SpaceToBatchNdQueueDescriptor& descriptor,
                                                                    const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClSpaceToBatchNdWorkload>(descriptor, info);
+    return MakeWorkload<ClSpaceToBatchNdWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSpaceToDepth(const SpaceToDepthQueueDescriptor& descriptor,
                                                                  const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClSpaceToDepthWorkload>(descriptor, info);
+    return MakeWorkload<ClSpaceToDepthWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor,
                                                              const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClSplitterWorkload>(descriptor, info);
+    return MakeWorkload<ClSplitterWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateStack(const StackQueueDescriptor& descriptor,
                                                           const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClStackWorkload>(descriptor, info);
+    return MakeWorkload<ClStackWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateStridedSlice(const StridedSliceQueueDescriptor& descriptor,
                                                                  const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClStridedSliceWorkload>(descriptor, info);
+    return MakeWorkload<ClStridedSliceWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSubtraction(const SubtractionQueueDescriptor& descriptor,
                                                                 const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClSubtractionWorkload>(descriptor, info);
+    return MakeWorkload<ClSubtractionWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateTranspose(const TransposeQueueDescriptor& descriptor,
                                                               const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClTransposeWorkload>(descriptor, info);
+    return MakeWorkload<ClTransposeWorkload>(descriptor, info, m_CLCompileContext);
 }
 
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateTransposeConvolution2d(
     const TransposeConvolution2dQueueDescriptor& descriptor,
     const WorkloadInfo& info) const
 {
-    return MakeWorkload<ClTransposeConvolution2dWorkload>(descriptor, info, m_MemoryManager->GetIntraLayerManager());
+    return MakeWorkload<ClTransposeConvolution2dWorkload>(descriptor,
+                                                          info,
+                                                          m_MemoryManager->GetIntraLayerManager(),
+                                                          m_CLCompileContext);
 }
 
 } // namespace armnn
diff --git a/src/backends/cl/test/ClCreateWorkloadTests.cpp b/src/backends/cl/test/ClCreateWorkloadTests.cpp
index 4bd3d3a..765409a 100644
--- a/src/backends/cl/test/ClCreateWorkloadTests.cpp
+++ b/src/backends/cl/test/ClCreateWorkloadTests.cpp
@@ -10,6 +10,8 @@
 #include <armnn/utility/IgnoreUnused.hpp>
 #include <armnn/utility/PolymorphicDowncast.hpp>
 #include <backendsCommon/MemCopyWorkload.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
 
 #include <aclCommon/test/CreateWorkloadClNeon.hpp>
 #include <aclCommon/ArmComputeTensorUtils.hpp>
@@ -334,6 +336,98 @@
     ARMNN_ASSERT(conv2dWorkload->GetConvolutionMethod() == arm_compute::ConvolutionMethod::WINOGRAD);
 }
 
+BOOST_AUTO_TEST_CASE(CreateConvolution2dClCompiledContextWorkload)
+{
+    using namespace armnn;
+
+    const DataType inputType  = DataType::QAsymmU8;
+    const DataType kernelType = DataType::QSymmS8;
+    const DataType biasType   = DataType::Signed32;
+
+    TensorInfo inputInfo ({ 1, 3, 1, 2 }, inputType, 0.5f, 128);
+    TensorInfo outputInfo({ 1, 3, 1, 3 }, inputType, 1.0f, 128);
+
+    const std::vector<float> quantScales{ 0.5f, 0.75f, 1.0f };
+    constexpr unsigned int quantDimension = 0;
+
+    TensorInfo kernelInfo({ 3, 1, 1, 2 }, kernelType, quantScales, quantDimension);
+
+    const std::vector<float> biasQuantScales{ 0.25f, 0.375f, 0.5f };
+    TensorInfo biasInfo({ 3 }, biasType, biasQuantScales, quantDimension);
+
+    std::vector<uint8_t> inputData =
+    {
+        138, 108, 138, 108, 138, 108
+    };
+
+    std::vector<int8_t> kernelData =
+    {
+        1, 2, 1, 2, 1, 2
+    };
+
+    std::vector<int32_t> biasData =
+    {
+        4, 4, 4
+    };
+
+    std::vector<uint8_t> expectedOutputData =
+    {
+        121, 118, 115, 121, 118, 115, 121, 118, 115
+    };
+
+    Convolution2dDescriptor descriptor;
+    descriptor.m_StrideX     = 1;
+    descriptor.m_StrideY     = 1;
+    descriptor.m_PadLeft     = 0;
+    descriptor.m_PadRight    = 0;
+    descriptor.m_PadTop      = 0;
+    descriptor.m_PadBottom   = 0;
+    descriptor.m_BiasEnabled = true;
+    descriptor.m_DataLayout  = DataLayout::NHWC;
+
+    auto memoryManager = ClWorkloadFactoryHelper::GetMemoryManager();
+    auto clMemoryManager = armnn::PolymorphicPointerDowncast<armnn::ClMemoryManager>(memoryManager);
+    auto tensorHandleFactory = ClWorkloadFactoryHelper::GetTensorHandleFactory(memoryManager);
+
+    std::unique_ptr<ITensorHandle> inputHandle  = tensorHandleFactory.CreateTensorHandle(inputInfo);
+    std::unique_ptr<ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputInfo);
+
+    WorkloadInfo workloadInfo;
+    ScopedCpuTensorHandle weightTensor(kernelInfo);
+    ScopedCpuTensorHandle biasTensor(biasInfo);
+
+    AllocateAndCopyDataToITensorHandle(&weightTensor, kernelData.data());
+    AllocateAndCopyDataToITensorHandle(&biasTensor, biasData.data());
+
+    Convolution2dQueueDescriptor queueDescriptor;
+    queueDescriptor.m_Parameters = descriptor;
+    queueDescriptor.m_Weight     = &weightTensor;
+    queueDescriptor.m_Bias       = &biasTensor;
+
+    AddInputToWorkload(queueDescriptor, workloadInfo, inputInfo, inputHandle.get());
+    AddOutputToWorkload(queueDescriptor, workloadInfo, outputInfo, outputHandle.get());
+
+    // Initialize our m_CLCompileContext using default device and context
+    auto context = arm_compute::CLKernelLibrary::get().context();
+    auto device  = arm_compute::CLKernelLibrary::get().get_device();
+    auto clCompileContext = arm_compute::CLCompileContext(context, device);
+
+    // Check built programs are empty in context
+    BOOST_TEST(clCompileContext.get_built_programs().empty());
+
+    auto workload = std::make_unique<ClConvolution2dWorkload>(queueDescriptor,
+                                                              workloadInfo,
+                                                              clMemoryManager->GetIntraLayerManager(),
+                                                              clCompileContext);
+    ARMNN_ASSERT(workload != nullptr);
+    // Check built programs are not empty in context
+    BOOST_TEST(!clCompileContext.get_built_programs().empty());
+}
+
 template <typename DepthwiseConvolutionWorkloadType, typename armnn::DataType DataType>
 static void ClDepthwiseConvolutionWorkloadTest(DataLayout dataLayout)
 {
diff --git a/src/backends/cl/workloads/ClAbsWorkload.cpp b/src/backends/cl/workloads/ClAbsWorkload.cpp
index 858ef5b..4682c64 100644
--- a/src/backends/cl/workloads/ClAbsWorkload.cpp
+++ b/src/backends/cl/workloads/ClAbsWorkload.cpp
@@ -24,7 +24,9 @@
     return arm_compute::CLAbsLayer::validate(&aclInput, &aclOutput);
 }
 
-ClAbsWorkload::ClAbsWorkload(const AbsQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClAbsWorkload::ClAbsWorkload(const AbsQueueDescriptor& descriptor,
+                             const WorkloadInfo& info,
+                             const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<AbsQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClAbsWorkload", 1, 1);
@@ -32,7 +34,7 @@
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_AbsLayer.configure(&input, &output);
+    m_AbsLayer.configure(clCompileContext, &input, &output);
 }
 
 void ClAbsWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClAbsWorkload.hpp b/src/backends/cl/workloads/ClAbsWorkload.hpp
index 763cafc..d0f7d16 100644
--- a/src/backends/cl/workloads/ClAbsWorkload.hpp
+++ b/src/backends/cl/workloads/ClAbsWorkload.hpp
@@ -18,7 +18,9 @@
 class ClAbsWorkload : public BaseWorkload<AbsQueueDescriptor>
 {
 public:
-    ClAbsWorkload(const AbsQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClAbsWorkload(const AbsQueueDescriptor& descriptor,
+                  const WorkloadInfo& info,
+                  const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClActivationWorkload.cpp b/src/backends/cl/workloads/ClActivationWorkload.cpp
index 6856520..8997a97 100644
--- a/src/backends/cl/workloads/ClActivationWorkload.cpp
+++ b/src/backends/cl/workloads/ClActivationWorkload.cpp
@@ -30,7 +30,8 @@
 }
 
 ClActivationWorkload::ClActivationWorkload(const ActivationQueueDescriptor& descriptor,
-                                           const WorkloadInfo& info)
+                                           const WorkloadInfo& info,
+                                           const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<ActivationQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClActivationWorkload", 1, 1);
@@ -40,7 +41,7 @@
 
     arm_compute::ICLTensor& input  = static_cast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
-    m_ActivationLayer.configure(&input, &output, activationLayerInfo);
+    m_ActivationLayer.configure(clCompileContext, &input, &output, activationLayerInfo);
 }
 
 void ClActivationWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClActivationWorkload.hpp b/src/backends/cl/workloads/ClActivationWorkload.hpp
index 3516633..6b71e86 100644
--- a/src/backends/cl/workloads/ClActivationWorkload.hpp
+++ b/src/backends/cl/workloads/ClActivationWorkload.hpp
@@ -18,7 +18,9 @@
 class ClActivationWorkload : public BaseWorkload<ActivationQueueDescriptor>
 {
 public:
-    ClActivationWorkload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClActivationWorkload(const ActivationQueueDescriptor& descriptor,
+                         const WorkloadInfo& info,
+                         const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClAdditionWorkload.cpp b/src/backends/cl/workloads/ClAdditionWorkload.cpp
index 7e75a04..0ab7446 100644
--- a/src/backends/cl/workloads/ClAdditionWorkload.cpp
+++ b/src/backends/cl/workloads/ClAdditionWorkload.cpp
@@ -19,7 +19,8 @@
 static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
 
 ClAdditionWorkload::ClAdditionWorkload(const AdditionQueueDescriptor& descriptor,
-                                       const WorkloadInfo& info)
+                                       const WorkloadInfo& info,
+                                       const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<AdditionQueueDescriptor>(descriptor, info)
 {
     this->m_Data.ValidateInputsOutputs("ClAdditionWorkload", 2, 1);
@@ -30,7 +31,7 @@
 
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
 
-    m_Layer.configure(&input0, &input1, &output, g_AclConvertPolicy, activationInfo);
+    m_Layer.configure(clCompileContext, &input0, &input1, &output, g_AclConvertPolicy, activationInfo);
 }
 
 void ClAdditionWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClAdditionWorkload.hpp b/src/backends/cl/workloads/ClAdditionWorkload.hpp
index 372c4bc..cd25be1 100644
--- a/src/backends/cl/workloads/ClAdditionWorkload.hpp
+++ b/src/backends/cl/workloads/ClAdditionWorkload.hpp
@@ -15,7 +15,9 @@
 class ClAdditionWorkload : public BaseWorkload<AdditionQueueDescriptor>
 {
 public:
-    ClAdditionWorkload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClAdditionWorkload(const AdditionQueueDescriptor& descriptor,
+                       const WorkloadInfo& info,
+                       const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClArgMinMaxWorkload.cpp b/src/backends/cl/workloads/ClArgMinMaxWorkload.cpp
index 5910080..8974930 100644
--- a/src/backends/cl/workloads/ClArgMinMaxWorkload.cpp
+++ b/src/backends/cl/workloads/ClArgMinMaxWorkload.cpp
@@ -53,7 +53,8 @@
 
 
 ClArgMinMaxWorkload::ClArgMinMaxWorkload(const ArgMinMaxQueueDescriptor& descriptor,
-                                         const WorkloadInfo& info)
+                                         const WorkloadInfo& info,
+                                         const arm_compute::CLCompileContext& clCompileContext)
         : BaseWorkload<ArgMinMaxQueueDescriptor>(descriptor, info)
 {
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
@@ -69,7 +70,11 @@
     }
     else
     {
-        m_ArgMinMaxLayer.configure(&input, aclAxis, &output, arm_compute::ReductionOperation::ARG_IDX_MIN);
+        m_ArgMinMaxLayer.configure(clCompileContext,
+                                   &input,
+                                   aclAxis,
+                                   &output,
+                                   arm_compute::ReductionOperation::ARG_IDX_MIN);
     }
 }
 
diff --git a/src/backends/cl/workloads/ClArgMinMaxWorkload.hpp b/src/backends/cl/workloads/ClArgMinMaxWorkload.hpp
index 54f28e6..3ec137d 100644
--- a/src/backends/cl/workloads/ClArgMinMaxWorkload.hpp
+++ b/src/backends/cl/workloads/ClArgMinMaxWorkload.hpp
@@ -20,7 +20,9 @@
 class ClArgMinMaxWorkload : public BaseWorkload<ArgMinMaxQueueDescriptor>
 {
 public:
-    ClArgMinMaxWorkload(const ArgMinMaxQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClArgMinMaxWorkload(const ArgMinMaxQueueDescriptor& descriptor,
+                        const WorkloadInfo& info,
+                        const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
index c595e20..daaed17 100644
--- a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
@@ -52,7 +52,9 @@
 }
 
 ClBatchNormalizationFloatWorkload::ClBatchNormalizationFloatWorkload(
-    const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info)
+    const BatchNormalizationQueueDescriptor& descriptor,
+    const WorkloadInfo& info,
+    const arm_compute::CLCompileContext& clCompileContext)
     : FloatWorkload<BatchNormalizationQueueDescriptor>(descriptor, info)
 {
     m_Mean = std::make_unique<arm_compute::CLTensor>();
@@ -78,7 +80,8 @@
 
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
 
-    m_Layer.configure(&input,
+    m_Layer.configure(clCompileContext,
+                      &input,
                       &output,
                       m_Mean.get(),
                       m_Variance.get(),
diff --git a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp
index ef57783..c9f1f7f 100644
--- a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp
@@ -25,7 +25,9 @@
 class ClBatchNormalizationFloatWorkload : public FloatWorkload<BatchNormalizationQueueDescriptor>
 {
 public:
-    ClBatchNormalizationFloatWorkload(const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClBatchNormalizationFloatWorkload(const BatchNormalizationQueueDescriptor& descriptor,
+                                      const WorkloadInfo& info,
+                                      const arm_compute::CLCompileContext& clCompileContext);
 
     using FloatWorkload<BatchNormalizationQueueDescriptor>::FloatWorkload;
     void Execute() const override;
diff --git a/src/backends/cl/workloads/ClBatchToSpaceNdWorkload.cpp b/src/backends/cl/workloads/ClBatchToSpaceNdWorkload.cpp
index 1a7a8dc..8978c5a 100644
--- a/src/backends/cl/workloads/ClBatchToSpaceNdWorkload.cpp
+++ b/src/backends/cl/workloads/ClBatchToSpaceNdWorkload.cpp
@@ -18,8 +18,9 @@
 using namespace armcomputetensorutils;
 
 ClBatchToSpaceNdWorkload::ClBatchToSpaceNdWorkload(const BatchToSpaceNdQueueDescriptor& desc,
-                                                   const WorkloadInfo& info)
-                                                   : BaseWorkload<BatchToSpaceNdQueueDescriptor>(desc, info)
+                                                   const WorkloadInfo& info,
+                                                   const arm_compute::CLCompileContext& clCompileContext)
+   : BaseWorkload<BatchToSpaceNdQueueDescriptor>(desc, info)
 {
     m_Data.ValidateInputsOutputs("ClBatchToSpaceNdWorkload", 1, 1);
 
@@ -35,7 +36,7 @@
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
     output.info()->set_data_layout(aclDataLayout);
 
-    m_Layer.configure(&input, blockWidth, blockHeight, &output);
+    m_Layer.configure(clCompileContext, &input, blockWidth, blockHeight, &output);
 }
 
 void ClBatchToSpaceNdWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClBatchToSpaceNdWorkload.hpp b/src/backends/cl/workloads/ClBatchToSpaceNdWorkload.hpp
index 881b294..2262f33 100644
--- a/src/backends/cl/workloads/ClBatchToSpaceNdWorkload.hpp
+++ b/src/backends/cl/workloads/ClBatchToSpaceNdWorkload.hpp
@@ -18,7 +18,9 @@
 class ClBatchToSpaceNdWorkload : public BaseWorkload<BatchToSpaceNdQueueDescriptor>
 {
 public:
-    ClBatchToSpaceNdWorkload(const BatchToSpaceNdQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClBatchToSpaceNdWorkload(const BatchToSpaceNdQueueDescriptor& descriptor,
+                             const WorkloadInfo& info,
+                             const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClComparisonWorkload.cpp b/src/backends/cl/workloads/ClComparisonWorkload.cpp
index 30b336d..20e5669 100644
--- a/src/backends/cl/workloads/ClComparisonWorkload.cpp
+++ b/src/backends/cl/workloads/ClComparisonWorkload.cpp
@@ -39,7 +39,9 @@
     return aclStatus;
 }
 
-ClComparisonWorkload::ClComparisonWorkload(const ComparisonQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClComparisonWorkload::ClComparisonWorkload(const ComparisonQueueDescriptor& descriptor,
+                                           const WorkloadInfo& info,
+                                           const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<ComparisonQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClComparisonWorkload", 2, 1);
@@ -50,7 +52,7 @@
 
     const arm_compute::ComparisonOperation comparisonOperation = ConvertComparisonOperationToAcl(m_Data.m_Parameters);
 
-    m_ComparisonLayer.configure(&input0, &input1, &output, comparisonOperation);
+    m_ComparisonLayer.configure(clCompileContext, &input0, &input1, &output, comparisonOperation);
 }
 
 void ClComparisonWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClComparisonWorkload.hpp b/src/backends/cl/workloads/ClComparisonWorkload.hpp
index e842152..4a92e6b 100644
--- a/src/backends/cl/workloads/ClComparisonWorkload.hpp
+++ b/src/backends/cl/workloads/ClComparisonWorkload.hpp
@@ -20,7 +20,9 @@
 class ClComparisonWorkload : public BaseWorkload<ComparisonQueueDescriptor>
 {
 public:
-    ClComparisonWorkload(const ComparisonQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClComparisonWorkload(const ComparisonQueueDescriptor& descriptor,
+                         const WorkloadInfo& info,
+                         const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClConcatWorkload.cpp b/src/backends/cl/workloads/ClConcatWorkload.cpp
index da0d675..3c5f237 100644
--- a/src/backends/cl/workloads/ClConcatWorkload.cpp
+++ b/src/backends/cl/workloads/ClConcatWorkload.cpp
@@ -11,7 +11,6 @@
 #include <cl/ClLayerSupport.hpp>
 
 #include <arm_compute/core/Types.h>
-#include <arm_compute/runtime/CL/functions/CLConcatenateLayer.h>
 
 namespace armnn
 {
@@ -46,7 +45,9 @@
     return arm_compute::CLConcatenateLayer::validate(aclInputPtrs, &aclOutputInfo, aclAxis);
 }
 
-ClConcatWorkload::ClConcatWorkload(const ConcatQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClConcatWorkload::ClConcatWorkload(const ConcatQueueDescriptor& descriptor,
+                                   const WorkloadInfo& info,
+                                   const arm_compute::CLCompileContext& clCompileContext)
 : BaseWorkload<ConcatQueueDescriptor>(descriptor, info)
 {
     bool allInputsAreSubtensors = true;
@@ -83,7 +84,7 @@
 
     // Configure input and output tensors
     size_t aclAxis = CalcAxis(descriptor.m_Parameters);
-    layer->configure(aclInputs, &output, aclAxis);
+    layer->configure(clCompileContext, aclInputs, &output, aclAxis);
 
     // Prepare
     layer->prepare();
diff --git a/src/backends/cl/workloads/ClConcatWorkload.hpp b/src/backends/cl/workloads/ClConcatWorkload.hpp
index 772bc09..3120b42 100644
--- a/src/backends/cl/workloads/ClConcatWorkload.hpp
+++ b/src/backends/cl/workloads/ClConcatWorkload.hpp
@@ -9,6 +9,7 @@
 
 #include <arm_compute/core/Error.h>
 #include <arm_compute/runtime/IFunction.h>
+#include <arm_compute/runtime/CL/functions/CLConcatenateLayer.h>
 
 namespace armnn
 {
@@ -20,7 +21,9 @@
 class ClConcatWorkload : public BaseWorkload<ConcatQueueDescriptor>
 {
 public:
-    ClConcatWorkload(const ConcatQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClConcatWorkload(const ConcatQueueDescriptor& descriptor,
+                     const WorkloadInfo& info,
+                     const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClConstantWorkload.cpp b/src/backends/cl/workloads/ClConstantWorkload.cpp
index d6b5c57..40acb8e 100644
--- a/src/backends/cl/workloads/ClConstantWorkload.cpp
+++ b/src/backends/cl/workloads/ClConstantWorkload.cpp
@@ -41,7 +41,9 @@
     }
 }
 
-ClConstantWorkload::ClConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClConstantWorkload::ClConstantWorkload(const ConstantQueueDescriptor& descriptor,
+                                       const WorkloadInfo& info,
+                                       const arm_compute::CLCompileContext&)
     : BaseWorkload<ConstantQueueDescriptor>(descriptor, info)
     , m_RanOnce(false)
 {
diff --git a/src/backends/cl/workloads/ClConstantWorkload.hpp b/src/backends/cl/workloads/ClConstantWorkload.hpp
index e5a1d44..8fa5d63 100644
--- a/src/backends/cl/workloads/ClConstantWorkload.hpp
+++ b/src/backends/cl/workloads/ClConstantWorkload.hpp
@@ -8,6 +8,8 @@
 #include <arm_compute/core/Error.h>
 #include <backendsCommon/Workload.hpp>
 
+#include <arm_compute/core/CL/CLCompileContext.h>
+
 namespace armnn
 {
 arm_compute::Status ClConstantWorkloadValidate(const TensorInfo& output);
@@ -15,7 +17,9 @@
 class ClConstantWorkload : public BaseWorkload<ConstantQueueDescriptor>
 {
 public:
-    ClConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClConstantWorkload(const ConstantQueueDescriptor& descriptor,
+                       const WorkloadInfo& info,
+                       const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
index d2e86f8..aaffd83 100644
--- a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
+++ b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
@@ -15,7 +15,9 @@
 static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
 
 ClConvertFp16ToFp32Workload::ClConvertFp16ToFp32Workload(
-    const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info) :
+    const ConvertFp16ToFp32QueueDescriptor& descriptor,
+    const WorkloadInfo& info,
+    const arm_compute::CLCompileContext& clCompileContext) :
     Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>(descriptor, info)
 {
     this->m_Data.ValidateInputsOutputs("ClConvertFp16ToFp32Workload", 1, 1);
@@ -23,7 +25,7 @@
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(&input, &output, g_AclConvertPolicy, 0);
+    m_Layer.configure(clCompileContext, &input, &output, g_AclConvertPolicy, 0);
 }
 
 void ClConvertFp16ToFp32Workload::Execute() const
diff --git a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp
index ef5c9b6..e8e6c98 100644
--- a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp
+++ b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp
@@ -16,7 +16,9 @@
 {
 public:
 
-    ClConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor,
+                                const WorkloadInfo& info,
+                                const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
index 3f528a1..a9f1d91 100644
--- a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
+++ b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
@@ -15,7 +15,9 @@
 static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
 
 ClConvertFp32ToFp16Workload::ClConvertFp32ToFp16Workload(
-    const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info) :
+    const ConvertFp32ToFp16QueueDescriptor& descriptor,
+    const WorkloadInfo& info,
+    const arm_compute::CLCompileContext& clCompileContext) :
     Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>(descriptor, info)
 {
     this->m_Data.ValidateInputsOutputs("ClConvertFp32ToFp16Workload", 1, 1);
@@ -23,7 +25,7 @@
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(&input, &output, g_AclConvertPolicy, 0);
+    m_Layer.configure(clCompileContext, &input, &output, g_AclConvertPolicy, 0);
 }
 
 void ClConvertFp32ToFp16Workload::Execute() const
diff --git a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp
index 6e04e39..17eac7d 100644
--- a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp
+++ b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp
@@ -16,7 +16,9 @@
 {
 public:
 
-    ClConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor,
+                                const WorkloadInfo& info,
+                                const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
index 50cb9de..99a981b 100644
--- a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
+++ b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
@@ -65,6 +65,7 @@
 ClConvolution2dWorkload::ClConvolution2dWorkload(const Convolution2dQueueDescriptor& descriptor,
                                                  const WorkloadInfo& info,
                                                  std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager,
+                                                 const arm_compute::CLCompileContext& clCompileContext,
                                                  const bool isFastMathEnabled)
     : BaseWorkload<Convolution2dQueueDescriptor>(descriptor, info)
     , m_ConvolutionLayer(memoryManager)
@@ -97,7 +98,8 @@
 
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
 
-    m_ConvolutionLayer.configure(&input,
+    m_ConvolutionLayer.configure(clCompileContext,
+                                 &input,
                                  m_KernelTensor.get(),
                                  m_BiasTensor.get(),
                                  &output,
diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
index 70170b5..d0f7a5b 100644
--- a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
+++ b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
@@ -32,6 +32,7 @@
     ClConvolution2dWorkload(const Convolution2dQueueDescriptor& descriptor,
                             const WorkloadInfo& info,
                             std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager,
+                            const arm_compute::CLCompileContext& clCompileContext,
                             const bool isFastMathEnabled = false);
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClDepthToSpaceWorkload.cpp b/src/backends/cl/workloads/ClDepthToSpaceWorkload.cpp
index 43c81dc..d42b261 100644
--- a/src/backends/cl/workloads/ClDepthToSpaceWorkload.cpp
+++ b/src/backends/cl/workloads/ClDepthToSpaceWorkload.cpp
@@ -37,7 +37,8 @@
 }
 
 ClDepthToSpaceWorkload::ClDepthToSpaceWorkload(const DepthToSpaceQueueDescriptor& desc,
-                                               const WorkloadInfo& info)
+                                               const WorkloadInfo& info,
+                                               const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<DepthToSpaceQueueDescriptor>(desc, info)
 {
     m_Data.ValidateInputsOutputs("ClDepthToSpaceWorkload", 1, 1);
@@ -54,7 +55,7 @@
         PolymorphicPointerDowncast<IClTensorHandle>(m_Data.m_Outputs[0])->GetTensor();
     output.info()->set_data_layout(aclDataLayout);
 
-    m_Layer.configure(&input, &output, blockSize);
+    m_Layer.configure(clCompileContext, &input, &output, blockSize);
 }
 
 void ClDepthToSpaceWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClDepthToSpaceWorkload.hpp b/src/backends/cl/workloads/ClDepthToSpaceWorkload.hpp
index de8b496..6cb8bb5 100644
--- a/src/backends/cl/workloads/ClDepthToSpaceWorkload.hpp
+++ b/src/backends/cl/workloads/ClDepthToSpaceWorkload.hpp
@@ -21,7 +21,9 @@
 class ClDepthToSpaceWorkload : public BaseWorkload<DepthToSpaceQueueDescriptor>
 {
 public:
-    ClDepthToSpaceWorkload(const DepthToSpaceQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClDepthToSpaceWorkload(const DepthToSpaceQueueDescriptor& descriptor,
+                           const WorkloadInfo& info,
+                           const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
index 53f1684..655f0c9 100644
--- a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
+++ b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
@@ -75,7 +75,8 @@
 
 ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload(
     const DepthwiseConvolution2dQueueDescriptor& descriptor,
-    const WorkloadInfo& info)
+    const WorkloadInfo& info,
+    const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
 {
     // Allocate a buffer for the swizzling of the weight tensor
@@ -124,6 +125,7 @@
 
     m_DepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
     static_cast<arm_compute::CLDepthwiseConvolutionLayer*>(m_DepthwiseConvolutionLayer.get())->configure(
+        clCompileContext,
         &input,
         m_KernelTensor.get(),
         m_BiasTensor.get(),
diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.hpp b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.hpp
index c759137..d490012 100644
--- a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.hpp
+++ b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.hpp
@@ -27,7 +27,8 @@
     using BaseWorkload<DepthwiseConvolution2dQueueDescriptor>::m_Data;
 
     ClDepthwiseConvolutionWorkload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
-                                   const WorkloadInfo& info);
+                                   const WorkloadInfo& info,
+                                   const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClDequantizeWorkload.cpp b/src/backends/cl/workloads/ClDequantizeWorkload.cpp
index eb63900..52d8fab 100644
--- a/src/backends/cl/workloads/ClDequantizeWorkload.cpp
+++ b/src/backends/cl/workloads/ClDequantizeWorkload.cpp
@@ -28,7 +28,8 @@
 }
 
 ClDequantizeWorkload::ClDequantizeWorkload(const DequantizeQueueDescriptor& descriptor,
-                                           const WorkloadInfo& workloadInfo)
+                                           const WorkloadInfo& workloadInfo,
+                                           const arm_compute::CLCompileContext& clCompileContext)
                                            : BaseWorkload<DequantizeQueueDescriptor>(descriptor, workloadInfo)
 {
     m_Data.ValidateInputsOutputs("ClDequantizeWorkload", 1, 1);
@@ -40,7 +41,7 @@
             m_Data.m_Outputs[0])->GetTensor();
 
     m_Layer.reset(new arm_compute::CLDequantizationLayer());
-    m_Layer->configure(&input, &output);
+    m_Layer->configure(clCompileContext, &input, &output);
     m_Layer->prepare();
 }
 
diff --git a/src/backends/cl/workloads/ClDequantizeWorkload.hpp b/src/backends/cl/workloads/ClDequantizeWorkload.hpp
index 6e61da2..628ea20 100644
--- a/src/backends/cl/workloads/ClDequantizeWorkload.hpp
+++ b/src/backends/cl/workloads/ClDequantizeWorkload.hpp
@@ -17,7 +17,9 @@
 class ClDequantizeWorkload : public BaseWorkload<DequantizeQueueDescriptor>
 {
 public:
-    ClDequantizeWorkload(const DequantizeQueueDescriptor& descriptor, const WorkloadInfo& workloadInfo);
+    ClDequantizeWorkload(const DequantizeQueueDescriptor& descriptor,
+                         const WorkloadInfo& workloadInfo,
+                         const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClDivisionFloatWorkload.cpp b/src/backends/cl/workloads/ClDivisionFloatWorkload.cpp
index c79e55e..3df8d52 100644
--- a/src/backends/cl/workloads/ClDivisionFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClDivisionFloatWorkload.cpp
@@ -32,7 +32,8 @@
 
 
 ClDivisionFloatWorkload::ClDivisionFloatWorkload(const DivisionQueueDescriptor& descriptor,
-                                                     const WorkloadInfo& info)
+                                                 const WorkloadInfo& info,
+                                                 const arm_compute::CLCompileContext& clCompileContext)
     : FloatWorkload<DivisionQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClDivisionFloatWorkload", 2, 1);
@@ -43,7 +44,7 @@
 
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
 
-    m_ArithmeticDivision.configure(&input0, &input1, &output, activationInfo);
+    m_ArithmeticDivision.configure(clCompileContext, &input0, &input1, &output, activationInfo);
 }
 
 void ClDivisionFloatWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClDivisionFloatWorkload.hpp b/src/backends/cl/workloads/ClDivisionFloatWorkload.hpp
index 71d27ed..481b8b0 100644
--- a/src/backends/cl/workloads/ClDivisionFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClDivisionFloatWorkload.hpp
@@ -20,8 +20,9 @@
 class ClDivisionFloatWorkload : public FloatWorkload<DivisionQueueDescriptor>
 {
 public:
-    ClDivisionFloatWorkload(const DivisionQueueDescriptor& descriptor, const
-    WorkloadInfo& info);
+    ClDivisionFloatWorkload(const DivisionQueueDescriptor& descriptor,
+                            const WorkloadInfo& info,
+                            const arm_compute::CLCompileContext& clCompileContext);
 
     using FloatWorkload<DivisionQueueDescriptor>::FloatWorkload;
     void Execute() const override;
diff --git a/src/backends/cl/workloads/ClExpWorkload.cpp b/src/backends/cl/workloads/ClExpWorkload.cpp
index abf4181..60c383f 100644
--- a/src/backends/cl/workloads/ClExpWorkload.cpp
+++ b/src/backends/cl/workloads/ClExpWorkload.cpp
@@ -23,7 +23,9 @@
     return arm_compute::CLExpLayer::validate(&aclInput, &aclOutput);
 }
 
-ClExpWorkload::ClExpWorkload(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClExpWorkload::ClExpWorkload(const ElementwiseUnaryQueueDescriptor& descriptor,
+                             const WorkloadInfo& info,
+                             const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<ElementwiseUnaryQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClExpWorkload", 1, 1);
@@ -31,7 +33,7 @@
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_ExpLayer.configure(&input, &output);
+    m_ExpLayer.configure(clCompileContext, &input, &output);
 }
 
 void ClExpWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClExpWorkload.hpp b/src/backends/cl/workloads/ClExpWorkload.hpp
index c35aebb..407fb5e 100644
--- a/src/backends/cl/workloads/ClExpWorkload.hpp
+++ b/src/backends/cl/workloads/ClExpWorkload.hpp
@@ -18,7 +18,9 @@
 class ClExpWorkload : public BaseWorkload<ElementwiseUnaryQueueDescriptor>
 {
 public:
-    ClExpWorkload(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClExpWorkload(const ElementwiseUnaryQueueDescriptor& descriptor,
+                  const WorkloadInfo& info,
+                  const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClFillWorkload.cpp b/src/backends/cl/workloads/ClFillWorkload.cpp
index 47a70bc..a2204fa 100644
--- a/src/backends/cl/workloads/ClFillWorkload.cpp
+++ b/src/backends/cl/workloads/ClFillWorkload.cpp
@@ -15,7 +15,9 @@
 {
 using namespace armcomputetensorutils;
 
-ClFillWorkload::ClFillWorkload(const FillQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClFillWorkload::ClFillWorkload(const FillQueueDescriptor& descriptor,
+                               const WorkloadInfo& info,
+                               const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<FillQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClFillWorkload", 1, 1);
@@ -23,7 +25,7 @@
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
     arm_compute::PixelValue pixelValue = GetPixelValue(output, descriptor.m_Parameters.m_Value);
 
-    m_Layer.configure(&output, pixelValue);
+    m_Layer.configure(clCompileContext, &output, pixelValue);
 }
 
 void ClFillWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClFillWorkload.hpp b/src/backends/cl/workloads/ClFillWorkload.hpp
index 8919445..8539501 100644
--- a/src/backends/cl/workloads/ClFillWorkload.hpp
+++ b/src/backends/cl/workloads/ClFillWorkload.hpp
@@ -14,7 +14,9 @@
 class ClFillWorkload : public BaseWorkload<FillQueueDescriptor>
 {
 public:
-    ClFillWorkload(const FillQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClFillWorkload(const FillQueueDescriptor& descriptor,
+                   const WorkloadInfo& info,
+                   const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClFloorFloatWorkload.cpp b/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
index f38342e..3915270 100644
--- a/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
@@ -20,7 +20,9 @@
     return arm_compute::CLFloor::validate(&aclInput, &aclOutput);
 }
 
-ClFloorFloatWorkload::ClFloorFloatWorkload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClFloorFloatWorkload::ClFloorFloatWorkload(const FloorQueueDescriptor& descriptor,
+                                           const WorkloadInfo& info,
+                                           const arm_compute::CLCompileContext& clCompileContext)
     : FloatWorkload<FloorQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClFloorFloatWorkload", 1, 1);
@@ -28,7 +30,7 @@
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(&input, &output);
+    m_Layer.configure(clCompileContext, &input, &output);
 }
 
 void ClFloorFloatWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClFloorFloatWorkload.hpp b/src/backends/cl/workloads/ClFloorFloatWorkload.hpp
index 1ddaddf..1c755c5 100644
--- a/src/backends/cl/workloads/ClFloorFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClFloorFloatWorkload.hpp
@@ -18,7 +18,9 @@
 class ClFloorFloatWorkload : public FloatWorkload<FloorQueueDescriptor>
 {
 public:
-    ClFloorFloatWorkload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClFloorFloatWorkload(const FloorQueueDescriptor& descriptor,
+                         const WorkloadInfo& info,
+                         const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp b/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp
index eaec639..9135d27 100644
--- a/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp
+++ b/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp
@@ -45,8 +45,11 @@
                                                         fullyConnectedLayerInfo);
 }
 
-ClFullyConnectedWorkload::ClFullyConnectedWorkload(const FullyConnectedQueueDescriptor& descriptor,
-    const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ClFullyConnectedWorkload::ClFullyConnectedWorkload(
+    const FullyConnectedQueueDescriptor& descriptor,
+    const WorkloadInfo& info,
+    std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager,
+    const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<FullyConnectedQueueDescriptor>(descriptor, info)
     , m_FullyConnectedLayer(memoryManager)
 {
@@ -69,7 +72,12 @@
     arm_compute::FullyConnectedLayerInfo fc_info =
             ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor.m_Parameters, activationInfo);
 
-    m_FullyConnectedLayer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info);
+    m_FullyConnectedLayer.configure(clCompileContext,
+                                    &input,
+                                    m_WeightsTensor.get(),
+                                    m_BiasesTensor.get(),
+                                    &output,
+                                    fc_info);
 
     InitializeArmComputeClTensorData(*m_WeightsTensor, m_Data.m_Weight);
 
diff --git a/src/backends/cl/workloads/ClFullyConnectedWorkload.hpp b/src/backends/cl/workloads/ClFullyConnectedWorkload.hpp
index 311b594..45394da 100644
--- a/src/backends/cl/workloads/ClFullyConnectedWorkload.hpp
+++ b/src/backends/cl/workloads/ClFullyConnectedWorkload.hpp
@@ -27,7 +27,8 @@
 public:
     ClFullyConnectedWorkload(const armnn::FullyConnectedQueueDescriptor& descriptor,
                              const armnn::WorkloadInfo& info,
-                             std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+                             std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager,
+                             const arm_compute::CLCompileContext& clCompileContext);
 
     using armnn::BaseWorkload<armnn::FullyConnectedQueueDescriptor>::m_Data;
     void Execute() const override;
diff --git a/src/backends/cl/workloads/ClGatherWorkload.cpp b/src/backends/cl/workloads/ClGatherWorkload.cpp
index c76b9c7..98dfe7b 100644
--- a/src/backends/cl/workloads/ClGatherWorkload.cpp
+++ b/src/backends/cl/workloads/ClGatherWorkload.cpp
@@ -27,7 +27,8 @@
 }
 
 ClGatherWorkload::ClGatherWorkload(const GatherQueueDescriptor& descriptor,
-                                   const WorkloadInfo& info)
+                                   const WorkloadInfo& info,
+                                   const arm_compute::CLCompileContext& clCompileContext)
         : BaseWorkload<GatherQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClGatherWorkload", 1, 1);
@@ -38,7 +39,7 @@
 
     int aclAxis = ComputeAclAxis(descriptor.m_Parameters.m_Axis, info.m_InputTensorInfos[0]);
 
-    m_Layer.configure(&input, &indices, &output, aclAxis);
+    m_Layer.configure(clCompileContext, &input, &indices, &output, aclAxis);
 };
 
 void ClGatherWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClGatherWorkload.hpp b/src/backends/cl/workloads/ClGatherWorkload.hpp
index df71a99..8199aaf 100644
--- a/src/backends/cl/workloads/ClGatherWorkload.hpp
+++ b/src/backends/cl/workloads/ClGatherWorkload.hpp
@@ -19,7 +19,9 @@
 class ClGatherWorkload : public BaseWorkload<GatherQueueDescriptor>
 {
 public:
-    ClGatherWorkload(const GatherQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClGatherWorkload(const GatherQueueDescriptor& descriptor,
+                     const WorkloadInfo& info,
+                     const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClInstanceNormalizationWorkload.cpp b/src/backends/cl/workloads/ClInstanceNormalizationWorkload.cpp
index 50cf345..a0e8e7b 100644
--- a/src/backends/cl/workloads/ClInstanceNormalizationWorkload.cpp
+++ b/src/backends/cl/workloads/ClInstanceNormalizationWorkload.cpp
@@ -31,7 +31,8 @@
 
 ClInstanceNormalizationWorkload::ClInstanceNormalizationWorkload(
     const InstanceNormalizationQueueDescriptor& descriptor,
-    const WorkloadInfo& info)
+    const WorkloadInfo& info,
+    const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<InstanceNormalizationQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClInstanceNormalizationWorkload", 1, 1);
@@ -43,7 +44,8 @@
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
 
-    m_Layer.configure(&input,
+    m_Layer.configure(clCompileContext,
+                      &input,
                       &output,
                       descriptor.m_Parameters.m_Gamma,
                       descriptor.m_Parameters.m_Beta,
diff --git a/src/backends/cl/workloads/ClInstanceNormalizationWorkload.hpp b/src/backends/cl/workloads/ClInstanceNormalizationWorkload.hpp
index 0e37bdc..957ba34 100644
--- a/src/backends/cl/workloads/ClInstanceNormalizationWorkload.hpp
+++ b/src/backends/cl/workloads/ClInstanceNormalizationWorkload.hpp
@@ -19,7 +19,9 @@
 class ClInstanceNormalizationWorkload : public BaseWorkload<InstanceNormalizationQueueDescriptor>
 {
 public:
-    ClInstanceNormalizationWorkload(const InstanceNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClInstanceNormalizationWorkload(const InstanceNormalizationQueueDescriptor& descriptor,
+                                    const WorkloadInfo& info,
+                                    const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
index e1a6127..bd38219 100644
--- a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
@@ -27,7 +27,8 @@
 }
 
 ClL2NormalizationFloatWorkload::ClL2NormalizationFloatWorkload(const L2NormalizationQueueDescriptor& descriptor,
-                                                               const WorkloadInfo& info)
+                                                               const WorkloadInfo& info,
+                                                               const arm_compute::CLCompileContext& clCompileContext)
     : FloatWorkload<L2NormalizationQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClL2NormalizationFloatWorkload", 1, 1);
@@ -41,7 +42,7 @@
 
     int axis = (m_Data.m_Parameters.m_DataLayout == DataLayout::NCHW) ? 2 : 0;
 
-    m_Layer.configure(&input, &output, axis, m_Data.m_Parameters.m_Eps);
+    m_Layer.configure(clCompileContext, &input, &output, axis, m_Data.m_Parameters.m_Eps);
 }
 
 void ClL2NormalizationFloatWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp
index 26aea9f..8648da4 100644
--- a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp
@@ -19,7 +19,9 @@
 class ClL2NormalizationFloatWorkload : public FloatWorkload<L2NormalizationQueueDescriptor>
 {
 public:
-    ClL2NormalizationFloatWorkload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClL2NormalizationFloatWorkload(const L2NormalizationQueueDescriptor& descriptor,
+                                   const WorkloadInfo& info,
+                                   const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClLogSoftmaxWorkload.cpp b/src/backends/cl/workloads/ClLogSoftmaxWorkload.cpp
index 6d53523..1a255f1 100644
--- a/src/backends/cl/workloads/ClLogSoftmaxWorkload.cpp
+++ b/src/backends/cl/workloads/ClLogSoftmaxWorkload.cpp
@@ -25,8 +25,10 @@
     return arm_compute::CLLogSoftmaxLayer::validate(&aclInputInfo, &aclOutputInfo, descriptor.m_Beta, aclAxis);
 }
 
-ClLogSoftmaxWorkload::ClLogSoftmaxWorkload(const LogSoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
-                                           std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ClLogSoftmaxWorkload::ClLogSoftmaxWorkload(const LogSoftmaxQueueDescriptor& descriptor,
+                                           const WorkloadInfo& info,
+                                           std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager,
+                                           const arm_compute::CLCompileContext& clCompileContext)
         : BaseWorkload<LogSoftmaxQueueDescriptor>(descriptor, info)
         , m_LogSoftmaxLayer(memoryManager)
 {
@@ -36,7 +38,7 @@
     arm_compute::ICLTensor& output = static_cast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
     int aclAxis = ComputeAclAxis(m_Data.m_Parameters.m_Axis, info.m_InputTensorInfos[0]);
-    m_LogSoftmaxLayer.configure(&input, &output, m_Data.m_Parameters.m_Beta, aclAxis);
+    m_LogSoftmaxLayer.configure(clCompileContext, &input, &output, m_Data.m_Parameters.m_Beta, aclAxis);
 }
 
 void ClLogSoftmaxWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClLogSoftmaxWorkload.hpp b/src/backends/cl/workloads/ClLogSoftmaxWorkload.hpp
index 9b531ad..a283588 100644
--- a/src/backends/cl/workloads/ClLogSoftmaxWorkload.hpp
+++ b/src/backends/cl/workloads/ClLogSoftmaxWorkload.hpp
@@ -25,7 +25,8 @@
 {
 public:
     ClLogSoftmaxWorkload(const LogSoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
-                         std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+                         std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager,
+                         const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClLogicalAndWorkload.cpp b/src/backends/cl/workloads/ClLogicalAndWorkload.cpp
index 9418d73..f04cede 100644
--- a/src/backends/cl/workloads/ClLogicalAndWorkload.cpp
+++ b/src/backends/cl/workloads/ClLogicalAndWorkload.cpp
@@ -32,7 +32,8 @@
 }
 
 ClLogicalAndWorkload::ClLogicalAndWorkload(const LogicalBinaryQueueDescriptor& descriptor,
-                                           const WorkloadInfo& info)
+                                           const WorkloadInfo& info,
+                                           const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<LogicalBinaryQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClLogicalAndWorkload", 2, 1);
@@ -41,7 +42,7 @@
     arm_compute::ICLTensor& input1 = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_LogicalAndLayer.configure(&input0, &input1, &output);
+    m_LogicalAndLayer.configure(clCompileContext, &input0, &input1, &output);
 }
 
 void ClLogicalAndWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClLogicalAndWorkload.hpp b/src/backends/cl/workloads/ClLogicalAndWorkload.hpp
index 3bf6afe..c7cbf5a 100644
--- a/src/backends/cl/workloads/ClLogicalAndWorkload.hpp
+++ b/src/backends/cl/workloads/ClLogicalAndWorkload.hpp
@@ -20,7 +20,9 @@
 class ClLogicalAndWorkload : public BaseWorkload<LogicalBinaryQueueDescriptor>
 {
 public:
-    ClLogicalAndWorkload(const LogicalBinaryQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClLogicalAndWorkload(const LogicalBinaryQueueDescriptor& descriptor,
+                         const WorkloadInfo& info,
+                         const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClLogicalNotWorkload.cpp b/src/backends/cl/workloads/ClLogicalNotWorkload.cpp
index eb90caf..475e57f 100644
--- a/src/backends/cl/workloads/ClLogicalNotWorkload.cpp
+++ b/src/backends/cl/workloads/ClLogicalNotWorkload.cpp
@@ -29,7 +29,8 @@
 }
 
 ClLogicalNotWorkload::ClLogicalNotWorkload(const ElementwiseUnaryQueueDescriptor& descriptor,
-                                           const WorkloadInfo& info)
+                                           const WorkloadInfo& info,
+                                           const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<ElementwiseUnaryQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClLogicalNotWorkload", 1, 1);
@@ -37,7 +38,7 @@
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_LogicalNotLayer.configure(&input, &output);
+    m_LogicalNotLayer.configure(clCompileContext, &input, &output);
 }
 
 void ClLogicalNotWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClLogicalNotWorkload.hpp b/src/backends/cl/workloads/ClLogicalNotWorkload.hpp
index f1225c7..9c6c3d1 100644
--- a/src/backends/cl/workloads/ClLogicalNotWorkload.hpp
+++ b/src/backends/cl/workloads/ClLogicalNotWorkload.hpp
@@ -18,7 +18,9 @@
 class ClLogicalNotWorkload : public BaseWorkload<ElementwiseUnaryQueueDescriptor>
 {
 public:
-    ClLogicalNotWorkload(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClLogicalNotWorkload(const ElementwiseUnaryQueueDescriptor& descriptor,
+                         const WorkloadInfo& info,
+                         const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClLogicalOrWorkload.cpp b/src/backends/cl/workloads/ClLogicalOrWorkload.cpp
index e9895bf..355310e 100644
--- a/src/backends/cl/workloads/ClLogicalOrWorkload.cpp
+++ b/src/backends/cl/workloads/ClLogicalOrWorkload.cpp
@@ -32,7 +32,8 @@
 }
 
 ClLogicalOrWorkload::ClLogicalOrWorkload(const LogicalBinaryQueueDescriptor& descriptor,
-                                         const WorkloadInfo& info)
+                                         const WorkloadInfo& info,
+                                         const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<LogicalBinaryQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClLogicalOrWorkload", 2, 1);
@@ -41,7 +42,7 @@
     arm_compute::ICLTensor& input1 = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_LogicalOrLayer.configure(&input0, &input1, &output);
+    m_LogicalOrLayer.configure(clCompileContext, &input0, &input1, &output);
 }
 
 void ClLogicalOrWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClLogicalOrWorkload.hpp b/src/backends/cl/workloads/ClLogicalOrWorkload.hpp
index 8faabde..a6db990 100644
--- a/src/backends/cl/workloads/ClLogicalOrWorkload.hpp
+++ b/src/backends/cl/workloads/ClLogicalOrWorkload.hpp
@@ -20,7 +20,9 @@
 class ClLogicalOrWorkload : public BaseWorkload<LogicalBinaryQueueDescriptor>
 {
 public:
-    ClLogicalOrWorkload(const LogicalBinaryQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClLogicalOrWorkload(const LogicalBinaryQueueDescriptor& descriptor,
+                        const WorkloadInfo& info,
+                        const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
index fe9b45e..58cc735 100644
--- a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
@@ -19,7 +19,9 @@
 {
 using namespace armcomputetensorutils;
 
-ClLstmFloatWorkload::ClLstmFloatWorkload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info)
+ClLstmFloatWorkload::ClLstmFloatWorkload(const LstmQueueDescriptor& descriptor,
+                                         const WorkloadInfo& info,
+                                         const arm_compute::CLCompileContext& clCompileContext)
         : FloatWorkload<LstmQueueDescriptor>(descriptor, info)
 {
     arm_compute::LSTMParams<arm_compute::ICLTensor> lstm_param;
@@ -185,11 +187,12 @@
         throw armnn::Exception("Wrong Type of Activation Function!");
     }
 
-    m_LstmLayer.configure(&input, m_InputToForgetWeightsTensor.get(), m_InputToCellWeightsTensor.get(),
-                          m_InputToOutputWeightsTensor.get(), m_RecurrentToForgetWeightsTensor.get(),
-                          m_RecurrentToCellWeightsTensor.get(), m_RecurrentToOutputWeightsTensor.get(),
-                          m_ForgetGateBiasTensor.get(), m_CellBiasTensor.get(), m_OutputGateBiasTensor.get(),
-                          &output_state_in, &cell_state_in, m_ScratchBuffer.get(), &output_state_out,
+    m_LstmLayer.configure(clCompileContext, &input, m_InputToForgetWeightsTensor.get(),
+                          m_InputToCellWeightsTensor.get(), m_InputToOutputWeightsTensor.get(),
+                          m_RecurrentToForgetWeightsTensor.get(), m_RecurrentToCellWeightsTensor.get(),
+                          m_RecurrentToOutputWeightsTensor.get(), m_ForgetGateBiasTensor.get(),
+                          m_CellBiasTensor.get(), m_OutputGateBiasTensor.get(), &output_state_in,
+                          &cell_state_in, m_ScratchBuffer.get(), &output_state_out,
                           &cell_state_out, &output, lstm_param, activationLayerInfo,
                           cell_threshold, projection_threshold);
 
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.hpp b/src/backends/cl/workloads/ClLstmFloatWorkload.hpp
index b7cb408..b084750 100644
--- a/src/backends/cl/workloads/ClLstmFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.hpp
@@ -18,7 +18,9 @@
 class ClLstmFloatWorkload : public FloatWorkload<LstmQueueDescriptor>
 {
 public:
-    ClLstmFloatWorkload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClLstmFloatWorkload(const LstmQueueDescriptor& descriptor,
+                        const WorkloadInfo& info,
+                        const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClMaximumWorkload.cpp b/src/backends/cl/workloads/ClMaximumWorkload.cpp
index cd3192d..85bea47 100644
--- a/src/backends/cl/workloads/ClMaximumWorkload.cpp
+++ b/src/backends/cl/workloads/ClMaximumWorkload.cpp
@@ -37,7 +37,8 @@
 }
 
 ClMaximumWorkload::ClMaximumWorkload(const MaximumQueueDescriptor& descriptor,
-                                               const WorkloadInfo& info)
+                                     const WorkloadInfo& info,
+                                     const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<MaximumQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClMaximumWorkload", 2, 1);
@@ -46,7 +47,7 @@
     arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_MaximumLayer.configure(&input0, &input1, &output);
+    m_MaximumLayer.configure(clCompileContext, &input0, &input1, &output);
 }
 
 void ClMaximumWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClMaximumWorkload.hpp b/src/backends/cl/workloads/ClMaximumWorkload.hpp
index 18f67cd..f6681c7 100644
--- a/src/backends/cl/workloads/ClMaximumWorkload.hpp
+++ b/src/backends/cl/workloads/ClMaximumWorkload.hpp
@@ -19,7 +19,9 @@
 class ClMaximumWorkload : public BaseWorkload<MaximumQueueDescriptor>
 {
 public:
-    ClMaximumWorkload(const MaximumQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClMaximumWorkload(const MaximumQueueDescriptor& descriptor,
+                      const WorkloadInfo& info,
+                      const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClMeanWorkload.cpp b/src/backends/cl/workloads/ClMeanWorkload.cpp
index 470b6a8..4cc0f7c 100644
--- a/src/backends/cl/workloads/ClMeanWorkload.cpp
+++ b/src/backends/cl/workloads/ClMeanWorkload.cpp
@@ -28,7 +28,9 @@
     return arm_compute::CLReduceMean::validate(&aclInputInfo, coords, desc.m_KeepDims, &aclOutputInfo);
 }
 
-ClMeanWorkload::ClMeanWorkload(const MeanQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClMeanWorkload::ClMeanWorkload(const MeanQueueDescriptor& descriptor,
+                               const WorkloadInfo& info,
+                               const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<MeanQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClMeanWorkload", 1, 1);
@@ -40,7 +42,7 @@
                                                                           info.m_InputTensorInfos[0].GetNumDimensions(),
                                                                           m_Data.m_Parameters.m_Axis);
 
-    m_Layer.configure(&input, coords, m_Data.m_Parameters.m_KeepDims, &output);
+    m_Layer.configure(clCompileContext, &input, coords, m_Data.m_Parameters.m_KeepDims, &output);
 }
 
 void ClMeanWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClMeanWorkload.hpp b/src/backends/cl/workloads/ClMeanWorkload.hpp
index 127c054..04e9fe2 100644
--- a/src/backends/cl/workloads/ClMeanWorkload.hpp
+++ b/src/backends/cl/workloads/ClMeanWorkload.hpp
@@ -19,7 +19,9 @@
 class ClMeanWorkload : public BaseWorkload<MeanQueueDescriptor>
 {
 public:
-    ClMeanWorkload(const MeanQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClMeanWorkload(const MeanQueueDescriptor& descriptor,
+                   const WorkloadInfo& info,
+                   const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClMinimumWorkload.cpp b/src/backends/cl/workloads/ClMinimumWorkload.cpp
index 5f8dfdb..07a78b5 100644
--- a/src/backends/cl/workloads/ClMinimumWorkload.cpp
+++ b/src/backends/cl/workloads/ClMinimumWorkload.cpp
@@ -37,7 +37,8 @@
 }
 
 ClMinimumWorkload::ClMinimumWorkload(const MinimumQueueDescriptor& descriptor,
-                                     const WorkloadInfo& info)
+                                     const WorkloadInfo& info,
+                                     const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<MinimumQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClMinimumWorkload", 2, 1);
@@ -46,7 +47,7 @@
     arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_MinimumLayer.configure(&input0, &input1, &output);
+    m_MinimumLayer.configure(clCompileContext, &input0, &input1, &output);
 }
 
 void ClMinimumWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClMinimumWorkload.hpp b/src/backends/cl/workloads/ClMinimumWorkload.hpp
index 55d7eea..34e7bb8 100644
--- a/src/backends/cl/workloads/ClMinimumWorkload.hpp
+++ b/src/backends/cl/workloads/ClMinimumWorkload.hpp
@@ -19,7 +19,9 @@
 class ClMinimumWorkload : public BaseWorkload<MinimumQueueDescriptor>
 {
 public:
-    ClMinimumWorkload(const MinimumQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClMinimumWorkload(const MinimumQueueDescriptor& descriptor,
+                      const WorkloadInfo& info,
+                      const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClMultiplicationWorkload.cpp b/src/backends/cl/workloads/ClMultiplicationWorkload.cpp
index 46a1c4b..31e9d02 100644
--- a/src/backends/cl/workloads/ClMultiplicationWorkload.cpp
+++ b/src/backends/cl/workloads/ClMultiplicationWorkload.cpp
@@ -45,7 +45,8 @@
 
 
 ClMultiplicationWorkload::ClMultiplicationWorkload(const MultiplicationQueueDescriptor& descriptor,
-                                                   const WorkloadInfo& info)
+                                                   const WorkloadInfo& info,
+                                                   const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<MultiplicationQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClMultiplicationWorkload", 2, 1);
@@ -62,7 +63,8 @@
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
 
     // Construct
-    m_PixelWiseMultiplication.configure(&input0,
+    m_PixelWiseMultiplication.configure(clCompileContext,
+                                        &input0,
                                         &input1,
                                         &output,
                                         1.0f,
diff --git a/src/backends/cl/workloads/ClMultiplicationWorkload.hpp b/src/backends/cl/workloads/ClMultiplicationWorkload.hpp
index 461449c..424f3d7 100644
--- a/src/backends/cl/workloads/ClMultiplicationWorkload.hpp
+++ b/src/backends/cl/workloads/ClMultiplicationWorkload.hpp
@@ -20,7 +20,9 @@
 class ClMultiplicationWorkload : public BaseWorkload<MultiplicationQueueDescriptor>
 {
 public:
-    ClMultiplicationWorkload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClMultiplicationWorkload(const MultiplicationQueueDescriptor& descriptor,
+                             const WorkloadInfo& info,
+                             const arm_compute::CLCompileContext& clCompileContext);
 
     using BaseWorkload<MultiplicationQueueDescriptor>::BaseWorkload;
     void Execute() const override;
diff --git a/src/backends/cl/workloads/ClNegWorkload.cpp b/src/backends/cl/workloads/ClNegWorkload.cpp
index 2788662..7505ab6 100644
--- a/src/backends/cl/workloads/ClNegWorkload.cpp
+++ b/src/backends/cl/workloads/ClNegWorkload.cpp
@@ -23,7 +23,9 @@
     return arm_compute::CLNegLayer::validate(&aclInput, &aclOutput);
 }
 
-ClNegWorkload::ClNegWorkload(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClNegWorkload::ClNegWorkload(const ElementwiseUnaryQueueDescriptor& descriptor,
+                             const WorkloadInfo& info,
+                             const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<ElementwiseUnaryQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClNegWorkload", 1, 1);
@@ -31,7 +33,7 @@
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_NegLayer.configure(&input, &output);
+    m_NegLayer.configure(clCompileContext, &input, &output);
 }
 
 void ClNegWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClNegWorkload.hpp b/src/backends/cl/workloads/ClNegWorkload.hpp
index 9dbfa07..7ee35cb 100644
--- a/src/backends/cl/workloads/ClNegWorkload.hpp
+++ b/src/backends/cl/workloads/ClNegWorkload.hpp
@@ -18,7 +18,9 @@
 class ClNegWorkload : public BaseWorkload<ElementwiseUnaryQueueDescriptor>
 {
 public:
-    ClNegWorkload(const ElementwiseUnaryQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClNegWorkload(const ElementwiseUnaryQueueDescriptor& descriptor,
+                  const WorkloadInfo& info,
+                  const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
index 5f2fd7a..290d29a 100644
--- a/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
@@ -29,7 +29,8 @@
 }
 
 ClNormalizationFloatWorkload::ClNormalizationFloatWorkload(const NormalizationQueueDescriptor& descriptor,
-                                                           const WorkloadInfo& info)
+                                                           const WorkloadInfo& info,
+                                                           const arm_compute::CLCompileContext& clCompileContext)
     : FloatWorkload<NormalizationQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClNormalizationFloatWorkload", 1, 1);
@@ -43,7 +44,7 @@
 
     arm_compute::NormalizationLayerInfo normalizationInfo = BuildArmComputeNormalizationLayerInfo(m_Data.m_Parameters);
 
-    m_NormalizationLayer.configure(&input, &output, normalizationInfo);
+    m_NormalizationLayer.configure(clCompileContext, &input, &output, normalizationInfo);
 };
 
 void ClNormalizationFloatWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp
index a6d4f25..dd309b4 100644
--- a/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp
+++ b/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp
@@ -19,7 +19,9 @@
 class ClNormalizationFloatWorkload : public FloatWorkload<NormalizationQueueDescriptor>
 {
 public:
-    ClNormalizationFloatWorkload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClNormalizationFloatWorkload(const NormalizationQueueDescriptor& descriptor,
+                                 const WorkloadInfo& info,
+                                 const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClPadWorkload.cpp b/src/backends/cl/workloads/ClPadWorkload.cpp
index 8a8c34a..533855c 100644
--- a/src/backends/cl/workloads/ClPadWorkload.cpp
+++ b/src/backends/cl/workloads/ClPadWorkload.cpp
@@ -15,7 +15,9 @@
 {
 using namespace armcomputetensorutils;
 
-ClPadWorkload::ClPadWorkload(const PadQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClPadWorkload::ClPadWorkload(const PadQueueDescriptor& descriptor,
+                             const WorkloadInfo& info,
+                             const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<PadQueueDescriptor>(descriptor, info)
 {
     this->m_Data.ValidateInputsOutputs("ClPadWorkload", 1, 1);
@@ -33,7 +35,7 @@
 
     arm_compute::PixelValue pixelValue = GetPixelValue(input, descriptor.m_Parameters.m_PadValue);
 
-    m_Layer.configure(&input, &output, padList, pixelValue);
+    m_Layer.configure(clCompileContext, &input, &output, padList, pixelValue);
 }
 
 void ClPadWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClPadWorkload.hpp b/src/backends/cl/workloads/ClPadWorkload.hpp
index e87cbb7..ac9b4b7 100644
--- a/src/backends/cl/workloads/ClPadWorkload.hpp
+++ b/src/backends/cl/workloads/ClPadWorkload.hpp
@@ -14,7 +14,9 @@
 class ClPadWorkload : public BaseWorkload<PadQueueDescriptor>
 {
 public:
-    ClPadWorkload(const PadQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClPadWorkload(const PadQueueDescriptor& descriptor,
+                  const WorkloadInfo& info,
+                  const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClPermuteWorkload.cpp b/src/backends/cl/workloads/ClPermuteWorkload.cpp
index 41bce1d..5aadc76 100644
--- a/src/backends/cl/workloads/ClPermuteWorkload.cpp
+++ b/src/backends/cl/workloads/ClPermuteWorkload.cpp
@@ -27,7 +27,8 @@
 }
 
 ClPermuteWorkload::ClPermuteWorkload(const PermuteQueueDescriptor& descriptor,
-                                     const WorkloadInfo& info)
+                                     const WorkloadInfo& info,
+                                     const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<PermuteQueueDescriptor>(descriptor, info)
 {
     using armcomputetensorutils::BuildArmComputePermutationVector;
@@ -39,7 +40,7 @@
     const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
 
     // Run the layer.
-    m_PermuteFunction.configure(&input, &output, BuildArmComputePermutationVector(mappings));
+    m_PermuteFunction.configure(clCompileContext, &input, &output, BuildArmComputePermutationVector(mappings));
 }
 
 void ClPermuteWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClPermuteWorkload.hpp b/src/backends/cl/workloads/ClPermuteWorkload.hpp
index 8b5f4c6..2df2b26 100644
--- a/src/backends/cl/workloads/ClPermuteWorkload.hpp
+++ b/src/backends/cl/workloads/ClPermuteWorkload.hpp
@@ -29,7 +29,9 @@
         return name;
     }
 
-    ClPermuteWorkload(const PermuteQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClPermuteWorkload(const PermuteQueueDescriptor& descriptor,
+                      const WorkloadInfo& info,
+                      const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClPooling2dWorkload.cpp b/src/backends/cl/workloads/ClPooling2dWorkload.cpp
index 6838804..c7cc102 100644
--- a/src/backends/cl/workloads/ClPooling2dWorkload.cpp
+++ b/src/backends/cl/workloads/ClPooling2dWorkload.cpp
@@ -28,7 +28,9 @@
 }
 
 ClPooling2dWorkload::ClPooling2dWorkload(
-    const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info)
+    const Pooling2dQueueDescriptor& descriptor,
+    const WorkloadInfo& info,
+    const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<Pooling2dQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClPooling2dWorkload", 1, 1);
@@ -48,7 +50,7 @@
     arm_compute::PoolingLayerInfo layerInfo = BuildArmComputePoolingLayerInfo(m_Data.m_Parameters, fpMixedPrecision);
 
     // Run the layer.
-    m_PoolingLayer.configure(&input, &output, layerInfo);
+    m_PoolingLayer.configure(clCompileContext, &input, &output, layerInfo);
 }
 
 void ClPooling2dWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClPooling2dWorkload.hpp b/src/backends/cl/workloads/ClPooling2dWorkload.hpp
index ce67db2..f4117ac 100644
--- a/src/backends/cl/workloads/ClPooling2dWorkload.hpp
+++ b/src/backends/cl/workloads/ClPooling2dWorkload.hpp
@@ -22,7 +22,8 @@
     using BaseWorkload<Pooling2dQueueDescriptor>::m_Data;
 
     ClPooling2dWorkload(const Pooling2dQueueDescriptor& descriptor,
-                        const WorkloadInfo& info);
+                        const WorkloadInfo& info,
+                        const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClPreluWorkload.cpp b/src/backends/cl/workloads/ClPreluWorkload.cpp
index 1813105..73fa887 100644
--- a/src/backends/cl/workloads/ClPreluWorkload.cpp
+++ b/src/backends/cl/workloads/ClPreluWorkload.cpp
@@ -27,7 +27,8 @@
 }
 
 ClPreluWorkload::ClPreluWorkload(const PreluQueueDescriptor& descriptor,
-                                 const WorkloadInfo& info)
+                                 const WorkloadInfo& info,
+                                 const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<PreluQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClPreluWorkload", 1, 1);
@@ -36,7 +37,7 @@
     arm_compute::ICLTensor& alpha = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_PreluLayer.configure(&input, &alpha, &output);
+    m_PreluLayer.configure(clCompileContext, &input, &alpha, &output);
 }
 
 void ClPreluWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClPreluWorkload.hpp b/src/backends/cl/workloads/ClPreluWorkload.hpp
index 9061416..ac8932c 100644
--- a/src/backends/cl/workloads/ClPreluWorkload.hpp
+++ b/src/backends/cl/workloads/ClPreluWorkload.hpp
@@ -18,7 +18,9 @@
 class ClPreluWorkload : public BaseWorkload<PreluQueueDescriptor>
 {
 public:
-    ClPreluWorkload(const PreluQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClPreluWorkload(const PreluQueueDescriptor& descriptor,
+                    const WorkloadInfo& info,
+                    const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClQLstmWorkload.cpp b/src/backends/cl/workloads/ClQLstmWorkload.cpp
index 7ece05f..0ae3715 100644
--- a/src/backends/cl/workloads/ClQLstmWorkload.cpp
+++ b/src/backends/cl/workloads/ClQLstmWorkload.cpp
@@ -14,7 +14,9 @@
 {
 using namespace armcomputetensorutils;
 
-ClQLstmWorkload::ClQLstmWorkload(const QLstmQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClQLstmWorkload::ClQLstmWorkload(const QLstmQueueDescriptor& descriptor,
+                                 const WorkloadInfo& info,
+                                 const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<QLstmQueueDescriptor>(descriptor, info)
 {
     arm_compute::LSTMParams<arm_compute::ICLTensor> qLstmParams;
@@ -150,8 +152,9 @@
                                         m_Data.m_Parameters.m_CellIntermediateScale,
                                         m_Data.m_Parameters.m_OutputIntermediateScale);
 
-    // QLSTM NEON configure
-    m_QLstmLayer.configure(&input,
+    // QLSTM CL configure
+    m_QLstmLayer.configure(clCompileContext,
+                           &input,
                            m_InputToForgetWeightsTensor.get(),
                            m_InputToCellWeightsTensor.get(),
                            m_InputToOutputWeightsTensor.get(),
diff --git a/src/backends/cl/workloads/ClQLstmWorkload.hpp b/src/backends/cl/workloads/ClQLstmWorkload.hpp
index f98c9b3..6758abc 100644
--- a/src/backends/cl/workloads/ClQLstmWorkload.hpp
+++ b/src/backends/cl/workloads/ClQLstmWorkload.hpp
@@ -19,7 +19,9 @@
 class ClQLstmWorkload : public BaseWorkload<QLstmQueueDescriptor>
 {
 public:
-    ClQLstmWorkload(const QLstmQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClQLstmWorkload(const QLstmQueueDescriptor& descriptor,
+                    const WorkloadInfo& info,
+                    const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClQuantizeWorkload.cpp b/src/backends/cl/workloads/ClQuantizeWorkload.cpp
index 263065a..5c945e0 100644
--- a/src/backends/cl/workloads/ClQuantizeWorkload.cpp
+++ b/src/backends/cl/workloads/ClQuantizeWorkload.cpp
@@ -29,7 +29,9 @@
                                                       &aclOutputInfo);
 }
 
-ClQuantizeWorkload::ClQuantizeWorkload(const QuantizeQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClQuantizeWorkload::ClQuantizeWorkload(const QuantizeQueueDescriptor& descriptor,
+                                       const WorkloadInfo& info,
+                                       const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<QuantizeQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClQuantizeWorkload", 1, 1);
@@ -37,7 +39,7 @@
     arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(&input, &output);
+    m_Layer.configure(clCompileContext, &input, &output);
 }
 
 void ClQuantizeWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClQuantizeWorkload.hpp b/src/backends/cl/workloads/ClQuantizeWorkload.hpp
index f4a7ec6..9bb1572 100644
--- a/src/backends/cl/workloads/ClQuantizeWorkload.hpp
+++ b/src/backends/cl/workloads/ClQuantizeWorkload.hpp
@@ -18,7 +18,9 @@
 class ClQuantizeWorkload : public BaseWorkload<QuantizeQueueDescriptor>
 {
 public:
-    ClQuantizeWorkload(const QuantizeQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClQuantizeWorkload(const QuantizeQueueDescriptor& descriptor,
+                       const WorkloadInfo& info,
+                       const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClQuantizedLstmWorkload.cpp b/src/backends/cl/workloads/ClQuantizedLstmWorkload.cpp
index 688ebf9..636bdec 100644
--- a/src/backends/cl/workloads/ClQuantizedLstmWorkload.cpp
+++ b/src/backends/cl/workloads/ClQuantizedLstmWorkload.cpp
@@ -62,7 +62,8 @@
 }
 
 ClQuantizedLstmWorkload::ClQuantizedLstmWorkload(const QuantizedLstmQueueDescriptor &descriptor,
-                                                 const WorkloadInfo &info):
+                                                 const WorkloadInfo &info,
+                                                 const arm_compute::CLCompileContext& clCompileContext):
                                                  BaseWorkload<QuantizedLstmQueueDescriptor>(descriptor, info)
 {
     m_InputToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
@@ -108,7 +109,8 @@
     arm_compute::ICLTensor& cellStateOutTensor        = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
     arm_compute::ICLTensor& outputStateOutTensor      = static_cast<IClTensorHandle*>(m_Data.m_Outputs[1])->GetTensor();
 
-    m_QuantizedLstmLayer.configure(&inputTensor, m_InputToInputWeightsTensor.get(), m_InputToForgetWeightsTensor.get(),
+    m_QuantizedLstmLayer.configure(clCompileContext, &inputTensor, m_InputToInputWeightsTensor.get(),
+                                   m_InputToForgetWeightsTensor.get(),
                                    m_InputToCellWeightsTensor.get(), m_InputToOutputWeightsTensor.get(),
                                    m_RecurrentToInputWeightsTensor.get(), m_RecurrentToForgetWeightsTensor.get(),
                                    m_RecurrentToCellWeightsTensor.get(), m_RecurrentToOutputWeightsTensor.get(),
diff --git a/src/backends/cl/workloads/ClQuantizedLstmWorkload.hpp b/src/backends/cl/workloads/ClQuantizedLstmWorkload.hpp
index 580db49..6561850 100644
--- a/src/backends/cl/workloads/ClQuantizedLstmWorkload.hpp
+++ b/src/backends/cl/workloads/ClQuantizedLstmWorkload.hpp
@@ -22,7 +22,9 @@
 class ClQuantizedLstmWorkload : public BaseWorkload<QuantizedLstmQueueDescriptor>
 {
 public:
-    ClQuantizedLstmWorkload(const QuantizedLstmQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClQuantizedLstmWorkload(const QuantizedLstmQueueDescriptor& descriptor,
+                            const WorkloadInfo& info,
+                            const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClReshapeWorkload.cpp b/src/backends/cl/workloads/ClReshapeWorkload.cpp
index d752290..0988bab 100644
--- a/src/backends/cl/workloads/ClReshapeWorkload.cpp
+++ b/src/backends/cl/workloads/ClReshapeWorkload.cpp
@@ -21,7 +21,9 @@
     return arm_compute::CLReshapeLayer::validate(&aclInputInfo, &aclOutputInfo);
 }
 
-ClReshapeWorkload::ClReshapeWorkload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClReshapeWorkload::ClReshapeWorkload(const ReshapeQueueDescriptor& descriptor,
+                                     const WorkloadInfo& info,
+                                     const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<ReshapeQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClReshapeWorkload", 1, 1);
@@ -29,7 +31,7 @@
     arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_Layer.configure(&input, &output);
+    m_Layer.configure(clCompileContext, &input, &output);
 }
 
 void ClReshapeWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClReshapeWorkload.hpp b/src/backends/cl/workloads/ClReshapeWorkload.hpp
index d836f1e..70d7287 100644
--- a/src/backends/cl/workloads/ClReshapeWorkload.hpp
+++ b/src/backends/cl/workloads/ClReshapeWorkload.hpp
@@ -18,7 +18,9 @@
 class ClReshapeWorkload : public BaseWorkload<ReshapeQueueDescriptor>
 {
 public:
-    ClReshapeWorkload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClReshapeWorkload(const ReshapeQueueDescriptor& descriptor,
+                      const WorkloadInfo& info,
+                      const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClResizeWorkload.cpp b/src/backends/cl/workloads/ClResizeWorkload.cpp
index 744a915..e477406 100644
--- a/src/backends/cl/workloads/ClResizeWorkload.cpp
+++ b/src/backends/cl/workloads/ClResizeWorkload.cpp
@@ -46,8 +46,10 @@
                                                                        descriptor.m_AlignCorners));
 }
 
-ClResizeWorkload::ClResizeWorkload(const ResizeQueueDescriptor& descriptor, const WorkloadInfo& info) :
-    BaseWorkload<ResizeQueueDescriptor>(descriptor, info)
+ClResizeWorkload::ClResizeWorkload(const ResizeQueueDescriptor& descriptor,
+                                   const WorkloadInfo& info,
+                                   const arm_compute::CLCompileContext& clCompileContext)
+    : BaseWorkload<ResizeQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClResizeWorkload", 1, 1);
 
@@ -65,7 +67,8 @@
                                                  ? arm_compute::SamplingPolicy::CENTER
                                                  : arm_compute::SamplingPolicy::TOP_LEFT;
 
-    m_ResizeLayer.configure(&input,
+    m_ResizeLayer.configure(clCompileContext,
+                            &input,
                             &output,
                             arm_compute::ScaleKernelInfo(aclInterpolationPolicy,
                                                          arm_compute::BorderMode::REPLICATE,
diff --git a/src/backends/cl/workloads/ClResizeWorkload.hpp b/src/backends/cl/workloads/ClResizeWorkload.hpp
index ab5b943..9549a32 100644
--- a/src/backends/cl/workloads/ClResizeWorkload.hpp
+++ b/src/backends/cl/workloads/ClResizeWorkload.hpp
@@ -19,7 +19,9 @@
 class ClResizeWorkload : public BaseWorkload<ResizeQueueDescriptor>
 {
 public:
-    ClResizeWorkload(const ResizeQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClResizeWorkload(const ResizeQueueDescriptor& descriptor,
+                     const WorkloadInfo& info,
+                     const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClRsqrtWorkload.cpp b/src/backends/cl/workloads/ClRsqrtWorkload.cpp
index 48fd1e0..a3a04c1 100644
--- a/src/backends/cl/workloads/ClRsqrtWorkload.cpp
+++ b/src/backends/cl/workloads/ClRsqrtWorkload.cpp
@@ -23,7 +23,9 @@
     return arm_compute::CLRsqrtLayer::validate(&aclInput, &aclOutput);
 }
 
-ClRsqrtWorkload::ClRsqrtWorkload(const RsqrtQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClRsqrtWorkload::ClRsqrtWorkload(const RsqrtQueueDescriptor& descriptor,
+                                 const WorkloadInfo& info,
+                                 const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<RsqrtQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClRsqrtWorkload", 1, 1);
@@ -31,7 +33,7 @@
     arm_compute::ICLTensor& input  = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = PolymorphicDowncast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
-    m_RsqrtLayer.configure(&input, &output);
+    m_RsqrtLayer.configure(clCompileContext, &input, &output);
 }
 
 void ClRsqrtWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClRsqrtWorkload.hpp b/src/backends/cl/workloads/ClRsqrtWorkload.hpp
index 8fb6229..35f8414 100644
--- a/src/backends/cl/workloads/ClRsqrtWorkload.hpp
+++ b/src/backends/cl/workloads/ClRsqrtWorkload.hpp
@@ -18,7 +18,9 @@
 class ClRsqrtWorkload : public BaseWorkload<RsqrtQueueDescriptor>
 {
 public:
-    ClRsqrtWorkload(const RsqrtQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClRsqrtWorkload(const RsqrtQueueDescriptor& descriptor,
+                    const WorkloadInfo& info,
+                    const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClSliceWorkload.cpp b/src/backends/cl/workloads/ClSliceWorkload.cpp
index d7b1dbb..1627196 100644
--- a/src/backends/cl/workloads/ClSliceWorkload.cpp
+++ b/src/backends/cl/workloads/ClSliceWorkload.cpp
@@ -30,7 +30,9 @@
     return arm_compute::CLSlice::validate(&aclInput, &aclOutput, starts, ends);
 }
 
-ClSliceWorkload::ClSliceWorkload(const SliceQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClSliceWorkload::ClSliceWorkload(const SliceQueueDescriptor& descriptor,
+                                 const WorkloadInfo& info,
+                                 const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<SliceQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClSliceWorkload", 1, 1);
@@ -43,7 +45,7 @@
 
     std::tie(starts, ends) = SetClSliceData(m_Data.m_Parameters.m_Begin, m_Data.m_Parameters.m_Size);
 
-    m_SliceFunction.configure(&input, &output, starts, ends);
+    m_SliceFunction.configure(clCompileContext, &input, &output, starts, ends);
 }
 
 void ClSliceWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClSliceWorkload.hpp b/src/backends/cl/workloads/ClSliceWorkload.hpp
index 3460b77..67836c2 100644
--- a/src/backends/cl/workloads/ClSliceWorkload.hpp
+++ b/src/backends/cl/workloads/ClSliceWorkload.hpp
@@ -20,7 +20,9 @@
 class ClSliceWorkload : public BaseWorkload<SliceQueueDescriptor>
 {
 public:
-    ClSliceWorkload(const SliceQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClSliceWorkload(const SliceQueueDescriptor& descriptor,
+                    const WorkloadInfo& info,
+                    const arm_compute::CLCompileContext& clCompileContext);
     virtual void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClSoftmaxWorkload.cpp b/src/backends/cl/workloads/ClSoftmaxWorkload.cpp
index 8bc2a76..4547c68 100644
--- a/src/backends/cl/workloads/ClSoftmaxWorkload.cpp
+++ b/src/backends/cl/workloads/ClSoftmaxWorkload.cpp
@@ -25,8 +25,10 @@
     return arm_compute::CLSoftmaxLayer::validate(&aclInputInfo, &aclOutputInfo, descriptor.m_Beta, aclAxis);
 }
 
-ClSoftmaxWorkload::ClSoftmaxWorkload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
-                                     std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ClSoftmaxWorkload::ClSoftmaxWorkload(const SoftmaxQueueDescriptor& descriptor,
+                                     const WorkloadInfo& info,
+                                     std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager,
+                                     const arm_compute::CLCompileContext& clCompileContext)
         : BaseWorkload<SoftmaxQueueDescriptor>(descriptor, info)
         , m_SoftmaxLayer(memoryManager)
 {
@@ -36,7 +38,7 @@
     arm_compute::ICLTensor& output = static_cast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
     int aclAxis = ComputeAclAxis(m_Data.m_Parameters.m_Axis, info.m_InputTensorInfos[0]);
-    m_SoftmaxLayer.configure(&input, &output, m_Data.m_Parameters.m_Beta, aclAxis);
+    m_SoftmaxLayer.configure(clCompileContext, &input, &output, m_Data.m_Parameters.m_Beta, aclAxis);
 }
 
 void ClSoftmaxWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClSoftmaxWorkload.hpp b/src/backends/cl/workloads/ClSoftmaxWorkload.hpp
index 158bf46..1742c60 100644
--- a/src/backends/cl/workloads/ClSoftmaxWorkload.hpp
+++ b/src/backends/cl/workloads/ClSoftmaxWorkload.hpp
@@ -23,8 +23,10 @@
 class ClSoftmaxWorkload : public BaseWorkload<SoftmaxQueueDescriptor>
 {
 public:
-    ClSoftmaxWorkload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
-                      std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+    ClSoftmaxWorkload(const SoftmaxQueueDescriptor& descriptor,
+                      const WorkloadInfo& info,
+                      std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager,
+                      const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
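
Workloads that additionally hold an arm_compute memory manager (Softmax above, TransposeConvolution2d further down) follow the same shape, with the compile context appended after the memory-manager parameter. Only configure() consumes it; the memory manager still initialises the layer member. A hedged sketch with hypothetical ClBarWorkload / CLBarLayer names:

ClBarWorkload::ClBarWorkload(const BarQueueDescriptor& descriptor,
                             const WorkloadInfo& info,
                             std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager,
                             const arm_compute::CLCompileContext& clCompileContext)
    : BaseWorkload<BarQueueDescriptor>(descriptor, info)
    , m_BarLayer(memoryManager)   // the memory manager is still passed at construction time
{
    arm_compute::ICLTensor& input  = static_cast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
    arm_compute::ICLTensor& output = static_cast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();

    // configure() takes the compile context, which supplies the CL context/device
    // used when the underlying kernels are built.
    m_BarLayer.configure(clCompileContext, &input, &output);
}
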
diff --git a/src/backends/cl/workloads/ClSpaceToBatchNdWorkload.cpp b/src/backends/cl/workloads/ClSpaceToBatchNdWorkload.cpp
index 443c56b..7b29cde 100644
--- a/src/backends/cl/workloads/ClSpaceToBatchNdWorkload.cpp
+++ b/src/backends/cl/workloads/ClSpaceToBatchNdWorkload.cpp
@@ -45,7 +45,9 @@
 }
 
 ClSpaceToBatchNdWorkload::ClSpaceToBatchNdWorkload(
-    const SpaceToBatchNdQueueDescriptor& descriptor, const WorkloadInfo& info)
+    const SpaceToBatchNdQueueDescriptor& descriptor,
+    const WorkloadInfo& info,
+    const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<SpaceToBatchNdQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClSpaceToBatchNdWorkload", 1, 1);
@@ -68,7 +70,8 @@
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
 
-    m_SpaceToBatchLayer.configure(&input,
+    m_SpaceToBatchLayer.configure(clCompileContext,
+                                  &input,
                                   blockWidth,
                                   blockHeight,
                                   paddingLeftTop,
diff --git a/src/backends/cl/workloads/ClSpaceToBatchNdWorkload.hpp b/src/backends/cl/workloads/ClSpaceToBatchNdWorkload.hpp
index 7500b5a..06d243a 100644
--- a/src/backends/cl/workloads/ClSpaceToBatchNdWorkload.hpp
+++ b/src/backends/cl/workloads/ClSpaceToBatchNdWorkload.hpp
@@ -22,7 +22,9 @@
 class ClSpaceToBatchNdWorkload : public BaseWorkload<SpaceToBatchNdQueueDescriptor>
 {
 public:
-    ClSpaceToBatchNdWorkload(const SpaceToBatchNdQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClSpaceToBatchNdWorkload(const SpaceToBatchNdQueueDescriptor& descriptor,
+                             const WorkloadInfo& info,
+                             const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClSpaceToDepthWorkload.cpp b/src/backends/cl/workloads/ClSpaceToDepthWorkload.cpp
index f35fe0e..7a590d2 100644
--- a/src/backends/cl/workloads/ClSpaceToDepthWorkload.cpp
+++ b/src/backends/cl/workloads/ClSpaceToDepthWorkload.cpp
@@ -18,7 +18,8 @@
 using namespace armcomputetensorutils;
 
 ClSpaceToDepthWorkload::ClSpaceToDepthWorkload(const SpaceToDepthQueueDescriptor& desc,
-                                               const WorkloadInfo& info)
+                                               const WorkloadInfo& info,
+                                               const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<SpaceToDepthQueueDescriptor>(desc, info)
 {
     m_Data.ValidateInputsOutputs("ClSpaceToDepthWorkload", 1, 1);
@@ -33,7 +34,7 @@
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
     output.info()->set_data_layout(aclDataLayout);
 
-    m_Layer.configure(&input, &output, blockSize);
+    m_Layer.configure(clCompileContext, &input, &output, blockSize);
 }
 
 void ClSpaceToDepthWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClSpaceToDepthWorkload.hpp b/src/backends/cl/workloads/ClSpaceToDepthWorkload.hpp
index 57ce5d4..b782bbe 100644
--- a/src/backends/cl/workloads/ClSpaceToDepthWorkload.hpp
+++ b/src/backends/cl/workloads/ClSpaceToDepthWorkload.hpp
@@ -19,7 +19,9 @@
 class ClSpaceToDepthWorkload : public BaseWorkload<SpaceToDepthQueueDescriptor>
 {
 public:
-    ClSpaceToDepthWorkload(const SpaceToDepthQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClSpaceToDepthWorkload(const SpaceToDepthQueueDescriptor& descriptor,
+                           const WorkloadInfo& info,
+                           const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClSplitterWorkload.cpp b/src/backends/cl/workloads/ClSplitterWorkload.cpp
index 045fbb7..70a8178 100644
--- a/src/backends/cl/workloads/ClSplitterWorkload.cpp
+++ b/src/backends/cl/workloads/ClSplitterWorkload.cpp
@@ -9,7 +9,6 @@
 
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 #include <aclCommon/ArmComputeUtils.hpp>
-#include <arm_compute/runtime/CL/functions/CLSplit.h>
 #include <armnn/utility/PolymorphicDowncast.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
 #include <cl/ClTensorHandle.hpp>
@@ -53,7 +52,9 @@
     return arm_compute::CLSplit::validate(&aclInputInfo, aclOutputPtr, aclAxis);
 }
 
-ClSplitterWorkload::ClSplitterWorkload(const SplitterQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClSplitterWorkload::ClSplitterWorkload(const SplitterQueueDescriptor& descriptor,
+                                       const WorkloadInfo& info,
+                                       const arm_compute::CLCompileContext&)
         : BaseWorkload<SplitterQueueDescriptor>(descriptor, info)
 {
     bool allOutputsAreSubtensors = true;
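
Where a workload has nothing to configure in its constructor (the splitter typically runs on subtensors), the parameter is still accepted so the factory can construct every workload with the same signature, but it is left unnamed to avoid an unused-parameter warning. Two equivalent ways to express that, sketched with a hypothetical ClBazWorkload:

// Variant used in this patch: keep the type, omit the parameter name.
ClBazWorkload::ClBazWorkload(const BazQueueDescriptor& descriptor,
                             const WorkloadInfo& info,
                             const arm_compute::CLCompileContext&)
    : BaseWorkload<BazQueueDescriptor>(descriptor, info)
{
}

// Equivalent variant: name the parameter and mark it unused explicitly.
ClBazWorkload::ClBazWorkload(const BazQueueDescriptor& descriptor,
                             const WorkloadInfo& info,
                             const arm_compute::CLCompileContext& clCompileContext)
    : BaseWorkload<BazQueueDescriptor>(descriptor, info)
{
    IgnoreUnused(clCompileContext); // armnn::IgnoreUnused, from armnn/utility/IgnoreUnused.hpp
}
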
diff --git a/src/backends/cl/workloads/ClSplitterWorkload.hpp b/src/backends/cl/workloads/ClSplitterWorkload.hpp
index 82211f5..c59aa02 100644
--- a/src/backends/cl/workloads/ClSplitterWorkload.hpp
+++ b/src/backends/cl/workloads/ClSplitterWorkload.hpp
@@ -9,6 +9,7 @@
 
 #include <arm_compute/core/Error.h>
 #include <arm_compute/runtime/IFunction.h>
+#include <arm_compute/runtime/CL/functions/CLSplit.h>
 
 #include <functional>
 
@@ -22,7 +23,9 @@
 class ClSplitterWorkload : public BaseWorkload<SplitterQueueDescriptor>
 {
 public:
-    ClSplitterWorkload(const SplitterQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClSplitterWorkload(const SplitterQueueDescriptor& descriptor,
+                       const WorkloadInfo& info,
+                       const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClStackWorkload.cpp b/src/backends/cl/workloads/ClStackWorkload.cpp
index c0b88b1..749282f 100644
--- a/src/backends/cl/workloads/ClStackWorkload.cpp
+++ b/src/backends/cl/workloads/ClStackWorkload.cpp
@@ -44,7 +44,9 @@
     return arm_compute::CLStackLayer::validate(aclInputPtrs, aclAxis, &aclOutputInfo);
 }
 
-ClStackWorkload::ClStackWorkload(const StackQueueDescriptor& descriptor, const WorkloadInfo& info)
+ClStackWorkload::ClStackWorkload(const StackQueueDescriptor& descriptor,
+                                 const WorkloadInfo& info,
+                                 const arm_compute::CLCompileContext& clCompileContext)
 : BaseWorkload<StackQueueDescriptor>(descriptor, info)
 {
     std::vector<arm_compute::ICLTensor*> aclInputs;
@@ -58,7 +60,7 @@
 
     m_Layer.reset(new arm_compute::CLStackLayer());
     int aclAxis = CalcAxis(descriptor.m_Parameters.m_Axis, descriptor.m_Parameters.m_InputShape.GetNumDimensions());
-    m_Layer->configure(aclInputs, aclAxis, &output);
+    m_Layer->configure(clCompileContext, aclInputs, aclAxis, &output);
 }
 
 void ClStackWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClStackWorkload.hpp b/src/backends/cl/workloads/ClStackWorkload.hpp
index f27d6cd..3f1e642 100644
--- a/src/backends/cl/workloads/ClStackWorkload.hpp
+++ b/src/backends/cl/workloads/ClStackWorkload.hpp
@@ -18,7 +18,9 @@
 class ClStackWorkload : public BaseWorkload<StackQueueDescriptor>
 {
 public:
-    ClStackWorkload(const StackQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClStackWorkload(const StackQueueDescriptor& descriptor,
+                    const WorkloadInfo& info,
+                    const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClStridedSliceWorkload.cpp b/src/backends/cl/workloads/ClStridedSliceWorkload.cpp
index b094a91..92e860f 100644
--- a/src/backends/cl/workloads/ClStridedSliceWorkload.cpp
+++ b/src/backends/cl/workloads/ClStridedSliceWorkload.cpp
@@ -53,7 +53,8 @@
 }
 
 ClStridedSliceWorkload::ClStridedSliceWorkload(const StridedSliceQueueDescriptor& descriptor,
-                                               const WorkloadInfo& info)
+                                               const WorkloadInfo& info,
+                                               const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<StridedSliceQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs("ClStridedSliceWorkload", 1, 1);
@@ -78,7 +79,8 @@
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
 
-    m_StridedSliceLayer.configure(&input,
+    m_StridedSliceLayer.configure(clCompileContext,
+                                  &input,
                                   &output,
                                   starts,
                                   ends,
diff --git a/src/backends/cl/workloads/ClStridedSliceWorkload.hpp b/src/backends/cl/workloads/ClStridedSliceWorkload.hpp
index bce3fe1..1229599 100644
--- a/src/backends/cl/workloads/ClStridedSliceWorkload.hpp
+++ b/src/backends/cl/workloads/ClStridedSliceWorkload.hpp
@@ -22,7 +22,9 @@
 class ClStridedSliceWorkload : public BaseWorkload<StridedSliceQueueDescriptor>
 {
 public:
-    ClStridedSliceWorkload(const StridedSliceQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClStridedSliceWorkload(const StridedSliceQueueDescriptor& descriptor,
+                           const WorkloadInfo& info,
+                           const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private:
diff --git a/src/backends/cl/workloads/ClSubtractionWorkload.cpp b/src/backends/cl/workloads/ClSubtractionWorkload.cpp
index 865dceb..31e0bec 100644
--- a/src/backends/cl/workloads/ClSubtractionWorkload.cpp
+++ b/src/backends/cl/workloads/ClSubtractionWorkload.cpp
@@ -19,7 +19,8 @@
 static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
 
 ClSubtractionWorkload::ClSubtractionWorkload(const SubtractionQueueDescriptor& descriptor,
-                                             const WorkloadInfo& info)
+                                             const WorkloadInfo& info,
+                                             const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<SubtractionQueueDescriptor>(descriptor, info)
 {
     this->m_Data.ValidateInputsOutputs("ClSubtractionWorkload", 2, 1);
@@ -30,7 +31,7 @@
 
     const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
 
-    m_Layer.configure(&input0, &input1, &output, g_AclConvertPolicy, activationInfo);
+    m_Layer.configure(clCompileContext, &input0, &input1, &output, g_AclConvertPolicy, activationInfo);
 }
 
 void ClSubtractionWorkload::Execute() const
diff --git a/src/backends/cl/workloads/ClSubtractionWorkload.hpp b/src/backends/cl/workloads/ClSubtractionWorkload.hpp
index 9f51de6..28440b0 100644
--- a/src/backends/cl/workloads/ClSubtractionWorkload.hpp
+++ b/src/backends/cl/workloads/ClSubtractionWorkload.hpp
@@ -15,7 +15,9 @@
 class ClSubtractionWorkload : public BaseWorkload<SubtractionQueueDescriptor>
 {
 public:
-    ClSubtractionWorkload(const SubtractionQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClSubtractionWorkload(const SubtractionQueueDescriptor& descriptor,
+                          const WorkloadInfo& info,
+                          const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClTransposeConvolution2dWorkload.cpp b/src/backends/cl/workloads/ClTransposeConvolution2dWorkload.cpp
index 20b2104..ff0fd5c 100644
--- a/src/backends/cl/workloads/ClTransposeConvolution2dWorkload.cpp
+++ b/src/backends/cl/workloads/ClTransposeConvolution2dWorkload.cpp
@@ -56,7 +56,8 @@
 ClTransposeConvolution2dWorkload::ClTransposeConvolution2dWorkload(
     const TransposeConvolution2dQueueDescriptor& descriptor,
     const WorkloadInfo& info,
-    std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) :
+    std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager,
+    const arm_compute::CLCompileContext& clCompileContext) :
     BaseWorkload<TransposeConvolution2dQueueDescriptor>(descriptor, info),
     m_Layer(memoryManager)
 {
@@ -82,7 +83,7 @@
     output.info()->set_data_layout(aclDataLayout);
 
     arm_compute::PadStrideInfo padStrideInfo = BuildArmComputePadStrideInfo(m_Data.m_Parameters);
-    m_Layer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, padStrideInfo);
+    m_Layer.configure(clCompileContext, &input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, padStrideInfo);
 
     InitializeArmComputeClTensorData(*m_WeightsTensor, m_Data.m_Weight);
     if (m_BiasesTensor)
diff --git a/src/backends/cl/workloads/ClTransposeConvolution2dWorkload.hpp b/src/backends/cl/workloads/ClTransposeConvolution2dWorkload.hpp
index b7320bf..8a24e6d 100644
--- a/src/backends/cl/workloads/ClTransposeConvolution2dWorkload.hpp
+++ b/src/backends/cl/workloads/ClTransposeConvolution2dWorkload.hpp
@@ -29,7 +29,8 @@
 public:
     ClTransposeConvolution2dWorkload(const TransposeConvolution2dQueueDescriptor& descriptor,
                                      const WorkloadInfo& info,
-                                     std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+                                     std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager,
+                                     const arm_compute::CLCompileContext& clCompileContext);
 
     void Execute() const override;
 
diff --git a/src/backends/cl/workloads/ClTransposeWorkload.cpp b/src/backends/cl/workloads/ClTransposeWorkload.cpp
index b276b22..7ef502e 100644
--- a/src/backends/cl/workloads/ClTransposeWorkload.cpp
+++ b/src/backends/cl/workloads/ClTransposeWorkload.cpp
@@ -27,7 +27,8 @@
 }
 
 ClTransposeWorkload::ClTransposeWorkload(const TransposeQueueDescriptor& descriptor,
-                                         const WorkloadInfo& info)
+                                         const WorkloadInfo& info,
+                                         const arm_compute::CLCompileContext& clCompileContext)
     : BaseWorkload<TransposeQueueDescriptor>(descriptor, info)
 {
     m_Data.ValidateInputsOutputs(GetName(), 1, 1);
@@ -36,7 +37,9 @@
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
     const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
     // Run the layer.
-    m_PermuteFunction.configure(&input, &output,
+    m_PermuteFunction.configure(clCompileContext,
+                                &input,
+                                &output,
                                 armcomputetensorutils::BuildArmComputeTransposeVector(mappings));
 }
 
diff --git a/src/backends/cl/workloads/ClTransposeWorkload.hpp b/src/backends/cl/workloads/ClTransposeWorkload.hpp
index c1bed93..4677bdc 100644
--- a/src/backends/cl/workloads/ClTransposeWorkload.hpp
+++ b/src/backends/cl/workloads/ClTransposeWorkload.hpp
@@ -29,7 +29,9 @@
         return name;
     }
 
-    ClTransposeWorkload(const TransposeQueueDescriptor& descriptor, const WorkloadInfo& info);
+    ClTransposeWorkload(const TransposeQueueDescriptor& descriptor,
+                        const WorkloadInfo& info,
+                        const arm_compute::CLCompileContext& clCompileContext);
     void Execute() const override;
 
 private: