IVGCVSW-3890 Add NEON INSTANCE_NORMALIZATION Workload

Signed-off-by: Sadik Armagan <sadik.armagan@arm.com>
Change-Id: Ia3e55da6e6a7b9d46544466897e3b1635c90c297
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
index 2f3643f..270cb62 100644
--- a/src/backends/neon/NeonLayerSupport.cpp
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -28,6 +28,7 @@
 #include "workloads/NeonDepthwiseConvolutionWorkload.hpp"
 #include "workloads/NeonDequantizeWorkload.hpp"
 #include "workloads/NeonGreaterWorkload.hpp"
+#include "workloads/NeonInstanceNormalizationWorkload.hpp"
 #include "workloads/NeonL2NormalizationFloatWorkload.hpp"
 #include "workloads/NeonLstmFloatWorkload.hpp"
 #include "workloads/NeonMaximumWorkload.hpp"
@@ -366,6 +367,18 @@
     return IsNeonBackendSupported(reasonIfUnsupported);
 }
 
+bool NeonLayerSupport::IsInstanceNormalizationSupported(const TensorInfo& input,
+                                                        const TensorInfo& output,
+                                                        const InstanceNormalizationDescriptor& descriptor,
+                                                        Optional<std::string&> reasonIfUnsupported) const
+{
+    FORWARD_WORKLOAD_VALIDATE_FUNC(NeonInstanceNormalizationWorkloadValidate,
+                                   reasonIfUnsupported,
+                                   input,
+                                   output,
+                                   descriptor);
+}
+
 bool NeonLayerSupport::IsL2NormalizationSupported(const TensorInfo& input,
                                                   const TensorInfo& output,
                                                   const L2NormalizationDescriptor& descriptor,
diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp
index 76eb342..d6a24ad 100644
--- a/src/backends/neon/NeonLayerSupport.hpp
+++ b/src/backends/neon/NeonLayerSupport.hpp
@@ -106,6 +106,11 @@
     bool IsInputSupported(const TensorInfo& input,
                           Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsInstanceNormalizationSupported(const TensorInfo& input,
+                                          const TensorInfo& output,
+                                          const InstanceNormalizationDescriptor& descriptor,
+                                          Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsL2NormalizationSupported(const TensorInfo& input,
                                     const TensorInfo& output,
                                     const L2NormalizationDescriptor& descriptor,
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index 3492923..5bd8f29 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -95,18 +95,6 @@
     return tensorHandle;
 }
 
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor,
-                                                            const WorkloadInfo&        info) const
-{
-    return std::make_unique<CopyMemGenericWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor,
-                                                             const WorkloadInfo&        info) const
-{
-    return std::make_unique<CopyMemGenericWorkload>(descriptor, info);
-}
-
 std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateAbs(const AbsQueueDescriptor& descriptor,
                                                           const WorkloadInfo& info) const
 {
@@ -119,48 +107,54 @@
     return std::make_unique<NeonActivationWorkload>(descriptor, info);
 }
 
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSoftmax(const SoftmaxQueueDescriptor& descriptor,
-                                                              const WorkloadInfo&           info) const
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor,
+                                                                      const WorkloadInfo&            info) const
 {
-    return MakeWorkloadHelper<NeonSoftmaxFloatWorkload, NeonSoftmaxUint8Workload>(descriptor, info,
-        m_MemoryManager->GetIntraLayerManager());
+    return std::make_unique<NeonAdditionWorkload>(descriptor, info);
 }
 
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor,
-                                                               const WorkloadInfo&            info) const
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateArgMinMax(const ArgMinMaxQueueDescriptor& descriptor,
+                                                                const WorkloadInfo& info) const
 {
-    return std::make_unique<NeonSplitterWorkload>(descriptor, info);
+    return std::make_unique<NeonArgMinMaxWorkload>(descriptor, info);
 }
 
-std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateMerger(const MergerQueueDescriptor& descriptor,
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateBatchNormalization(
+    const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonBatchNormalizationWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateBatchToSpaceNd(const BatchToSpaceNdQueueDescriptor& descriptor,
+                                                                     const WorkloadInfo& info) const
+{
+    return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateConcat(const ConcatQueueDescriptor& descriptor,
                                                                     const WorkloadInfo&          info) const
 {
-    return CreateConcat(descriptor, info);
+    return std::make_unique<NeonConcatWorkload>(descriptor, info);
 }
 
-std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateFullyConnected(
-    const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) const
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConstant(const ConstantQueueDescriptor& descriptor,
+                                                               const WorkloadInfo& info) const
 {
-    return MakeWorkloadHelper<NeonFullyConnectedWorkload, NeonFullyConnectedWorkload>(descriptor, info,
-        m_MemoryManager->GetIntraLayerManager());
+    return std::make_unique<NeonConstantWorkload>(descriptor, info);
 }
 
-std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor,
-                                                                     const WorkloadInfo&           info) const
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp16ToFp32(
+    const ConvertFp16ToFp32QueueDescriptor& descriptor,
+    const WorkloadInfo& info) const
 {
-    return std::make_unique<NeonPermuteWorkload>(descriptor, info);
+    return std::make_unique<NeonConvertFp16ToFp32Workload>(descriptor, info);
 }
 
-std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
-                                                                       const WorkloadInfo&           info) const
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToFp16(
+    const ConvertFp32ToFp16QueueDescriptor& descriptor,
+    const WorkloadInfo& info) const
 {
-    return std::make_unique<NeonPooling2dWorkload>(descriptor, info);
-}
-
-std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePrelu(const armnn::PreluQueueDescriptor &descriptor,
-                                                                   const armnn::WorkloadInfo &info) const
-{
-    return std::make_unique<NeonPreluWorkload>(descriptor, info);
+    return std::make_unique<NeonConvertFp32ToFp16Workload>(descriptor, info);
 }
 
 std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateConvolution2d(
@@ -170,6 +164,12 @@
                                                        m_MemoryManager->GetIntraLayerManager());
 }
 
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateDebug(const DebugQueueDescriptor& descriptor,
+                                                            const WorkloadInfo& info) const
+{
+    return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateDepthToSpace(const DepthToSpaceQueueDescriptor& descriptor,
                                                                    const WorkloadInfo& info) const
 {
@@ -194,41 +194,86 @@
     return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
 }
 
-std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateNormalization(
-    const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const
-{
-    return MakeWorkloadHelper<NeonNormalizationFloatWorkload, NullWorkload>(descriptor, info,
-        m_MemoryManager->GetIntraLayerManager());
-}
-
-std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor,
-                                                                      const WorkloadInfo&            info) const
-{
-    return std::make_unique<NeonAdditionWorkload>(descriptor, info);
-}
-
-std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateMultiplication(
-    const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info) const
-{
-    return std::make_unique<NeonMultiplicationWorkload>(descriptor, info);
-}
-
 std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateDivision(
     const DivisionQueueDescriptor& descriptor, const WorkloadInfo& info) const
 {
     return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
 }
 
-std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateSubtraction(
-    const SubtractionQueueDescriptor& descriptor, const WorkloadInfo& info) const
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateEqual(const EqualQueueDescriptor& descriptor,
+                                                            const WorkloadInfo& info) const
 {
-    return std::make_unique<NeonSubtractionWorkload>(descriptor, info);
+    return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
 }
 
-std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateBatchNormalization(
-    const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFakeQuantization(
+    const FakeQuantizationQueueDescriptor& descriptor,
+    const WorkloadInfo& info) const
 {
-    return std::make_unique<NeonBatchNormalizationWorkload>(descriptor, info);
+    return nullptr;
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFloor(const FloorQueueDescriptor& descriptor,
+                                                            const WorkloadInfo& info) const
+{
+    return MakeWorkloadHelper<NeonFloorFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateFullyConnected(
+    const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+    return MakeWorkloadHelper<NeonFullyConnectedWorkload, NeonFullyConnectedWorkload>(
+        descriptor, info, m_MemoryManager->GetIntraLayerManager());
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateGather(const armnn::GatherQueueDescriptor& descriptor,
+                                                             const armnn::WorkloadInfo& info) const
+{
+    return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateGreater(const GreaterQueueDescriptor& descriptor,
+                                                              const WorkloadInfo& info) const
+{
+    return MakeWorkloadHelper<NeonGreaterFloat32Workload, NeonGreaterUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor,
+                                                            const WorkloadInfo&        info) const
+{
+    return std::make_unique<CopyMemGenericWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateInstanceNormalization(
+    const InstanceNormalizationQueueDescriptor& descriptor,
+    const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonInstanceNormalizationWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor,
+                                                                      const WorkloadInfo& info) const
+{
+    return MakeWorkloadHelper<NeonL2NormalizationFloatWorkload, NullWorkload>(descriptor, info,
+                                                                              m_MemoryManager->GetIntraLayerManager());
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor,
+                                                           const WorkloadInfo& info) const
+{
+    return MakeWorkloadHelper<NeonLstmFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateMaximum(const MaximumQueueDescriptor& descriptor,
+                                                              const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonMaximumWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateMean(const MeanQueueDescriptor& descriptor,
+                                                           const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonMeanWorkload>(descriptor, info);
 }
 
 std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateMemCopy(const MemCopyQueueDescriptor& descriptor,
@@ -253,6 +298,85 @@
     return std::make_unique<ImportMemGenericWorkload>(descriptor, info);
 }
 
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateMerger(const MergerQueueDescriptor& descriptor,
+                                                                    const WorkloadInfo&          info) const
+{
+    return CreateConcat(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateMinimum(const MinimumQueueDescriptor& descriptor,
+                                                              const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonMinimumWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateMultiplication(
+    const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonMultiplicationWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateNormalization(
+    const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+    return MakeWorkloadHelper<NeonNormalizationFloatWorkload, NullWorkload>(descriptor, info,
+                                                                            m_MemoryManager->GetIntraLayerManager());
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor,
+                                                             const WorkloadInfo&        info) const
+{
+    return std::make_unique<CopyMemGenericWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreatePad(const PadQueueDescriptor& descriptor,
+                                                          const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonPadWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor,
+                                                                     const WorkloadInfo&           info) const
+{
+    return std::make_unique<NeonPermuteWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
+                                                                       const WorkloadInfo&           info) const
+{
+    return std::make_unique<NeonPooling2dWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreatePreCompiled(const PreCompiledQueueDescriptor& descriptor,
+                                                                  const WorkloadInfo& info) const
+{
+    return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePrelu(const armnn::PreluQueueDescriptor &descriptor,
+                                                                   const armnn::WorkloadInfo &info) const
+{
+    return std::make_unique<NeonPreluWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateQuantize(const QuantizeQueueDescriptor& descriptor,
+                                                                      const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonQuantizeWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateQuantizedLstm(const QuantizedLstmQueueDescriptor& descriptor,
+                                                                    const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonQuantizedLstmWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateReshape(const ReshapeQueueDescriptor& descriptor,
+                                                              const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonReshapeWorkload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateResize(const ResizeQueueDescriptor& descriptor,
                                                              const WorkloadInfo& info) const
 {
@@ -274,166 +398,35 @@
     return CreateResize(resizeDescriptor, info);
 }
 
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFakeQuantization(
-    const FakeQuantizationQueueDescriptor& descriptor,
-    const WorkloadInfo& info) const
-{
-    return nullptr;
-}
-
-std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateQuantize(const QuantizeQueueDescriptor& descriptor,
-                                                                      const WorkloadInfo& info) const
-{
-    return std::make_unique<NeonQuantizeWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor,
-    const WorkloadInfo& info) const
-{
-    return MakeWorkloadHelper<NeonL2NormalizationFloatWorkload, NullWorkload>(descriptor, info,
-        m_MemoryManager->GetIntraLayerManager());
-}
-
-std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateConcat(const ConcatQueueDescriptor& descriptor,
-                                                                    const WorkloadInfo&          info) const
-{
-    return std::make_unique<NeonConcatWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConstant(const ConstantQueueDescriptor& descriptor,
-    const WorkloadInfo& info) const
-{
-    return std::make_unique<NeonConstantWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateReshape(const ReshapeQueueDescriptor& descriptor,
-    const WorkloadInfo& info) const
-{
-    return std::make_unique<NeonReshapeWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSpaceToBatchNd(const SpaceToBatchNdQueueDescriptor& descriptor,
-    const WorkloadInfo& info) const
-{
-    return nullptr;
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSpaceToDepth(const armnn::SpaceToDepthQueueDescriptor& descriptor,
-                                                                  const armnn::WorkloadInfo& info) const
-{
-    return std::make_unique<NeonSpaceToDepthWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFloor(const FloorQueueDescriptor& descriptor,
-    const WorkloadInfo& info) const
-{
-    return MakeWorkloadHelper<NeonFloorFloatWorkload, NullWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor,
-    const WorkloadInfo& info) const
-{
-    return MakeWorkloadHelper<NeonLstmFloatWorkload, NullWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateQuantizedLstm(const QuantizedLstmQueueDescriptor& descriptor,
-                                                                    const WorkloadInfo& info) const
-{
-    return std::make_unique<NeonQuantizedLstmWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp16ToFp32(
-    const ConvertFp16ToFp32QueueDescriptor& descriptor,
-    const WorkloadInfo& info) const
-{
-    return std::make_unique<NeonConvertFp16ToFp32Workload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToFp16(
-    const ConvertFp32ToFp16QueueDescriptor& descriptor,
-    const WorkloadInfo& info) const
-{
-    return std::make_unique<NeonConvertFp32ToFp16Workload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateMaximum(const MaximumQueueDescriptor& descriptor,
-                                                              const WorkloadInfo& info) const
-{
-    return std::make_unique<NeonMaximumWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateMean(const MeanQueueDescriptor& descriptor,
-                                                           const WorkloadInfo& info) const
-{
-    return std::make_unique<NeonMeanWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreatePad(const PadQueueDescriptor& descriptor,
-                                                          const WorkloadInfo& info) const
-{
-    return std::make_unique<NeonPadWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateEqual(const EqualQueueDescriptor& descriptor,
-                                                               const WorkloadInfo& info) const
-{
-    return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateBatchToSpaceNd(const BatchToSpaceNdQueueDescriptor& descriptor,
-                                                                     const WorkloadInfo& info) const
-{
-    return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateStridedSlice(const StridedSliceQueueDescriptor& descriptor,
-                                                                   const WorkloadInfo& info) const
-{
-    return std::make_unique<NeonStridedSliceWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateMinimum(const MinimumQueueDescriptor& descriptor,
-                                                              const WorkloadInfo& info) const
-{
-    return std::make_unique<NeonMinimumWorkload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateGreater(const GreaterQueueDescriptor& descriptor,
-                                                              const WorkloadInfo& info) const
-{
-    return MakeWorkloadHelper<NeonGreaterFloat32Workload, NeonGreaterUint8Workload>(descriptor, info);
-}
-
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateDebug(const DebugQueueDescriptor& descriptor,
-                                                            const WorkloadInfo& info) const
-{
-    return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
-}
-
 std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateRsqrt(const RsqrtQueueDescriptor &descriptor,
                                                             const WorkloadInfo &info) const
 {
     return std::make_unique<NeonRsqrtWorkload>(descriptor, info);
 }
 
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateTransposeConvolution2d(
-    const TransposeConvolution2dQueueDescriptor &descriptor,
-    const WorkloadInfo &info) const
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSoftmax(const SoftmaxQueueDescriptor& descriptor,
+                                                              const WorkloadInfo& info) const
 {
-    return std::make_unique<NeonTransposeConvolution2dWorkload>(descriptor, info,
-                                                                m_MemoryManager->GetIntraLayerManager());
+    return MakeWorkloadHelper<NeonSoftmaxFloatWorkload, NeonSoftmaxUint8Workload>(
+        descriptor, info, m_MemoryManager->GetIntraLayerManager());
 }
 
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreatePreCompiled(const PreCompiledQueueDescriptor& descriptor,
-                                                                  const WorkloadInfo& info) const
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSpaceToBatchNd(const SpaceToBatchNdQueueDescriptor& descriptor,
+                                                                     const WorkloadInfo& info) const
 {
-    return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
+    return nullptr;
 }
 
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateGather(const armnn::GatherQueueDescriptor& descriptor,
-                                                             const armnn::WorkloadInfo& info) const
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSpaceToDepth(
+    const armnn::SpaceToDepthQueueDescriptor& descriptor, const armnn::WorkloadInfo& info) const
 {
-    return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
+    return std::make_unique<NeonSpaceToDepthWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor,
+                                                               const WorkloadInfo&            info) const
+{
+    return std::make_unique<NeonSplitterWorkload>(descriptor, info);
 }
 
 std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateStack(const StackQueueDescriptor& descriptor,
@@ -442,10 +435,24 @@
     return std::make_unique<NeonStackWorkload>(descriptor, info);
 }
 
-std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateArgMinMax(const ArgMinMaxQueueDescriptor& descriptor,
-                                                                const WorkloadInfo& info) const
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateStridedSlice(const StridedSliceQueueDescriptor& descriptor,
+                                                                   const WorkloadInfo& info) const
 {
-    return std::make_unique<NeonArgMinMaxWorkload>(descriptor, info);
+    return std::make_unique<NeonStridedSliceWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateSubtraction(
+    const SubtractionQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonSubtractionWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateTransposeConvolution2d(
+    const TransposeConvolution2dQueueDescriptor &descriptor,
+    const WorkloadInfo &info) const
+{
+    return std::make_unique<NeonTransposeConvolution2dWorkload>(descriptor, info,
+                                                                m_MemoryManager->GetIntraLayerManager());
 }
 
 } // namespace armnn
diff --git a/src/backends/neon/NeonWorkloadFactory.hpp b/src/backends/neon/NeonWorkloadFactory.hpp
index aad9cf9..9546164 100644
--- a/src/backends/neon/NeonWorkloadFactory.hpp
+++ b/src/backends/neon/NeonWorkloadFactory.hpp
@@ -39,43 +39,42 @@
                                                       DataLayout dataLayout,
                                                       const bool IsMemoryManaged = true) const override;
 
-    std::unique_ptr<IWorkload> CreateInput(const InputQueueDescriptor& descriptor,
-                                           const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateOutput(const OutputQueueDescriptor& descriptor,
-                                            const WorkloadInfo& info) const override;
-
     std::unique_ptr<IWorkload> CreateAbs(const AbsQueueDescriptor& descriptor,
                                          const WorkloadInfo& info) const override;
 
     std::unique_ptr<IWorkload> CreateActivation(const ActivationQueueDescriptor& descriptor,
                                                 const WorkloadInfo& info) const override;
 
-    std::unique_ptr<IWorkload> CreateSoftmax(const SoftmaxQueueDescriptor& descriptor,
-                                             const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateSplitter(const SplitterQueueDescriptor& descriptor,
+    std::unique_ptr<IWorkload> CreateAddition(const AdditionQueueDescriptor& descriptor,
                                               const WorkloadInfo& info) const override;
 
-    ARMNN_DEPRECATED_MSG("Use CreateConcat instead")
-    std::unique_ptr<IWorkload> CreateMerger(const MergerQueueDescriptor& descriptor,
-                                            const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateFullyConnected(const FullyConnectedQueueDescriptor& descriptor,
-                                                    const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreatePermute(const PermuteQueueDescriptor& descriptor,
-                                             const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
+    std::unique_ptr<IWorkload> CreateArgMinMax(const ArgMinMaxQueueDescriptor& descriptor,
                                                const WorkloadInfo& info) const override;
 
-    std::unique_ptr<IWorkload> CreatePrelu(const PreluQueueDescriptor& descriptor,
-                                           const WorkloadInfo& info) const override;
+    std::unique_ptr<IWorkload> CreateBatchNormalization(const BatchNormalizationQueueDescriptor& descriptor,
+                                                        const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateBatchToSpaceNd(const BatchToSpaceNdQueueDescriptor& descriptor,
+                                                    const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateConcat(const ConcatQueueDescriptor& descriptor,
+                                            const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateConstant(const ConstantQueueDescriptor& descriptor,
+                                              const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
+                                                       const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
+                                                       const WorkloadInfo& info) const override;
 
     std::unique_ptr<IWorkload> CreateConvolution2d(const Convolution2dQueueDescriptor& descriptor,
                                                    const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateDebug(const DebugQueueDescriptor& descriptor,
+                                           const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreateDepthToSpace(const DepthToSpaceQueueDescriptor& descriptor,
                                                   const WorkloadInfo& info) const override;
 
@@ -88,17 +87,44 @@
     std::unique_ptr<IWorkload> CreateDetectionPostProcess(const DetectionPostProcessQueueDescriptor& descriptor,
                                                           const WorkloadInfo& info) const override;
 
-    std::unique_ptr<IWorkload> CreateNormalization(const NormalizationQueueDescriptor& descriptor,
-                                                   const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateMultiplication(const MultiplicationQueueDescriptor& descriptor,
-                                                    const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateAddition(const AdditionQueueDescriptor& descriptor,
+    std::unique_ptr<IWorkload> CreateDivision(const DivisionQueueDescriptor& descriptor,
                                               const WorkloadInfo& info) const override;
 
-    std::unique_ptr<IWorkload> CreateBatchNormalization(const BatchNormalizationQueueDescriptor& descriptor,
-                                                        const WorkloadInfo& info) const override;
+    std::unique_ptr<IWorkload> CreateEqual(const EqualQueueDescriptor& descriptor,
+                                           const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateFakeQuantization(const FakeQuantizationQueueDescriptor& descriptor,
+                                                      const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateFloor(const FloorQueueDescriptor& descriptor,
+                                           const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateFullyConnected(const FullyConnectedQueueDescriptor& descriptor,
+                                                    const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateGather(const GatherQueueDescriptor& descriptor,
+                                            const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateGreater(const GreaterQueueDescriptor& descriptor,
+                                             const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateInput(const InputQueueDescriptor& descriptor,
+                                           const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateInstanceNormalization(const InstanceNormalizationQueueDescriptor& descriptor,
+                                                           const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor,
+                                                     const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateLstm(const LstmQueueDescriptor& descriptor,
+                                          const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateMaximum(const MaximumQueueDescriptor& descriptor,
+                                             const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateMean(const MeanQueueDescriptor& descriptor,
+                                          const WorkloadInfo& Info) const override;
 
     std::unique_ptr<IWorkload> CreateMemCopy(const MemCopyQueueDescriptor& descriptor,
                                              const WorkloadInfo& info) const override;
@@ -106,6 +132,46 @@
     std::unique_ptr<IWorkload> CreateMemImport(const MemImportQueueDescriptor& descriptor,
                                                const WorkloadInfo& info) const override;
 
+    ARMNN_DEPRECATED_MSG("Use CreateConcat instead")
+    std::unique_ptr<IWorkload> CreateMerger(const MergerQueueDescriptor& descriptor,
+                                            const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateMinimum(const MinimumQueueDescriptor& descriptor,
+                                             const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateMultiplication(const MultiplicationQueueDescriptor& descriptor,
+                                                    const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateNormalization(const NormalizationQueueDescriptor& descriptor,
+                                                   const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateOutput(const OutputQueueDescriptor& descriptor,
+                                            const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreatePad(const PadQueueDescriptor& descriptor,
+                                         const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreatePermute(const PermuteQueueDescriptor& descriptor,
+                                             const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
+                                               const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreatePreCompiled(const PreCompiledQueueDescriptor& descriptor,
+                                                 const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreatePrelu(const PreluQueueDescriptor& descriptor,
+                                           const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateQuantize(const QuantizeQueueDescriptor& descriptor,
+                                              const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateQuantizedLstm(const QuantizedLstmQueueDescriptor& descriptor,
+                                                   const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateReshape(const ReshapeQueueDescriptor& descriptor,
+                                             const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreateResize(const ResizeQueueDescriptor& descriptor,
                                             const WorkloadInfo& info) const override;
 
@@ -113,22 +179,10 @@
     std::unique_ptr<IWorkload> CreateResizeBilinear(const ResizeBilinearQueueDescriptor& descriptor,
                                                     const WorkloadInfo& info) const override;
 
-    std::unique_ptr<IWorkload> CreateFakeQuantization(const FakeQuantizationQueueDescriptor& descriptor,
-                                                      const WorkloadInfo& info) const override;
+    std::unique_ptr<IWorkload> CreateRsqrt(const RsqrtQueueDescriptor& descriptor,
+                                           const WorkloadInfo& info) const override;
 
-    std::unique_ptr<IWorkload> CreateQuantize(const QuantizeQueueDescriptor& descriptor,
-                                              const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor,
-                                                     const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateConcat(const ConcatQueueDescriptor& descriptor,
-                                            const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateConstant(const ConstantQueueDescriptor& descriptor,
-                                              const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateReshape(const ReshapeQueueDescriptor& descriptor,
+    std::unique_ptr<IWorkload> CreateSoftmax(const SoftmaxQueueDescriptor& descriptor,
                                              const WorkloadInfo& info) const override;
 
     std::unique_ptr<IWorkload> CreateSpaceToBatchNd(const SpaceToBatchNdQueueDescriptor& descriptor,
@@ -137,71 +191,20 @@
     std::unique_ptr<IWorkload> CreateSpaceToDepth(const SpaceToDepthQueueDescriptor& descriptor,
                                                   const WorkloadInfo& info) const override;
 
-    std::unique_ptr<IWorkload> CreateFloor(const FloorQueueDescriptor& descriptor,
-                                           const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateLstm(const LstmQueueDescriptor& descriptor,
-                                          const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateQuantizedLstm(const QuantizedLstmQueueDescriptor& descriptor,
-                                                   const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
-                                                       const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
-                                                       const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateDivision(const DivisionQueueDescriptor& descriptor,
+    std::unique_ptr<IWorkload> CreateSplitter(const SplitterQueueDescriptor& descriptor,
                                               const WorkloadInfo& info) const override;
 
-    std::unique_ptr<IWorkload> CreateSubtraction(const SubtractionQueueDescriptor& descriptor,
-                                                 const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateMaximum(const MaximumQueueDescriptor& descriptor,
-                                             const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateMean(const MeanQueueDescriptor& descriptor,
-                                          const WorkloadInfo& Info) const override;
-
-    std::unique_ptr<IWorkload> CreatePad(const PadQueueDescriptor& descriptor,
-                                         const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateEqual(const EqualQueueDescriptor& descriptor,
-                                           const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateBatchToSpaceNd(const BatchToSpaceNdQueueDescriptor& descriptor,
-                                                    const WorkloadInfo& Info) const override;
-
-    std::unique_ptr<IWorkload> CreateStridedSlice(const StridedSliceQueueDescriptor& descriptor,
-                                                  const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateMinimum(const MinimumQueueDescriptor& descriptor,
-                                             const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateGreater(const GreaterQueueDescriptor& descriptor,
-                                             const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateDebug(const DebugQueueDescriptor& descriptor,
-                                           const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateTransposeConvolution2d(const TransposeConvolution2dQueueDescriptor& descriptor,
-                                                            const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateRsqrt(const RsqrtQueueDescriptor& descriptor,
-                                           const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreatePreCompiled(const PreCompiledQueueDescriptor& descriptor,
-                                                 const WorkloadInfo& info) const override;
-
-    std::unique_ptr<IWorkload> CreateGather(const GatherQueueDescriptor& descriptor,
-                                            const WorkloadInfo& info) const override;
-
     std::unique_ptr<IWorkload> CreateStack(const StackQueueDescriptor& descriptor,
                                            const WorkloadInfo& info) const override;
 
-    std::unique_ptr<IWorkload> CreateArgMinMax(const ArgMinMaxQueueDescriptor& descriptor,
-                                               const WorkloadInfo& info) const override;
+    std::unique_ptr<IWorkload> CreateStridedSlice(const StridedSliceQueueDescriptor& descriptor,
+                                                  const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateSubtraction(const SubtractionQueueDescriptor& descriptor,
+                                                 const WorkloadInfo& info) const override;
+
+    std::unique_ptr<IWorkload> CreateTransposeConvolution2d(const TransposeConvolution2dQueueDescriptor& descriptor,
+                                                            const WorkloadInfo& info) const override;
 
 private:
     mutable std::shared_ptr<NeonMemoryManager> m_MemoryManager;
diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk
index fb10a0d..c66f5da 100644
--- a/src/backends/neon/backend.mk
+++ b/src/backends/neon/backend.mk
@@ -37,6 +37,7 @@
         workloads/NeonFloorFloatWorkload.cpp \
         workloads/NeonFullyConnectedWorkload.cpp \
         workloads/NeonGreaterWorkload.cpp \
+        workloads/NeonInstanceNormalizationWorkload.cpp \
         workloads/NeonL2NormalizationFloatWorkload.cpp \
         workloads/NeonLstmFloatWorkload.cpp \
         workloads/NeonMaximumWorkload.cpp \
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index 0d1faa9..1d8aa11 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -473,6 +473,13 @@
 ARMNN_AUTO_TEST_CASE(BatchNormFloat32, BatchNormFloat32Test)
 ARMNN_AUTO_TEST_CASE(BatchNormFloat32Nhwc, BatchNormFloat32NhwcTest)
 
+// InstanceNormalization
+ARMNN_AUTO_TEST_CASE(InstanceNormFloat32Nchw, InstanceNormFloat32Test, DataLayout::NCHW)
+ARMNN_AUTO_TEST_CASE(InstanceNormFloat32Nhwc, InstanceNormFloat32Test, DataLayout::NHWC)
+
+ARMNN_AUTO_TEST_CASE(InstanceNormFloat32Nchw2, InstanceNormFloat32Test2, DataLayout::NCHW)
+ARMNN_AUTO_TEST_CASE(InstanceNormFloat32Nhwc2, InstanceNormFloat32Test2, DataLayout::NHWC)
+
 // Constant
 ARMNN_AUTO_TEST_CASE(Constant, ConstantTest)
 ARMNN_AUTO_TEST_CASE(ConstantUint8, ConstantUint8SimpleQuantizationScaleNoOffsetTest)
@@ -518,6 +525,7 @@
 ARMNN_AUTO_TEST_CASE(Concat4dDiffShapeDim0Uint8, Concat4dDiffShapeDim0Uint8Test)
 ARMNN_AUTO_TEST_CASE(Concat4dDiffShapeDim1Uint8, Concat4dDiffShapeDim1Uint8Test)
 ARMNN_AUTO_TEST_CASE(Concat4dDiffShapeDim3Uint8, Concat4dDiffShapeDim3Uint8Test, false)
+
 // L2 Normalization
 ARMNN_AUTO_TEST_CASE(L2Normalization1d, L2Normalization1dTest, DataLayout::NCHW)
 ARMNN_AUTO_TEST_CASE(L2Normalization2d, L2Normalization2dTest, DataLayout::NCHW)
diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt
index f8d5922..46e4928 100644
--- a/src/backends/neon/workloads/CMakeLists.txt
+++ b/src/backends/neon/workloads/CMakeLists.txt
@@ -36,6 +36,8 @@
     NeonFullyConnectedWorkload.hpp
     NeonGreaterWorkload.cpp
     NeonGreaterWorkload.hpp
+    NeonInstanceNormalizationWorkload.cpp
+    NeonInstanceNormalizationWorkload.hpp
     NeonL2NormalizationFloatWorkload.cpp
     NeonL2NormalizationFloatWorkload.hpp
     NeonLstmFloatWorkload.cpp
diff --git a/src/backends/neon/workloads/NeonInstanceNormalizationWorkload.cpp b/src/backends/neon/workloads/NeonInstanceNormalizationWorkload.cpp
new file mode 100644
index 0000000..d6c3081
--- /dev/null
+++ b/src/backends/neon/workloads/NeonInstanceNormalizationWorkload.cpp
@@ -0,0 +1,68 @@
+//
+// Copyright © 2019 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonInstanceNormalizationWorkload.hpp"
+
+#include "NeonWorkloadUtils.hpp"
+
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+#include <backendsCommon/CpuTensorHandle.hpp>
+#include <neon/NeonTensorHandle.hpp>
+
+using namespace armnn::armcomputetensorutils;
+
+namespace armnn
+{
+
+// Asks arm_compute::NEInstanceNormalizationLayer::validate whether this configuration is acceptable.
+// Both tensor infos are converted to ACL tensor infos using the descriptor's data layout; gamma, beta
+// and epsilon are forwarded unchanged, and the resulting ACL status is returned as-is.
+arm_compute::Status NeonInstanceNormalizationWorkloadValidate(const TensorInfo& input,
+                                                              const TensorInfo& output,
+                                                              const InstanceNormalizationDescriptor& descriptor)
+{
+    const arm_compute::TensorInfo aclInputInfo  = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
+
+    return arm_compute::NEInstanceNormalizationLayer::validate(&aclInputInfo,
+                                                               &aclOutputInfo,
+                                                               descriptor.m_Gamma,
+                                                               descriptor.m_Beta,
+                                                               descriptor.m_Eps);
+}
+
+// Builds the workload: checks the input/output counts, tags both ACL tensors with the requested
+// data layout and configures the ACL layer with the descriptor's gamma, beta and epsilon.
+NeonInstanceNormalizationWorkload::NeonInstanceNormalizationWorkload(
+    const InstanceNormalizationQueueDescriptor& descriptor,
+    const WorkloadInfo& info)
+    : BaseWorkload<InstanceNormalizationQueueDescriptor>(descriptor, info)
+{
+    // Expect exactly one input and one output tensor handle.
+    m_Data.ValidateInputsOutputs("NeonInstanceNormalizationWorkload", 1, 1);
+
+    arm_compute::ITensor& input  = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ITensor& output = static_cast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    // The data layout must be set on both tensors before the layer is configured with them.
+    arm_compute::DataLayout aclDataLayout = ConvertDataLayout(m_Data.m_Parameters.m_DataLayout);
+    input.info()->set_data_layout(aclDataLayout);
+    output.info()->set_data_layout(aclDataLayout);
+
+    m_Layer.configure(&input,
+                      &output,
+                      descriptor.m_Parameters.m_Gamma,
+                      descriptor.m_Parameters.m_Beta,
+                      descriptor.m_Parameters.m_Eps);
+}
+
+// Runs the layer configured in the constructor, wrapped in a NEON scoped profiling event.
+void NeonInstanceNormalizationWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonInstanceNormalizationWorkload_Execute");
+    m_Layer.run();
+}
+
+} // namespace armnn
diff --git a/src/backends/neon/workloads/NeonInstanceNormalizationWorkload.hpp b/src/backends/neon/workloads/NeonInstanceNormalizationWorkload.hpp
new file mode 100644
index 0000000..4d40837
--- /dev/null
+++ b/src/backends/neon/workloads/NeonInstanceNormalizationWorkload.hpp
@@ -0,0 +1,35 @@
+//
+// Copyright © 2019 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <neon/workloads/NeonWorkloadUtils.hpp>
+
+#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h>
+
+namespace armnn
+{
+
+// Returns the Arm Compute Library status for running instance normalization on the NEON backend
+// with the given input/output tensor infos and descriptor (data layout, gamma, beta, epsilon).
+arm_compute::Status NeonInstanceNormalizationWorkloadValidate(const TensorInfo& input,
+                                                              const TensorInfo& output,
+                                                              const InstanceNormalizationDescriptor& descriptor);
+
+// NEON workload that executes instance normalization through arm_compute::NEInstanceNormalizationLayer.
+class NeonInstanceNormalizationWorkload : public BaseWorkload<InstanceNormalizationQueueDescriptor>
+{
+public:
+    // Configures the ACL layer from the queue descriptor; Execute() then runs that configured layer.
+    NeonInstanceNormalizationWorkload(const InstanceNormalizationQueueDescriptor& descriptor,
+                                      const WorkloadInfo& info);
+    void Execute() const override;
+
+private:
+    // mutable: Execute() is const yet calls m_Layer.run() (presumably non-const in ACL - confirm).
+    mutable arm_compute::NEInstanceNormalizationLayer m_Layer;
+};
+
+} // namespace armnn
diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp
index 8044a4f..7d99d26 100644
--- a/src/backends/neon/workloads/NeonWorkloads.hpp
+++ b/src/backends/neon/workloads/NeonWorkloads.hpp
@@ -19,6 +19,7 @@
 #include "NeonFloorFloatWorkload.hpp"
 #include "NeonFullyConnectedWorkload.hpp"
 #include "NeonGreaterWorkload.hpp"
+#include "NeonInstanceNormalizationWorkload.hpp"
 #include "NeonL2NormalizationFloatWorkload.hpp"
 #include "NeonLstmFloatWorkload.hpp"
 #include "NeonQuantizedLstmWorkload.hpp"