COMPMID-812 Add NHWC data format support for NEON depthwise convolution (optimized case).

Change-Id: Icdfd6c02ed526daf4f59a4b76c7bbc1bc48fde74
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/125938
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index 8ac882c..1779ff1 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -54,14 +54,17 @@
 using CLDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T>;
 
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset(), framework::dataset::make("DataType",
-                                                                                                                 DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                                                                                                                 framework::dataset::make("DataType",
+                                                                                                                         DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
                                                                                                                      framework::dataset::make("DataType",
-                                                                                                                             DataType::F32)))
+                                                                                                                             DataType::F32)),
+                                                                                                                     framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
@@ -73,15 +76,17 @@
 TEST_SUITE(Float)
 TEST_SUITE(F16)
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                                                                                                                    framework::dataset::make("DataType",
-                                                                                                                           DataType::F16)))
+                                                                                                                           DataType::F16)),
+                                                                                                                   framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
                                                                                                                        framework::dataset::make("DataType",
-                                                                                                                               DataType::F16)))
+                                                                                                                               DataType::F16)),
+                                                                                                                       framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
@@ -90,15 +95,17 @@
 
 TEST_SUITE(FP32)
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                                                                                                                     framework::dataset::make("DataType",
-                                                                                                                            DataType::F32)))
+                                                                                                                            DataType::F32)),
+                                                                                                                    framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
                                                                                                                         framework::dataset::make("DataType",
-                                                                                                                                DataType::F32)))
+                                                                                                                                DataType::F32)),
+                                                                                                                        framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
diff --git a/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp b/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
index cacf696..2baa93e 100644
--- a/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,15 +55,17 @@
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                                                                                                                    framework::dataset::make("DataType",
-                                                                                                                           DataType::F16)))
+                                                                                                                           DataType::F16)),
+                                                                                                                   framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(GCAccessor(_target), _reference, tolerance_fp16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
                                                                                                                        framework::dataset::make("DataType",
-                                                                                                                               DataType::F16)))
+                                                                                                                               DataType::F16)),
+                                                                                                                       framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(GCAccessor(_target), _reference, tolerance_fp16, tolerance_num);
 }
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 0cdd4c0..49e146c 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -95,15 +95,17 @@
 TEST_SUITE(Generic)
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
                                                                                                                        framework::dataset::make("DataType",
-                                                                                                                               DataType::F32)))
+                                                                                                                               DataType::F32)),
+                                                                                                                       framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
                                                                                                                      framework::dataset::make("DataType",
-                                                                                                                             DataType::F32)))
+                                                                                                                             DataType::F32)),
+                                                                                                                     framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -112,21 +114,24 @@
 TEST_SUITE(W3x3)
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer3x3, T>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                                                                                                                     framework::dataset::make("DataType",
-                                                                                                                            DataType::F32)))
+                                                                                                                            DataType::F32)),
+                                                                                                                    framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
                                                                                                                         framework::dataset::make("DataType",
-                                                                                                                                DataType::F32)))
+                                                                                                                                DataType::F32)),
+                                                                                                                        framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunOptimized, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(datasets::OptimizedDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunOptimized, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(datasets::OptimizedDepthwiseConvolutionLayerDataset3x3(),
                                                                                                                         framework::dataset::make("DataType",
-                                                                                                                                DataType::F32)))
+                                                                                                                                DataType::F32)),
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
index df5436f..ccdd443 100644
--- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
@@ -52,15 +52,22 @@
 
 public:
     template <typename...>
-    void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, QuantizationInfo quantization_info)
+    void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, QuantizationInfo quantization_info, DataLayout data_layout)
     {
         _quantization_info = quantization_info;
         _data_type         = data_type;
         const TensorShape biases_shape(weights_shape[2]);
         const DataType    bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
 
-        _target    = compute_target(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, data_type, bias_data_type, quantization_info);
-        _reference = compute_reference(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, data_type, bias_data_type, quantization_info);
+        if(data_layout == DataLayout::NHWC)
+        {
+            permute(in_shape, PermutationVector(2U, 0U, 1U));
+            permute(weights_shape, PermutationVector(2U, 0U, 1U));
+            permute(out_shape, PermutationVector(2U, 0U, 1U));
+        }
+
+        _target    = compute_target(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, data_type, bias_data_type, quantization_info, data_layout);
+        _reference = compute_reference(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, data_type, bias_data_type, quantization_info, data_layout);
     }
 
 protected:
@@ -94,13 +101,13 @@
     }
 
     TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &biases_shape, const TensorShape &output_shape, PadStrideInfo &pad_stride_info,
-                              const DataType data_type, const DataType bias_data_type, const QuantizationInfo quantization_info)
+                              const DataType data_type, const DataType bias_data_type, const QuantizationInfo quantization_info, const DataLayout data_layout)
     {
         // Create tensors
-        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1, 0, quantization_info);
-        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, 0, quantization_info);
-        TensorType biases  = create_tensor<TensorType>(biases_shape, bias_data_type, 1, 0, quantization_info);
-        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1, 0, quantization_info);
+        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1, 0, quantization_info, data_layout);
+        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, 0, quantization_info, data_layout);
+        TensorType biases  = create_tensor<TensorType>(biases_shape, bias_data_type, 1, 0, quantization_info, data_layout);
+        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1, 0, quantization_info, data_layout);
 
         // Create Depthwise Convolution configure function
         FunctionType dwc;
@@ -134,11 +141,11 @@
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &in_shape, const TensorShape &weights_shape, const TensorShape &biases_shape, const TensorShape &out_shape, const PadStrideInfo &pad_stride_info,
-                                      const DataType data_type, const DataType bias_data_type, QuantizationInfo quantization_info)
+                                      const DataType data_type, const DataType bias_data_type, const QuantizationInfo quantization_info, const DataLayout data_layout)
     {
-        SimpleTensor<T>     src{ in_shape, data_type, 1, 0, quantization_info };
-        SimpleTensor<T>     weights{ weights_shape, data_type, 1, 0, quantization_info };
-        SimpleTensor<TBias> biases{ biases_shape, bias_data_type, 1, 0, quantization_info };
+        SimpleTensor<T>     src{ in_shape, data_type, 1, 0, quantization_info, data_layout };
+        SimpleTensor<T>     weights{ weights_shape, data_type, 1, 0, quantization_info, data_layout };
+        SimpleTensor<TBias> biases{ biases_shape, bias_data_type, 1, 0, quantization_info, data_layout };
 
         fill(src, 0);
         fill(weights, 1);
@@ -158,10 +165,10 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type)
+    void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, DataLayout data_layout)
     {
         DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(in_shape, weights_shape, out_shape, pad_stride_info,
-                                                                                                            data_type, QuantizationInfo());
+                                                                                                            data_type, QuantizationInfo(), data_layout);
     }
 };
 
@@ -173,7 +180,7 @@
     void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, QuantizationInfo quantization_info)
     {
         DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(in_shape, weights_shape, out_shape, pad_stride_info,
-                                                                                                            data_type, quantization_info);
+                                                                                                            data_type, quantization_info, DataLayout::NCHW);
     }
 };
 } // namespace validation
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.cpp b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
index b2a7067..ab61b7d 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
@@ -24,6 +24,7 @@
 #include "DepthwiseConvolutionLayer.h"
 
 #include "ConvolutionLayer.h"
+#include "Permute.h"
 #include "Utils.h"
 
 #include "tests/validation/FixedPoint.h"
@@ -50,11 +51,8 @@
  *
  */
 template <typename T, typename TB>
-SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info)
+void depthwise_convolution_nchw(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, SimpleTensor<T> &dst, const PadStrideInfo &conv_info)
 {
-    // Create reference
-    SimpleTensor<T> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position() };
-
     // Compute reference
     const int filter_width  = weights.shape().x();
     const int filter_height = weights.shape().y();
@@ -108,8 +106,6 @@
             }
         }
     }
-
-    return dst;
 }
 
 template <>
@@ -195,6 +191,27 @@
     return dst;
 }
 
+template <typename T, typename TB>
+SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info)
+{
+    SimpleTensor<T> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position() };
+
+    if(src.data_layout() == DataLayout::NHWC && src.data_type() == DataType::F32)
+    {
+        SimpleTensor<T> src_nchw     = reference::permute<T>(src, PermutationVector(1U, 2U, 0U));
+        SimpleTensor<T> weights_nchw = reference::permute<T>(weights, PermutationVector(1U, 2U, 0U));
+        SimpleTensor<T> dst_nchw     = reference::permute<T>(dst, PermutationVector(1U, 2U, 0U));
+
+        depthwise_convolution_nchw<T, TB>(src_nchw, weights_nchw, biases, dst_nchw, conv_info);
+
+        return reference::permute<T>(dst_nchw, PermutationVector(2U, 0U, 1U));
+    }
+
+    depthwise_convolution_nchw<T, TB>(src, weights, biases, dst, conv_info);
+
+    return dst;
+}
+
 template SimpleTensor<float> depthwise_convolution(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &biases, const TensorShape &dst_shape,
                                                    const PadStrideInfo &conv_info);