COMPMID-926 Add depth multiplier support to NEON/CL/GLES depthwise convolution

Change-Id: I03f32c62350e5ea43e77bb15fc5a832d83719e3b
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/126657
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/tests/benchmark/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/benchmark/fixtures/DepthwiseConvolutionLayerFixture.h
index 9d29aee..9276431 100644
--- a/tests/benchmark/fixtures/DepthwiseConvolutionLayerFixture.h
+++ b/tests/benchmark/fixtures/DepthwiseConvolutionLayerFixture.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "tests/Globals.h"
 #include "tests/Utils.h"
 #include "tests/framework/Fixture.h"
@@ -36,14 +37,25 @@
 {
 namespace benchmark
 {
+using namespace arm_compute::misc::shape_calculator;
+
 /** Fixture that can be used for NEON and CL */
 template <typename TensorType, typename Function, typename Accessor>
 class DepthwiseConvolutionLayerFixture : public framework::Fixture
 {
 public:
     template <typename...>
-    void setup(TensorShape src_shape, TensorShape weights_shape, TensorShape dst_shape, PadStrideInfo info, DataType data_type, int batches)
+    void setup(TensorShape src_shape, Size2D kernel_size, PadStrideInfo info, DataType data_type, int batches)
     {
+        // Get shapes
+        TensorShape weights_shape(kernel_size.width, kernel_size.height);
+
+        const TensorInfo in_info(src_shape, 1, data_type);
+        const TensorInfo we_info(weights_shape, 1, data_type);
+        TensorShape      dst_shape = compute_depthwise_convolution_shape(in_info, we_info, info, 1);
+
+        weights_shape.set(2, dst_shape.z());
+
         // Set batched in source and destination shapes
         const unsigned int fixed_point_position = 4;
         src_shape.set(3 /* batch */, batches);
diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h
index 1e77a0c..4b6d85b 100644
--- a/tests/datasets/DepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h
@@ -38,17 +38,15 @@
 class DepthwiseConvolutionLayerDataset
 {
 public:
-    using type = std::tuple<TensorShape, TensorShape, TensorShape, PadStrideInfo>;
+    using type = std::tuple<TensorShape, Size2D, PadStrideInfo>;
 
     struct iterator
     {
         iterator(std::vector<TensorShape>::const_iterator   src_it,
-                 std::vector<TensorShape>::const_iterator   weights_it,
-                 std::vector<TensorShape>::const_iterator   dst_it,
+                 std::vector<Size2D>::const_iterator        weights_it,
                  std::vector<PadStrideInfo>::const_iterator infos_it)
             : _src_it{ std::move(src_it) },
               _weights_it{ std::move(weights_it) },
-              _dst_it{ std::move(dst_it) },
               _infos_it{ std::move(infos_it) }
         {
         }
@@ -58,21 +56,19 @@
             std::stringstream description;
             description << "In=" << *_src_it << ":";
             description << "Weights=" << *_weights_it << ":";
-            description << "Out=" << *_dst_it << ":";
             description << "Info=" << *_infos_it;
             return description.str();
         }
 
         DepthwiseConvolutionLayerDataset::type operator*() const
         {
-            return std::make_tuple(*_src_it, *_weights_it, *_dst_it, *_infos_it);
+            return std::make_tuple(*_src_it, *_weights_it, *_infos_it);
         }
 
         iterator &operator++()
         {
             ++_src_it;
             ++_weights_it;
-            ++_dst_it;
             ++_infos_it;
 
             return *this;
@@ -80,26 +76,24 @@
 
     private:
         std::vector<TensorShape>::const_iterator   _src_it;
-        std::vector<TensorShape>::const_iterator   _weights_it;
-        std::vector<TensorShape>::const_iterator   _dst_it;
+        std::vector<Size2D>::const_iterator        _weights_it;
         std::vector<PadStrideInfo>::const_iterator _infos_it;
     };
 
     iterator begin() const
     {
-        return iterator(_src_shapes.begin(), _weight_shapes.begin(), _dst_shapes.begin(), _infos.begin());
+        return iterator(_src_shapes.begin(), _weight_shapes.begin(), _infos.begin());
     }
 
     int size() const
     {
-        return std::min(_src_shapes.size(), std::min(_weight_shapes.size(), std::min(_dst_shapes.size(), _infos.size())));
+        return std::min(_src_shapes.size(), std::min(_weight_shapes.size(), _infos.size()));
     }
 
-    void add_config(TensorShape src, TensorShape weights, TensorShape dst, PadStrideInfo info)
+    void add_config(TensorShape src, Size2D weights, PadStrideInfo info)
     {
         _src_shapes.emplace_back(std::move(src));
         _weight_shapes.emplace_back(std::move(weights));
-        _dst_shapes.emplace_back(std::move(dst));
         _infos.emplace_back(std::move(info));
     }
 
@@ -109,8 +103,7 @@
 
 private:
     std::vector<TensorShape>   _src_shapes{};
-    std::vector<TensorShape>   _weight_shapes{};
-    std::vector<TensorShape>   _dst_shapes{};
+    std::vector<Size2D>        _weight_shapes{};
     std::vector<PadStrideInfo> _infos{};
 };
 
@@ -120,20 +113,20 @@
 public:
     SmallDepthwiseConvolutionLayerDataset()
     {
-        add_config(TensorShape(7U, 7U, 1U), TensorShape(3U, 3U, 1U), TensorShape(5U, 5U, 1U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(23U, 27U, 5U), TensorShape(3U, 5U, 5U), TensorShape(11U, 23U, 5U), PadStrideInfo(2, 1, 0, 0));
-        add_config(TensorShape(33U, 27U, 7U), TensorShape(7U, 3U, 7U), TensorShape(10U, 13U, 7U), PadStrideInfo(3, 2, 1, 0));
-        add_config(TensorShape(33U, 27U, 11U), TensorShape(3U, 3U, 11U), TensorShape(31U, 14U, 11U), PadStrideInfo(1, 2, 0, 1));
-        add_config(TensorShape(17U, 31U, 2U), TensorShape(5U, 9U, 2U), TensorShape(15U, 13U, 2U), PadStrideInfo(1, 2, 1, 1));
-        add_config(TensorShape(23U, 27U, 5U), TensorShape(11U, 3U, 5U), TensorShape(13U, 13U, 5U), PadStrideInfo(1, 2, 0, 0));
-        add_config(TensorShape(17U, 31U, 2U, 3U), TensorShape(5U, 9U, 2U), TensorShape(15U, 13U, 2U, 3U), PadStrideInfo(1, 2, 1, 1));
+        add_config(TensorShape(7U, 7U, 1U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(23U, 27U, 5U), Size2D(3U, 5U), PadStrideInfo(2, 1, 0, 0));
+        add_config(TensorShape(33U, 27U, 7U), Size2D(7U, 3U), PadStrideInfo(3, 2, 1, 0));
+        add_config(TensorShape(33U, 27U, 11U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 1));
+        add_config(TensorShape(17U, 31U, 2U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1));
+        add_config(TensorShape(23U, 27U, 5U), Size2D(11U, 3U), PadStrideInfo(1, 2, 0, 0));
+        add_config(TensorShape(17U, 31U, 2U, 3U), Size2D(5U, 9U), PadStrideInfo(1, 2, 1, 1));
         // Asymmetric padding
-        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(11U, 12U, 7U), PadStrideInfo(3, 2, 1, 1, 2, 0, DimensionRoundingType::FLOOR));
-        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(11U, 12U, 7U), PadStrideInfo(3, 2, 1, 1, 0, 2, DimensionRoundingType::FLOOR));
-        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(11U, 12U, 7U), PadStrideInfo(3, 2, 2, 1, 2, 0, DimensionRoundingType::FLOOR));
-        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(11U, 12U, 7U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR));
-        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(10U, 11U, 7U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR));
-        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(10U, 11U, 7U), PadStrideInfo(3, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 1, 2, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 1, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 2, 1, 2, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), Size2D(5U, 7U), PadStrideInfo(3, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
     }
 };
 
@@ -143,12 +136,12 @@
 public:
     LargeDepthwiseConvolutionLayerDataset()
     {
-        add_config(TensorShape(233U, 277U, 55U), TensorShape(3U, 3U, 55U), TensorShape(116U, 275U, 55U), PadStrideInfo(2, 1, 0, 0));
-        add_config(TensorShape(333U, 277U, 77U), TensorShape(3U, 3U, 77U), TensorShape(111U, 138U, 77U), PadStrideInfo(3, 2, 1, 0));
-        add_config(TensorShape(177U, 311U, 22U), TensorShape(3U, 3U, 22U), TensorShape(177U, 156U, 22U), PadStrideInfo(1, 2, 1, 1));
-        add_config(TensorShape(233U, 277U, 55U), TensorShape(3U, 3U, 55U), TensorShape(231U, 138U, 55U), PadStrideInfo(1, 2, 0, 0));
-        add_config(TensorShape(333U, 277U, 77U), TensorShape(3U, 3U, 77U), TensorShape(166U, 93U, 77U), PadStrideInfo(2, 3, 0, 1));
-        add_config(TensorShape(177U, 311U, 22U), TensorShape(3U, 3U, 22U), TensorShape(89U, 311U, 22U), PadStrideInfo(2, 1, 1, 1));
+        add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 0));
+        add_config(TensorShape(333U, 277U, 77U), Size2D(3U, 3U), PadStrideInfo(3, 2, 1, 0));
+        add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1));
+        add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0));
+        add_config(TensorShape(333U, 277U, 77U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 1));
+        add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1));
     }
 };
 
@@ -158,12 +151,12 @@
 public:
     SmallDepthwiseConvolutionLayerDataset3x3()
     {
-        add_config(TensorShape(3U, 3U, 2U), TensorShape(3U, 3U, 2U), TensorShape(1U, 1U, 2U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(7U, 7U, 3U, 2U), TensorShape(3U, 3U, 3U), TensorShape(5U, 5U, 3U, 2U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(21U, 31U, 9U, 4U), TensorShape(3U, 3U, 9U), TensorShape(21U, 15U, 9U, 4U), PadStrideInfo(1, 2, 1, 0));
-        add_config(TensorShape(33U, 27U, 11U, 3U), TensorShape(3U, 3U, 11U), TensorShape(31U, 14U, 11U, 3U), PadStrideInfo(1, 2, 0, 1));
+        add_config(TensorShape(3U, 3U, 2U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(7U, 7U, 3U, 2U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 0));
+        add_config(TensorShape(33U, 27U, 11U, 3U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 1));
         // Asymmetric padding
-        add_config(TensorShape(33U, 27U, 11U), TensorShape(3U, 3U, 11U), TensorShape(16U, 13U, 11U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 11U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
     }
 };
 
@@ -172,9 +165,9 @@
 public:
     SmallDepthwiseConvolutionLayerDataset3x3NCHW()
     {
-        add_config(TensorShape(33U, 27U, 11U), TensorShape(3U, 3U, 11U), TensorShape(11U, 14U, 11U), PadStrideInfo(3, 2, 1, 1));
+        add_config(TensorShape(33U, 27U, 11U), Size2D(3U, 3U), PadStrideInfo(3, 2, 1, 1));
         // Asymmetric padding
-        add_config(TensorShape(33U, 27U, 11U), TensorShape(3U, 3U, 11U), TensorShape(18U, 14U, 11U), PadStrideInfo(2, 2, 3, 1, 2, 1, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 11U), Size2D(3U, 3U), PadStrideInfo(2, 2, 3, 1, 2, 1, DimensionRoundingType::FLOOR));
     }
 };
 
@@ -184,11 +177,11 @@
 public:
     LargeDepthwiseConvolutionLayerDataset3x3()
     {
-        add_config(TensorShape(233U, 277U, 55U, 3U), TensorShape(3U, 3U, 55U), TensorShape(116U, 275U, 55U, 3U), PadStrideInfo(2, 1, 0, 0));
-        add_config(TensorShape(177U, 311U, 22U), TensorShape(3U, 3U, 22U), TensorShape(177U, 156U, 22U), PadStrideInfo(1, 2, 1, 1));
-        add_config(TensorShape(233U, 277U, 55U), TensorShape(3U, 3U, 55U), TensorShape(231U, 138U, 55U), PadStrideInfo(1, 2, 0, 0));
-        add_config(TensorShape(333U, 277U, 77U, 5U), TensorShape(3U, 3U, 77U), TensorShape(166U, 93U, 77U, 5U), PadStrideInfo(2, 3, 0, 1));
-        add_config(TensorShape(177U, 311U, 22U), TensorShape(3U, 3U, 22U), TensorShape(89U, 311U, 22U), PadStrideInfo(2, 1, 1, 1));
+        add_config(TensorShape(233U, 277U, 55U, 3U), Size2D(3U, 3U), PadStrideInfo(2, 1, 0, 0));
+        add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(1, 2, 1, 1));
+        add_config(TensorShape(233U, 277U, 55U), Size2D(3U, 3U), PadStrideInfo(1, 2, 0, 0));
+        add_config(TensorShape(333U, 277U, 77U, 5U), Size2D(3U, 3U), PadStrideInfo(2, 3, 0, 1));
+        add_config(TensorShape(177U, 311U, 22U), Size2D(3U, 3U), PadStrideInfo(2, 1, 1, 1));
     }
 };
 
@@ -199,16 +192,16 @@
     OptimizedDepthwiseConvolutionLayerDataset3x3()
     {
         // Stride 1
-        add_config(TensorShape(7U, 7U, 16U), TensorShape(3U, 3U, 16U), TensorShape(5U, 5U, 16U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
-        add_config(TensorShape(7U, 7U, 16U), TensorShape(3U, 3U, 16U), TensorShape(7U, 7U, 16U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
-        add_config(TensorShape(28U, 28U, 16U), TensorShape(3U, 3U, 16U), TensorShape(26U, 26U, 16U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
-        add_config(TensorShape(28U, 28U, 16U), TensorShape(3U, 3U, 16U), TensorShape(28U, 28U, 16U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
+        add_config(TensorShape(7U, 7U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
+        add_config(TensorShape(7U, 7U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
+        add_config(TensorShape(28U, 28U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
+        add_config(TensorShape(28U, 28U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
         // Stride 2
-        add_config(TensorShape(7U, 7U, 32U), TensorShape(3U, 3U, 32U), TensorShape(3U, 3U, 32U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL));
-        add_config(TensorShape(7U, 7U, 32U), TensorShape(3U, 3U, 32U), TensorShape(4U, 4U, 32U), PadStrideInfo(2, 2, 1, 1, 1, 1, DimensionRoundingType::CEIL));
-        add_config(TensorShape(8U, 8U, 32U), TensorShape(3U, 3U, 32U), TensorShape(3U, 3U, 32U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL));
-        add_config(TensorShape(8U, 8U, 32U), TensorShape(3U, 3U, 32U), TensorShape(4U, 4U, 32U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL));
-        add_config(TensorShape(64U, 64U, 128U), TensorShape(3U, 3U, 128U), TensorShape(32U, 32U, 128U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL));
+        add_config(TensorShape(7U, 7U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL));
+        add_config(TensorShape(7U, 7U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, 1, 1, DimensionRoundingType::CEIL));
+        add_config(TensorShape(8U, 8U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(8U, 8U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL));
+        add_config(TensorShape(64U, 64U, 128U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL));
     }
 };
 } // namespace datasets
diff --git a/tests/datasets/system_tests/mobilenet/MobileNetDepthwiseConvolutionLayerDataset.h b/tests/datasets/system_tests/mobilenet/MobileNetDepthwiseConvolutionLayerDataset.h
index c4c199a..25ac1c1 100644
--- a/tests/datasets/system_tests/mobilenet/MobileNetDepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/system_tests/mobilenet/MobileNetDepthwiseConvolutionLayerDataset.h
@@ -42,15 +42,15 @@
 public:
     MobileNetDepthwiseConvolutionLayerDataset()
     {
-        add_config(TensorShape(7U, 7U, 1024U), TensorShape(3U, 3U, 1024U), TensorShape(7U, 7U, 1024U), PadStrideInfo(1, 1, 1, 1));
-        add_config(TensorShape(14U, 14U, 512U), TensorShape(3U, 3U, 512U), TensorShape(7U, 7U, 512U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
-        add_config(TensorShape(14U, 14U, 512U), TensorShape(3U, 3U, 512U), TensorShape(14U, 14U, 512U), PadStrideInfo(1, 1, 1, 1));
-        add_config(TensorShape(28U, 28U, 256U), TensorShape(3U, 3U, 256U), TensorShape(14U, 14U, 256U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
-        add_config(TensorShape(28U, 28U, 256U), TensorShape(3U, 3U, 256U), TensorShape(28U, 28U, 256U), PadStrideInfo(1, 1, 1, 1));
-        add_config(TensorShape(56U, 56U, 128U), TensorShape(3U, 3U, 128U), TensorShape(28U, 28U, 128U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
-        add_config(TensorShape(56U, 56U, 128U), TensorShape(3U, 3U, 128U), TensorShape(56U, 56U, 128U), PadStrideInfo(1, 1, 1, 1));
-        add_config(TensorShape(112U, 112U, 64U), TensorShape(3U, 3U, 64U), TensorShape(56U, 56U, 64U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
-        add_config(TensorShape(112U, 112U, 32U), TensorShape(3U, 3U, 32U), TensorShape(112U, 112U, 32U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(7U, 7U, 1024U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(14U, 14U, 512U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(14U, 14U, 512U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(28U, 28U, 256U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(28U, 28U, 256U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(56U, 56U, 128U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(56U, 56U, 128U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(112U, 112U, 64U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(112U, 112U, 32U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1));
     }
 };
 } // namespace datasets
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index 450bb21..ad7a5d8 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -45,6 +45,8 @@
 RelativeTolerance<half_float::half>  tolerance_f16(half_float::half(0.001)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr RelativeTolerance<float>   tolerance_f32(0.01f);                   /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);                   /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+
+const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 3 });
 } // namespace
 
 TEST_SUITE(CL)
@@ -54,14 +56,15 @@
 using CLDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T>;
 
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers),
                                                                                                                  framework::dataset::make("DataType",
                                                                                                                          DataType::F32)),
                                                                                                                  framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                                                                                     depth_multipliers),
                                                                                                                      framework::dataset::make("DataType",
                                                                                                                              DataType::F32)),
                                                                                                                      framework::dataset::make("DataLayout", DataLayout::NCHW)))
@@ -77,15 +80,17 @@
 TEST_SUITE(F16)
 TEST_SUITE(W3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL,
-                       combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                  datasets::SmallDepthwiseConvolutionLayerDataset3x3NCHW()),
+                       combine(combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                                          datasets::SmallDepthwiseConvolutionLayerDataset3x3NCHW()),
+                                               depth_multipliers),
                                        framework::dataset::make("DataType",
                                                                 DataType::F16)),
                                framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                       depth_multipliers),
                                                                                                                        framework::dataset::make("DataType",
                                                                                                                                DataType::F16)),
                                                                                                                        framework::dataset::make("DataLayout", DataLayout::NCHW)))
@@ -98,15 +103,17 @@
 TEST_SUITE(FP32)
 TEST_SUITE(W3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL,
-                       combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                  datasets::SmallDepthwiseConvolutionLayerDataset3x3NCHW()),
+                       combine(combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                                          datasets::SmallDepthwiseConvolutionLayerDataset3x3NCHW()),
+                                               depth_multipliers),
                                        framework::dataset::make("DataType",
                                                                 DataType::F32)),
                                framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                        depth_multipliers),
                                                                                                                         framework::dataset::make("DataType",
                                                                                                                                 DataType::F32)),
                                                                                                                         framework::dataset::make("DataLayout", DataLayout::NCHW)))
@@ -125,34 +132,41 @@
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                       framework::dataset::make("DataLayout", DataLayout::NCHW)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                                                       depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
-                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                       framework::dataset::make("DataLayout", DataLayout::NCHW)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                       depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END()
 TEST_SUITE(W3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                       combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                       framework::dataset::make("DepthMultiplier", 1)), // COMPMID-1071 Add depth multiplier support for NHWC
                                                framework::dataset::make("DataType", DataType::QASYMM8)),
                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                       framework::dataset::make("DepthMultiplier", 1)), // COMPMID-1071 Add depth multiplier support for NHWC
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
diff --git a/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp b/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
index 2baa93e..22b1e08 100644
--- a/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
@@ -44,6 +44,8 @@
 {
 RelativeTolerance<half> tolerance_fp16(half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr float         tolerance_num = 0.07f;     /**< Tolerance number */
+
+const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 3 });
 } // namespace
 
 TEST_SUITE(GC)
@@ -55,14 +57,16 @@
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                   depth_multipliers),
                                                                                                                    framework::dataset::make("DataType",
                                                                                                                            DataType::F16)),
                                                                                                                    framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(GCAccessor(_target), _reference, tolerance_fp16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                       depth_multipliers),
                                                                                                                        framework::dataset::make("DataType",
                                                                                                                                DataType::F16)),
                                                                                                                        framework::dataset::make("DataLayout", DataLayout::NCHW)))
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 236d4bd..b1cc491 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
@@ -40,20 +41,34 @@
 {
 namespace validation
 {
+using namespace arm_compute::misc::shape_calculator;
+
 namespace
 {
 constexpr RelativeTolerance<float>   tolerance_f32(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+
+const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 3 });
 } // namespace
 
 TEST_SUITE(NEON)
 TEST_SUITE(DepthwiseConvLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                                              datasets::LargeDepthwiseConvolutionLayerDataset3x3()),
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                      datasets::LargeDepthwiseConvolutionLayerDataset3x3()),
+                                                                           depth_multipliers),
                                                                    framework::dataset::make("DataType", DataType::F32)),
-               input_shape, weights_shape, output_shape, info, data_type)
+               input_shape, kernel_size, info, depth_multiplier, data_type)
 {
+    // Get shapes
+    TensorShape weights_shape(kernel_size.width, kernel_size.height);
+
+    const TensorInfo  in_info(input_shape, 1, data_type);
+    const TensorInfo  we_info(weights_shape, 1, data_type);
+    const TensorShape output_shape = compute_depthwise_convolution_shape(in_info, we_info, info, depth_multiplier);
+
+    weights_shape.set(2, output_shape.z());
+
     // Create tensors
     Tensor            src     = create_tensor<Tensor>(input_shape, data_type);
     Tensor            dst     = create_tensor<Tensor>(output_shape, data_type);
@@ -68,7 +83,7 @@
 
     // Create and configure function
     NEDepthwiseConvolutionLayer3x3 depthwise_layer;
-    depthwise_layer.configure(&src, &weights, &bias, &dst, info);
+    depthwise_layer.configure(&src, &weights, &bias, &dst, info, depth_multiplier);
 
     // Validate valid region
     const ValidRegion input_valid_region   = shape_to_valid_region(input_shape);
@@ -82,7 +97,7 @@
     validate(bias.info()->valid_region(), bias_valid_region);
 
     // Validate padding
-    bool              is_optimized_run = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input_shape, info, data_type, DataLayout::NCHW);
+    bool              is_optimized_run = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input_shape, info, data_type, depth_multiplier, DataLayout::NCHW);
     const int         step_non_opt_dwc = 16 >> info.stride().first;
     const int         step_bias_add    = 16 / src.info()->element_size();
     const int         step             = is_optimized_run ? step_bias_add : std::max(step_non_opt_dwc, step_bias_add);
@@ -95,14 +110,16 @@
 TEST_SUITE(Generic)
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                                                                                                                       depth_multipliers),
                                                                                                                        framework::dataset::make("DataType",
                                                                                                                                DataType::F32)),
                                                                                                                        framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                                                                                     depth_multipliers),
                                                                                                                      framework::dataset::make("DataType",
                                                                                                                              DataType::F32)),
                                                                                                                      framework::dataset::make("DataLayout", DataLayout::NCHW)))
@@ -114,21 +131,24 @@
 TEST_SUITE(W3x3)
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer3x3, T>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                    depth_multipliers),
                                                                                                                     framework::dataset::make("DataType",
                                                                                                                             DataType::F32)),
                                                                                                                     framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                        depth_multipliers),
                                                                                                                         framework::dataset::make("DataType",
                                                                                                                                 DataType::F32)),
                                                                                                                         framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunOptimized, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(datasets::OptimizedDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunOptimized, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::OptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                        framework::dataset::make("DepthMultiplier", 1)),
                                                                                                                         framework::dataset::make("DataType",
                                                                                                                                 DataType::F32)),
                                                                                                                         framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
@@ -148,27 +168,31 @@
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                       framework::dataset::make("DataLayout", DataLayout::NCHW)))
-{
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
-}
-TEST_SUITE_END()
-TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                                                       depth_multipliers),
                                                framework::dataset::make("DataType", DataType::QASYMM8)),
                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
                                framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                       framework::dataset::make("DataLayout", DataLayout::NCHW)))
+TEST_SUITE_END()
+TEST_SUITE(W3x3)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", DataLayout::NCHW)))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                       depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
index bb756f8..b7bca8d 100644
--- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/IAccessor.h"
@@ -44,6 +45,8 @@
 {
 namespace validation
 {
+using namespace arm_compute::misc::shape_calculator;
+
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
 class DepthwiseConvolutionLayerValidationGenericFixture : public framework::Fixture
 {
@@ -52,12 +55,20 @@
 
 public:
     template <typename...>
-    void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, QuantizationInfo quantization_info, DataLayout data_layout)
+    void setup(TensorShape in_shape, Size2D kernel_size, PadStrideInfo pad_stride_info, unsigned int depth_multiplier, DataType data_type, QuantizationInfo quantization_info, DataLayout data_layout)
     {
-        _quantization_info = quantization_info;
-        _data_type         = data_type;
+        _quantization_info            = quantization_info;
+        _data_type                    = data_type;
+        const DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
+
+        TensorShape weights_shape(kernel_size.width, kernel_size.height);
+
+        const TensorInfo in_info(in_shape, 1, data_type);
+        const TensorInfo we_info(weights_shape, 1, data_type);
+        TensorShape      out_shape = compute_depthwise_convolution_shape(in_info, we_info, pad_stride_info, depth_multiplier);
+
+        weights_shape.set(2, out_shape.z());
         const TensorShape biases_shape(weights_shape[2]);
-        const DataType    bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
 
         if(data_layout == DataLayout::NHWC)
         {
@@ -66,8 +77,8 @@
             permute(out_shape, PermutationVector(2U, 0U, 1U));
         }
 
-        _target    = compute_target(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, data_type, bias_data_type, quantization_info, data_layout);
-        _reference = compute_reference(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, data_type, bias_data_type, quantization_info, data_layout);
+        _target    = compute_target(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, depth_multiplier, data_type, bias_data_type, quantization_info, data_layout);
+        _reference = compute_reference(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, depth_multiplier, data_type, bias_data_type, quantization_info, data_layout);
     }
 
 protected:
@@ -101,6 +112,7 @@
     }
 
     TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &biases_shape, const TensorShape &output_shape, PadStrideInfo &pad_stride_info,
+                              unsigned int   depth_multiplier,
                               const DataType data_type, const DataType bias_data_type, const QuantizationInfo quantization_info, const DataLayout data_layout)
     {
         // Create tensors
@@ -111,7 +123,7 @@
 
         // Create Depthwise Convolution configure function
         FunctionType dwc;
-        dwc.configure(&src, &weights, &biases, &dst, pad_stride_info);
+        dwc.configure(&src, &weights, &biases, &dst, pad_stride_info, depth_multiplier);
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -141,6 +153,7 @@
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &in_shape, const TensorShape &weights_shape, const TensorShape &biases_shape, const TensorShape &out_shape, const PadStrideInfo &pad_stride_info,
+                                      unsigned int   depth_multiplier,
                                       const DataType data_type, const DataType bias_data_type, const QuantizationInfo quantization_info, const DataLayout data_layout)
     {
         SimpleTensor<T>     src{ in_shape, data_type, 1, 0, quantization_info, data_layout };
@@ -151,7 +164,7 @@
         fill(weights, 1);
         fill(biases, 2);
 
-        return reference::depthwise_convolution(src, weights, biases, out_shape, pad_stride_info);
+        return reference::depthwise_convolution(src, weights, biases, out_shape, pad_stride_info, depth_multiplier);
     }
 
     TensorType       _target{};
@@ -165,9 +178,9 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, DataLayout data_layout)
+    void setup(TensorShape in_shape, Size2D kernel_size, PadStrideInfo pad_stride_info, unsigned int depth_multiplier, DataType data_type, DataLayout data_layout)
     {
-        DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(in_shape, weights_shape, out_shape, pad_stride_info,
+        DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(in_shape, kernel_size, pad_stride_info, depth_multiplier,
                                                                                                             data_type, QuantizationInfo(), data_layout);
     }
 };
@@ -177,9 +190,9 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, QuantizationInfo quantization_info, DataLayout data_layout)
+    void setup(TensorShape in_shape, Size2D kernel_size, PadStrideInfo pad_stride_info, unsigned int depth_multiplier, DataType data_type, QuantizationInfo quantization_info, DataLayout data_layout)
     {
-        DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(in_shape, weights_shape, out_shape, pad_stride_info,
+        DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(in_shape, kernel_size, pad_stride_info, depth_multiplier,
                                                                                                             data_type, quantization_info, data_layout);
     }
 };
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.cpp b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
index d05da91..207e5fc 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
@@ -51,7 +51,8 @@
  *
  */
 template <typename T, typename TB>
-void depthwise_convolution_nchw(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, SimpleTensor<T> &dst, const PadStrideInfo &conv_info)
+void depthwise_convolution_nchw(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, SimpleTensor<T> &dst, const PadStrideInfo &conv_info,
+                                unsigned int depth_multiplier)
 {
     // Compute reference
     const int filter_width  = weights.shape().x();
@@ -75,40 +76,47 @@
     const int maximum_x = input_width + pad_left - filter_half_width + pad_right - filter_half_width;
     const int maximum_y = input_height + pad_top - filter_half_height + pad_bottom - filter_half_height;
 
+    const T border_value(0);
+
     int out_pos = 0;
     for(int r = 0; r < num_batches; ++r)
     {
         for(int z = 0; z < input_depth; ++z)
         {
-            for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
+            for(unsigned int m = 0; m < depth_multiplier; ++m)
             {
-                for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
-                {
-                    Coordinates coords(static_cast<int>(x), static_cast<int>(y), static_cast<int>(z), static_cast<int>(r));
-                    size_t      filter_offset = filter_plane * z;
+                const int out_z = z * depth_multiplier + m;
 
-                    T val(0);
-                    for(int j = y - filter_half_height; j <= static_cast<int>(y + filter_half_height); ++j)
+                for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
+                {
+                    for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
                     {
-                        for(int i = x - filter_half_width; i <= static_cast<int>(x + filter_half_width); ++i)
+                        Coordinates coords(static_cast<int>(x), static_cast<int>(y), static_cast<int>(z), static_cast<int>(r));
+                        size_t      filter_offset = filter_plane * out_z;
+
+                        T val(0);
+                        for(int j = y - filter_half_height; j <= static_cast<int>(y + filter_half_height); ++j)
                         {
-                            coords.set(0, i);
-                            coords.set(1, j);
-                            T border_value(0);
-                            val += *(weights.data() + filter_offset) * tensor_elem_at(src, coords, BorderMode::CONSTANT, border_value);
-                            ++filter_offset;
+                            for(int i = x - filter_half_width; i <= static_cast<int>(x + filter_half_width); ++i)
+                            {
+                                coords.set(0, i);
+                                coords.set(1, j);
+
+                                val += *(weights.data() + filter_offset) * tensor_elem_at(src, coords, BorderMode::CONSTANT, border_value);
+                                ++filter_offset;
+                            }
                         }
+
+                        dst[out_pos++] = saturate_cast<T>(val + *static_cast<const TB *>(biases(Coordinates(out_z))));
                     }
-                    coords.set(0, x);
-                    coords.set(1, y);
-                    dst[out_pos++] = saturate_cast<T>(val + *static_cast<const TB *>(biases(Coordinates(z))));
                 }
             }
         }
     }
 }
 
-void depthwise_convolution_nchw(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &biases, SimpleTensor<uint8_t> &dst, const PadStrideInfo &conv_info)
+void depthwise_convolution_nchw(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &biases, SimpleTensor<uint8_t> &dst, const PadStrideInfo &conv_info,
+                                unsigned int depth_multiplier)
 {
     // Create reference
     const int   input_offset   = -src.quantization_info().offset;
@@ -150,89 +158,76 @@
     {
         for(int z = 0; z < input_depth; ++z)
         {
-            int32_t bias_val = *static_cast<const int32_t *>(biases(Coordinates(z)));
-            for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
+            for(unsigned int m = 0; m < depth_multiplier; ++m)
             {
-                for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
+                const int     out_z    = z * depth_multiplier + m;
+                const int32_t bias_val = *static_cast<const int32_t *>(biases(Coordinates(out_z)));
+
+                for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
                 {
-                    Coordinates coords(x, y, z, r);
-                    int         filter_offset = filter_plane * z;
-
-                    int32_t val = 0;
-                    for(int j = y - filter_half_height; j <= (y + filter_half_height); ++j)
+                    for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
                     {
-                        for(int i = x - filter_half_width; i <= (x + filter_half_width); ++i)
-                        {
-                            coords.set(0, i);
-                            coords.set(1, j);
-                            const auto    in_val = tensor_elem_at<uint8_t>(src, coords, BorderMode::CONSTANT, -input_offset);
-                            const uint8_t w_val  = *(weights.data() + filter_offset);
-                            val += (in_val + input_offset) * (w_val + weights_offset);
-                            ++filter_offset;
-                        }
-                    }
-                    val += bias_val;
-                    val = asymm_rounding_divide_by_pow2(asymm_int_mult(val, output_multiplier), output_shift);
-                    val += output_offset;
-                    val = std::max<int32_t>(val, 0);
-                    val = std::min<int32_t>(val, 255);
+                        Coordinates coords(x, y, z, r);
+                        int         filter_offset = filter_plane * out_z;
 
-                    // Store the result
-                    dst[out_pos++] = val;
+                        int32_t val = 0;
+                        for(int j = y - filter_half_height; j <= (y + filter_half_height); ++j)
+                        {
+                            for(int i = x - filter_half_width; i <= (x + filter_half_width); ++i)
+                            {
+                                coords.set(0, i);
+                                coords.set(1, j);
+                                const auto    in_val = tensor_elem_at<uint8_t>(src, coords, BorderMode::CONSTANT, -input_offset);
+                                const uint8_t w_val  = *(weights.data() + filter_offset);
+                                val += (in_val + input_offset) * (w_val + weights_offset);
+                                ++filter_offset;
+                            }
+                        }
+                        val += bias_val;
+                        val = asymm_rounding_divide_by_pow2(asymm_int_mult(val, output_multiplier), output_shift);
+                        val += output_offset;
+                        val = std::max<int32_t>(val, 0);
+                        val = std::min<int32_t>(val, 255);
+
+                        // Store the result
+                        dst[out_pos++] = val;
+                    }
                 }
             }
         }
     }
 }
 
-template <>
-SimpleTensor<uint8_t> depthwise_convolution(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &biases, const TensorShape &dst_shape,
-                                            const PadStrideInfo &conv_info)
+template <typename T, typename TB>
+SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info,
+                                      unsigned int depth_multiplier)
 {
-    SimpleTensor<uint8_t> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
+    SimpleTensor<T> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
 
     if(src.data_layout() == DataLayout::NHWC)
     {
-        SimpleTensor<uint8_t> src_nchw     = reference::permute<uint8_t>(src, PermutationVector(1U, 2U, 0U));
-        SimpleTensor<uint8_t> weights_nchw = reference::permute<uint8_t>(weights, PermutationVector(1U, 2U, 0U));
-        SimpleTensor<uint8_t> dst_nchw     = reference::permute<uint8_t>(dst, PermutationVector(1U, 2U, 0U));
-
-        depthwise_convolution_nchw(src_nchw, weights_nchw, biases, dst_nchw, conv_info);
-
-        return reference::permute<uint8_t>(dst_nchw, PermutationVector(2U, 0U, 1U));
-    }
-
-    depthwise_convolution_nchw(src, weights, biases, dst, conv_info);
-
-    return dst;
-}
-
-template <typename T, typename TB>
-SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info)
-{
-    SimpleTensor<T> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position() };
-
-    if(src.data_layout() == DataLayout::NHWC && src.data_type() == DataType::F32)
-    {
         SimpleTensor<T> src_nchw     = reference::permute<T>(src, PermutationVector(1U, 2U, 0U));
         SimpleTensor<T> weights_nchw = reference::permute<T>(weights, PermutationVector(1U, 2U, 0U));
         SimpleTensor<T> dst_nchw     = reference::permute<T>(dst, PermutationVector(1U, 2U, 0U));
 
-        depthwise_convolution_nchw<T, TB>(src_nchw, weights_nchw, biases, dst_nchw, conv_info);
+        depthwise_convolution_nchw(src_nchw, weights_nchw, biases, dst_nchw, conv_info, depth_multiplier);
 
         return reference::permute<T>(dst_nchw, PermutationVector(2U, 0U, 1U));
     }
 
-    depthwise_convolution_nchw<T, TB>(src, weights, biases, dst, conv_info);
+    depthwise_convolution_nchw(src, weights, biases, dst, conv_info, depth_multiplier);
 
     return dst;
 }
 
+template SimpleTensor<uint8_t> depthwise_convolution(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &biases, const TensorShape &dst_shape,
+                                                     const PadStrideInfo &conv_info, unsigned int depth_multiplier);
+
 template SimpleTensor<float> depthwise_convolution(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &biases, const TensorShape &dst_shape,
-                                                   const PadStrideInfo &conv_info);
+                                                   const PadStrideInfo &conv_info, unsigned int depth_multiplier);
 
 template SimpleTensor<half> depthwise_convolution(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &biases, const TensorShape &dst_shape,
-                                                  const PadStrideInfo &conv_info);
+                                                  const PadStrideInfo &conv_info, unsigned int depth_multiplier);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.h b/tests/validation/reference/DepthwiseConvolutionLayer.h
index df743a5..bab3387 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.h
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,8 @@
 namespace reference
 {
 template <typename T, typename TB>
-SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info);
+SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info,
+                                      unsigned int depth_multiplier);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp b/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp
index ca6c168..8bc6ddb 100644
--- a/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,7 @@
                                                       const SimpleTensor<T> &pointwise_biases, const TensorShape &dst_shape, const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
 {
     // Compute reference
-    SimpleTensor<T> depthwise_out = depthwise_convolution(src, depthwise_weights, depthwise_biases, depthwise_out_shape, depthwise_conv_info);
+    SimpleTensor<T> depthwise_out = depthwise_convolution(src, depthwise_weights, depthwise_biases, depthwise_out_shape, depthwise_conv_info, 1);
     SimpleTensor<T> dst           = convolution_layer(depthwise_out, pointwise_weights, pointwise_biases, dst_shape, pointwise_conv_info);
 
     return dst;