Add support for dilation > 1 in assembly DepthwiseConvolution

* Resolve COMPMID-5689

Change-Id: I81a3791ad054db59562b76d1c729f2b2168aee8b
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Signed-off-by: Andrew Mundy <andrew.mundy@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8919
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
index e02998f..c305835 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -79,6 +79,7 @@
 
   /* Compute a portion of the output tensor with padding. */
   virtual void compute_tile_padded(
+    const DepthwiseArgs &args,
     unsigned int output_i, unsigned int output_j,
     unsigned int output_channel_start, unsigned int output_channel_end,
     const TensorSpec<const TInput *> &input,
@@ -93,6 +94,7 @@
    * variant.
    */
   virtual void compute_row_padded_tile_row(
+    const DepthwiseArgs &args,
     const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
     const unsigned int output_channel_start, const unsigned int output_channel_end,
     const TensorSpec<const TInput *> &input,
@@ -104,6 +106,7 @@
     for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
     {
       this->compute_tile_padded(
+        args,
         output_i, output_j, output_channel_start, output_channel_end,
         input, output, parameters, working_space
       );
@@ -116,6 +119,7 @@
    * variant.
    */
   virtual void compute_tiles_unpadded(
+    const DepthwiseArgs &args,
     unsigned int start_output_i, unsigned int start_output_j,
     unsigned int n_tile_rows, unsigned int n_tile_cols,
     unsigned int output_channel_start, unsigned int output_channel_end,
@@ -131,6 +135,7 @@
       for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
       {
         this->compute_tile_padded(
+            args,
             start_output_i, row_start_output_j,
             output_channel_start, output_channel_end,
             input, output, parameters, working_space
@@ -142,18 +147,12 @@
   }
 
   void execute_internal(
-    unsigned int n_batches,
-    unsigned int input_height,
-    unsigned int input_width,
-    unsigned int n_input_channels,
-    const PaddingValues &padding,
+    const DepthwiseArgs &args,
     const void *input,
     size_t ld_input_col,
     size_t ld_input_row,
     size_t ld_input_batch,
     const void *parameters,
-    unsigned int output_height,
-    unsigned int output_width,
     void *output,
     size_t ld_output_col,
     size_t ld_output_row,
@@ -165,40 +164,40 @@
   {
     // Get and initialise the working space for this thread.
     void *thread_working_space =
-      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(n_input_channels);
-    this->initialise_working_space(thread_working_space, n_input_channels);
+      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(args.input_channels);
+    this->initialise_working_space(thread_working_space, args.input_channels);
 
     // Construct convenient representations of the input/output tensors.
     TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
     TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);
 
-    const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier;
+    const auto n_output_channels = args.input_channels * args.channel_multiplier;
 
-    for (unsigned int batch = 0; batch < n_batches; batch++)
+    for (unsigned int batch = 0; batch < args.n_batches; batch++)
     {
       // Iterate over rows of the output tensor; we stripe over the tiles.
       for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
-           start_output_i < output_height;
+           start_output_i < args.output_rows;
            start_output_i += n_threads * m_strat->get_output_rows())
       {
         // Determine what (if any padding) is required on the top/bottom of
         // this row of the convolution.
         const auto end_output_i = start_output_i + m_strat->get_output_rows();
-        const bool pad_output_bottom = output_height < end_output_i;
+        const bool pad_output_bottom = args.output_rows < end_output_i;
 
-        const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top;
+        const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
         const bool pad_input_top = start_input_i < 0;
         const int end_input_i = start_input_i + m_strat->get_input_rows();
-        const bool pad_input_bottom = static_cast<int>(input_height) < end_input_i;
+        const bool pad_input_bottom = static_cast<int>(args.input_rows) < end_input_i;
         const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom;
 
         // Iterate over the columns of the output tensor; we attempt to grab as
         // much as possible of the unpadded regions, so the loop structure is a
         // bit odd.
         unsigned int start_output_j = 0;
-        while (start_output_j < output_width)
+        while (start_output_j < args.output_cols)
         {
-          const int start_in_j = start_output_j * this->m_args.stride_cols - padding.left;
+          const int start_in_j = start_output_j * args.stride_cols - args.padding.left;
           const bool pad_input_left = start_in_j < 0;
 
           // Determine if we can process a number of unpadded tiles in one go.
@@ -206,16 +205,16 @@
           if (!pad_input_left)
           {
             // Determine the maximum number of tiles we could handle.
-            n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols();
+            n_unpadded_tiles = (args.output_cols - start_output_j) / m_strat->get_output_cols();
 
             // Handle padding on the right hand edge
-            const int tile_stride = m_strat->get_output_cols() * this->m_args.stride_cols;
+            const int tile_stride = m_strat->get_output_cols() * args.stride_cols;
             int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
             int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;
 
             while (n_unpadded_tiles > 0 &&
-                   (static_cast<int>(output_width) < end_output_j ||
-                    static_cast<int>(input_width) < end_input_j))
+                   (static_cast<int>(args.output_cols) < end_output_j ||
+                    static_cast<int>(args.input_cols) < end_input_j))
             {
               n_unpadded_tiles--;
               end_output_j -= m_strat->get_output_cols();
@@ -230,6 +229,7 @@
             {
               // Completely unpadded execution
               this->compute_tiles_unpadded(
+                args,
                 start_output_i, start_output_j,
                 1, n_unpadded_tiles,  // Compute a row of unpadded tiles
                 0, n_output_channels,  // Compute all channels
@@ -240,6 +240,7 @@
             {
               // Top/bottom padding only
               this->compute_row_padded_tile_row(
+                args,
                 start_output_i, start_output_j, n_unpadded_tiles,
                 0, n_output_channels,  // Compute all channels
                 input_tensor, output_tensor, parameters, thread_working_space
@@ -250,6 +251,7 @@
           else
           {
             this->compute_tile_padded(
+              args,
               start_output_i, start_output_j,
               0, n_output_channels,  // Compute all channels
               input_tensor, output_tensor, parameters, thread_working_space
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
new file mode 100644
index 0000000..c2b8610
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "depthwise_common.hpp"
+
+using arm_gemm::iceildiv;
+
+namespace arm_conv {
+namespace depthwise {
+
+std::tuple<size_t, size_t, size_t, size_t, size_t>  // Returns (out_size, in_size, start_pos, pad_before, pad_after).
+get_reduced_view_for_dilation(size_t out_size, size_t in_size, const size_t d,  // `d`: presumably the dilation phase index in [0, dilation_factor) - TODO confirm against the caller.
+                              const size_t dilation_factor,
+                              const size_t kernel_size, const size_t stride,
+                              const size_t orig_pad_before) {
+    // Reduce one dimension of the problem to the view used for offset `d`. First, get the valid output range.
+    out_size = iceildiv(out_size - d, dilation_factor);  // NOTE(review): unsigned subtraction; looks like this expects d <= out_size - confirm.
+
+    // Compute the start offset (in units of the original input) and the amount
+    // of "before" padding (in units of the reduced view) which applies to this portion of the work.
+    size_t start_pos = d * stride, pad_before = 0;
+    if (start_pos < orig_pad_before) {
+        pad_before = iceildiv(orig_pad_before - start_pos, dilation_factor);  // Only pad when this view would start inside the original padding.
+    }
+    start_pos += pad_before * dilation_factor - orig_pad_before;  // The right-hand side may wrap (size_t); the sum is in range assuming iceildiv rounds up.
+
+    // Hence compute the valid input range: zero if start_pos lies at or beyond the end of the input.
+    in_size = start_pos < in_size
+                  ? iceildiv(in_size - start_pos, dilation_factor)
+                  : 0;
+
+    // Finally, compute the "after" padding: the shortfall of (pad_before + in_size) against the input the outputs require.
+    const size_t reqd_input = (out_size - 1) * stride + kernel_size;  // NOTE(review): wraps if out_size is 0 - confirm callers tolerate this.
+    size_t pad_after = 0;
+    if (reqd_input > (pad_before + in_size)) {
+        pad_after = reqd_input - (pad_before + in_size);
+    }
+
+    return std::make_tuple(out_size, in_size, start_pos, pad_before, pad_after);
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
index 70b1291..2620b48 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -367,6 +367,7 @@
 
   protected:
   void compute_tile_padded(
+    const DepthwiseArgs &args,
     unsigned int output_i, unsigned int output_j,
     unsigned int output_channel_start, unsigned int output_channel_end,
     const TensorSpec<const TInput *> &input,
@@ -379,13 +380,13 @@
     auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
 
     // Compute the input pointer array
-    const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier;
+    const auto input_channel_start = output_channel_start / args.channel_multiplier;
 
-    const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+    const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
     const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
     const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
 
-    const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
+    const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
     const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
     const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
 
@@ -394,8 +395,8 @@
       input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
       input.ld_row, input.ld_col,
       ws->input_buffer,
-      input_pad_top, this->m_args.input_rows - input_i,
-      input_pad_left, this->m_args.input_cols - input_j
+      input_pad_top, args.input_rows - input_i,
+      input_pad_left, args.input_cols - input_j
     );
 
     // Compute the output pointer array
@@ -404,8 +405,8 @@
       output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
       output.ld_row, output.ld_col,
       ws->output_buffer,
-      0, this->m_args.output_rows - output_i, // Top padding, # valid rows
-      0, this->m_args.output_cols - output_j  // Left padding, # valid columns
+      0, args.output_rows - output_i, // Top padding, # valid rows
+      0, args.output_cols - output_j  // Left padding, # valid columns
     );
 
     // Execute the kernel
@@ -416,6 +417,7 @@
   }
 
   void compute_row_padded_tile_row(
+    const DepthwiseArgs &args,
     const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
     const unsigned int output_channel_start, const unsigned int output_channel_end,
     const TensorSpec<const TInput *> &input,
@@ -430,19 +432,19 @@
     const auto os = this->get_output_stage();
 
     // Compute top and bottom padding; hence fill in the initial pointer arrays.
-    const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier;
-    const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+    const auto input_channel_start = output_channel_start / args.channel_multiplier;
+    const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
     const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
 
     const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
-    const auto input_j = output_j * this->m_args.stride_cols - this->m_args.padding.left;
+    const auto input_j = output_j * args.stride_cols - args.padding.left;
 
     // Valid input rows is the smallest of the input rows that aren't padding for this tile, and the number of rows
     // available.
-    const auto valid_input_rows = std::min(strat->get_input_rows() - input_pad_top, this->m_args.input_rows - input_i);
-    const auto valid_output_rows = std::min(strat->get_output_rows(), this->m_args.output_rows - output_i);
+    const auto valid_input_rows = std::min(strat->get_input_rows() - input_pad_top, args.input_rows - input_i);
+    const auto valid_output_rows = std::min(strat->get_output_rows(), args.output_rows - output_i);
 
-    const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.stride_cols;
+    const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols;
     const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
 
     fill_pointer_array<const TInput>(
@@ -450,8 +452,8 @@
       input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
       input.ld_row, input.ld_col,
       ws->input_buffer,
-      input_pad_top, this->m_args.input_rows - input_i,
-      0, this->m_args.input_cols - input_j  // No left padding
+      input_pad_top, args.input_rows - input_i,
+      0, args.input_cols - input_j  // No left padding
     );
 
     fill_pointer_array(
@@ -459,8 +461,8 @@
       output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
       output.ld_row, output.ld_col,
       ws->output_buffer,
-      0, this->m_args.output_rows - output_i,  // Top padding, # valid rows
-      0, this->m_args.output_cols - output_j  // Left padding, # valid columns
+      0, args.output_rows - output_i,  // Top padding, # valid rows
+      0, args.output_cols - output_j  // Left padding, # valid columns
     );
 
     for (; n_tile_cols; n_tile_cols--)
@@ -492,6 +494,7 @@
   }
 
   void compute_tiles_unpadded(
+    const DepthwiseArgs &args,
     unsigned int output_i, const unsigned int output_j,
     unsigned int n_tile_rows, unsigned int n_tile_cols,
     unsigned int output_channel_start, unsigned int output_channel_end,
@@ -511,8 +514,8 @@
       // If the direct kernel is supported, then use it.
       // Compute the base pointers we'll use in the tile.
       auto outptr = output.base + output_channel_start + output_i * output.ld_row + output_j * output.ld_col;
-      const int start_input_i = output_i * this->m_args.stride_rows - this->m_args.padding.top;
-      const int start_input_j = output_j * this->m_args.stride_cols - this->m_args.padding.left;
+      const int start_input_i = output_i * args.stride_rows - args.padding.top;
+      const int start_input_j = output_j * args.stride_cols - args.padding.left;
       auto inptr = input.base + output_channel_start + start_input_i * input.ld_row + start_input_j * input.ld_col;
 
       // Execute the kernel
@@ -528,10 +531,10 @@
     {
       // Otherwise, we repeatedly call the padded kernel but use our knowledge
       // of the tensor structure to avoid recomputing the pointer array.
-      const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier;
+      const auto input_channel_start = output_channel_start / args.channel_multiplier;
 
       const auto n_input_pointers = this->m_strat->get_input_rows() * this->m_strat->get_input_cols();
-      const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.stride_cols;
+      const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols;
       const auto n_output_pointers = this->m_strat->get_output_rows() * this->m_strat->get_output_cols();
       const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
 
@@ -539,16 +542,16 @@
       // each subsequent tile we simply update the pointers.
       for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
       {
-        const int input_i = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
-        const int input_j = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
+        const int input_i = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+        const int input_j = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
 
         fill_pointer_array<const TInput>(
           ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
           input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
           input.ld_row, input.ld_col,
           ws->input_buffer,
-          0, this->m_args.input_rows,
-          0, this->m_args.input_cols
+          0, args.input_rows,
+          0, args.input_cols
         );
 
         // Compute the output pointer array
@@ -557,8 +560,8 @@
           output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
           output.ld_row, output.ld_col,
           ws->output_buffer,
-          0, this->m_args.output_rows,
-          0, this->m_args.output_cols
+          0, args.output_rows,
+          0, args.output_cols
         );
 
         for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
index 9f53f7c..b058ce2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -248,6 +248,7 @@
 
   protected:
   void compute_tile_padded(
+    const DepthwiseArgs &args,
     unsigned int output_i, unsigned int output_j,
     unsigned int channel_start, unsigned int channel_end,
     const TensorSpec<const TInput *> &input,
@@ -259,24 +260,24 @@
     // Get the working space
     WorkingSpace *ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
 
-    const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+    const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
     const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
     const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
 
-    const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
+    const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
     const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
     const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
 
     fill_pointer_array_generic_kernel<const TInput>(
       ws->inptr_array,
       this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
-      this->m_args.kernel_rows, this->m_args.kernel_cols,
-      this->m_args.stride_rows, this->m_args.stride_cols,
+      args.kernel_rows, args.kernel_cols,
+      args.stride_rows, args.stride_cols,
       input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
       input.ld_row, input.ld_col,
       ws->input_buffer,
-      input_pad_top, this->m_args.input_rows - input_i,
-      input_pad_left, this->m_args.input_cols - input_j
+      input_pad_top, args.input_rows - input_i,
+      input_pad_left, args.input_cols - input_j
     );
 
     // Compute the output pointer array
@@ -285,15 +286,15 @@
       output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
       output.ld_row, output.ld_col,
       ws->output_buffer,
-      0, this->m_args.output_rows - output_i, // Top padding, # valid rows
-      0, this->m_args.output_cols - output_j  // Left padding, # valid columns
+      0, args.output_rows - output_i, // Top padding, # valid rows
+      0, args.output_cols - output_j  // Left padding, # valid columns
     );
 
     // Execute the kernel
     DepthwiseDepthfirstGenericKernelCall<OutputStage>::execute(
       reinterpret_cast<const StratType *>(this->m_strat.get()), ws,
       this->get_output_stage(), m_bias, parameters,
-      this->m_args.kernel_rows * this->m_args.kernel_cols,
+      args.kernel_rows * args.kernel_cols,
       channel_end - channel_start
     );
   }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
index e58467b..cef568f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -525,6 +525,7 @@
   }
 
   void compute_tile_padded(
+    const DepthwiseArgs &args,
     unsigned int output_i, unsigned int output_j,
     unsigned int output_channel_start, unsigned int output_channel_end,
     const TensorSpec<const TInput *> &input,
@@ -536,11 +537,11 @@
     // Get the working space
     auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
 
-    const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+    const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
     const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
     const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
 
-    const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
+    const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
     const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
     const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
 
@@ -551,40 +552,40 @@
       output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
       output.ld_row, output.ld_col,
       ws->output_buffer,
-      0, this->m_args.output_rows - output_i, // Top padding, # valid rows
-      0, this->m_args.output_cols - output_j  // Left padding, # valid columns
+      0, args.output_rows - output_i, // Top padding, # valid rows
+      0, args.output_cols - output_j  // Left padding, # valid columns
     );
 
     // Compute the parameter stride
-    DepthwiseArgs single_iter(this->m_args);
+    DepthwiseArgs single_iter(args);
     single_iter.input_channels = 1;
     const size_t parameter_stride = reinterpret_cast<const StratType *>(this->m_strat.get())
       ->get_storage_size(single_iter);
 
     for (; output_channel_start < output_channel_end;
-         output_channel_start += this->m_args.channel_multiplier)
+         output_channel_start += args.channel_multiplier)
     {
       // Compute the input pointer array
-      const auto input_channel = output_channel_start / this->m_args.channel_multiplier;
+      const auto input_channel = output_channel_start / args.channel_multiplier;
 
       // Construct the input patch
       depthfirst_multiplier::PrepareInputSample<is_generic>::execute(
-        this->m_args, ws, this->m_strat.get(),
+        args, ws, this->m_strat.get(),
         input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col,
-        input_pad_top, this->m_args.input_rows - input_i,
-        input_pad_left, this->m_args.input_cols - input_j
+        input_pad_top, args.input_rows - input_i,
+        input_pad_left, args.input_cols - input_j
       );
 
       // Execute the kernel
       depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::execute(
-        this->m_args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
+        args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
         parameters, m_bias
       );
 
       // Update the output pointers
       for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++)
       {
-        ws->outptr_array[n] += this->m_args.channel_multiplier;
+        ws->outptr_array[n] += args.channel_multiplier;
       }
 
       // Progress the parameters
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
index 1ee19e5..0f91fe3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
@@ -55,7 +55,9 @@
 
   DepthwiseCommon<TInput, TWeight, TOutput> *get_instance(const DepthwiseArgs &args, const OutputStage &os) const
   {
-    return initialise(args, os);
+    auto impl = initialise(args, os);  // Build the implementation for these arguments and output stage.
+    impl->set_name(std::string(name)); // Record this entry's name on the instance. NOTE(review): impl is dereferenced unchecked; assumes initialise never returns null - confirm.
+    return impl;
   }
 };
 
@@ -136,14 +138,7 @@
 {
   const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr;
   const bool success = find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, impl);
-
-  if(success)
-  {
-        auto i =  impl->get_instance(args, os);
-        i->set_name(impl->name);
-        return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(i);
-  }
-  return nullptr;
+  return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr);
 }
 
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
index f3160fb..2b2e6f3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -332,18 +332,12 @@
   }
 
   void execute_internal(
-    unsigned int batches,
-    unsigned int input_height,
-    unsigned int input_width,
-    unsigned int n_input_channels,
-    const PaddingValues &padding,
+    const DepthwiseArgs &args,
     const void *input,
     size_t ld_input_col,
     size_t ld_input_row,
     size_t ld_input_batch,
     const void *parameters,
-    unsigned int output_height,
-    unsigned int output_width,
     void *output,
     size_t ld_output_col,
     size_t ld_output_row,
@@ -359,7 +353,7 @@
     this->initialise_working_space(thread_working_space);
     auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);
 
-    const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier;
+    const auto n_output_channels = args.input_channels * args.channel_multiplier;
     const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());
 
     // Get typed pointers
@@ -368,23 +362,23 @@
     auto weights = reinterpret_cast<const TWeight *>(parameters);
 
     // Iterate over batches
-    for (; batches; batches--)
+    for (auto batches = args.n_batches; batches; batches--)
     {
       // NOTE: Other loop orderings are possible and it would be worth
       // investigating them.
 
       // Within a batch, stripe threads across rows.
       for (auto start_output_i = thread_id * m_strat->get_output_rows();
-           start_output_i < output_height;
+           start_output_i < args.output_rows;
            start_output_i += n_threads * m_strat->get_output_rows())
       {
         // Determine what (if any padding) is required on the top/bottom of
         // this row of the convolution.
-        const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top;
+        const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
         const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
         const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
-        const unsigned int valid_input_rows = input_i > input_height ? 0 : input_height - input_i;
-        const unsigned int valid_output_rows = output_height - start_output_i;
+        const unsigned int valid_input_rows = input_i > args.input_rows ? 0 : args.input_rows - input_i;
+        const unsigned int valid_output_rows = args.output_rows - start_output_i;
 
         auto inptr_row = input_batch + input_i*ld_input_row;
         auto outptr_row = output_batch + start_output_i * ld_output_row;
@@ -392,10 +386,10 @@
         // Execute the kernel
         this->execute_kernel(
           inptr_row, ld_input_row, ld_input_col, vl,
-          input_pad_top, valid_input_rows, padding.left, input_width,
+          input_pad_top, valid_input_rows, args.padding.left, args.input_cols,
           weights, this->m_bias,
           outptr_row, ld_output_row, ld_output_col, vl,
-          valid_output_rows, output_width,
+          valid_output_rows, args.output_cols,
           0 /* first channel */, n_output_channels,
           ws
         );
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index 3998dfb..8eb278c 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,8 +38,8 @@
     std::string     filter = "";
 
     DepthwiseConfig(DepthwiseMethod method)
-        : method(method) {};
-    DepthwiseConfig() {};
+        : method(method){};
+    DepthwiseConfig(){};
 };
 
 struct DepthwiseArgs
@@ -48,6 +48,7 @@
 
     unsigned int kernel_rows, kernel_cols;
     unsigned int stride_rows, stride_cols;
+    unsigned int dilation_rows, dilation_cols;
 
     unsigned int n_batches, input_rows, input_cols, input_channels;
     unsigned int output_rows, output_cols;
@@ -65,14 +66,48 @@
         const CPUInfo *cpu_info,
         unsigned int kernel_rows, unsigned int kernel_cols,
         unsigned int stride_rows, unsigned int stride_cols,
+        unsigned int dilation_rows, unsigned int dilation_cols,
+        unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
+        unsigned int input_channels,
+        unsigned int output_rows, unsigned int output_cols,
+        unsigned int  channel_multiplier,
+        PaddingValues padding, arm_gemm::Activation activation,
+
+        const DepthwiseConfig *config)
+        : cpu_info(cpu_info),
+          kernel_rows(kernel_rows),
+          kernel_cols(kernel_cols),
+          stride_rows(stride_rows),
+          stride_cols(stride_cols),
+          dilation_rows(dilation_rows),
+          dilation_cols(dilation_cols),
+          n_batches(n_batches),
+          input_rows(input_rows),
+          input_cols(input_cols),
+          input_channels(input_channels),
+          output_rows(output_rows),
+          output_cols(output_cols),
+          channel_multiplier(channel_multiplier),
+          padding(padding),
+          activation(activation),
+          config(config)
+    {
+    }
+
+    DepthwiseArgs(
+        const CPUInfo *cpu_info,
+        unsigned int kernel_rows, unsigned int kernel_cols,
+        unsigned int stride_rows, unsigned int stride_cols,
         unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
         unsigned int input_channels,
         unsigned int output_rows, unsigned int output_cols,
         unsigned int  channel_multiplier,
         PaddingValues padding, arm_gemm::Activation activation,
         const DepthwiseConfig *config)
-        : cpu_info(cpu_info), kernel_rows(kernel_rows), kernel_cols(kernel_cols), stride_rows(stride_rows), stride_cols(stride_cols), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
-          input_channels(input_channels), output_rows(output_rows), output_cols(output_cols), channel_multiplier(channel_multiplier), padding(padding), activation(activation), config(config)
+        : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows,
+                        stride_cols, 1, 1, n_batches, input_rows, input_cols,
+                        input_channels, output_rows, output_cols,
+                        channel_multiplier, padding, activation, config)
     {
     }
 };
@@ -80,28 +115,30 @@
 template <typename TInput, typename TWeight, typename TOutput>
 class DepthwiseCommon : public IDepthwiseCommon
 {
-private:
-    std::string _name{};
-
-protected:
+    protected:
     const DepthwiseArgs m_args; // Copy of arguments
+    std::string         m_name{};
 
-public:
-    std::string name() const
-    {
-        return _name;
-    }
-
-    void set_name(const std::string &n)
-    {
-        _name = n;
-    }
-
+    public:
     DepthwiseCommon(const DepthwiseArgs &args)
-        : m_args(args) {};
-    DepthwiseCommon(DepthwiseCommon &) = delete;
+        : m_args(args){};
+    DepthwiseCommon(DepthwiseCommon &)            = delete;
     DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
 
+    std::string name() const override
+    {
+        return m_name;
+    }
+
+    void set_name(std::string name)
+    {
+        // Only allow the name to be set once
+        if (m_name.empty())
+        {
+            m_name = name;
+        }
+    }
+
     void execute(
         const void *const  input,
         const void *const  parameters,
@@ -168,34 +205,77 @@
         unsigned int         thread_id,
         unsigned int         n_threads) const override final
     {
-        this->execute_internal(
-            batches, input_height, input_width, channels, padding, input,
-            ld_input_col, ld_input_row, ld_input_batch, parameters, output_height,
-            output_width, output, ld_output_col, ld_output_row, ld_output_batch,
-            working_space, thread_id, n_threads);
+        // Construct a new set of arguments to reflect that we might have been
+        // passed different input/output tensors. Dilation is handled at this
+        // level, so we set the dilation in the arguments to one (no dilation).
+        DepthwiseArgs args(this->m_args);
+        args.n_batches = batches;
+        args.input_rows = input_height;
+        args.input_cols = input_width;
+        args.input_channels = channels;
+        args.output_rows = output_height;
+        args.output_cols = output_width;
+        args.padding = padding;
+        args.dilation_rows = args.dilation_cols = 1;
+
+        auto ld_input_col_d = ld_input_col * m_args.dilation_cols;
+        auto ld_input_row_d = ld_input_row * m_args.dilation_rows;
+        auto ld_output_col_d = ld_output_col * m_args.dilation_cols;
+        auto ld_output_row_d = ld_output_row * m_args.dilation_rows;
+
+        for (size_t drow = 0; drow < m_args.dilation_rows; drow++)
+        {
+            size_t start_i;
+            std::tie(args.output_rows, args.input_rows, start_i,
+                     args.padding.top, args.padding.bottom) =
+                get_reduced_view_for_dilation(
+                        output_height, input_height, drow, m_args.dilation_rows,
+                        m_args.kernel_rows, m_args.stride_rows, padding.top);
+
+            auto input_row = static_cast<const TInput *>(input) + start_i * ld_input_row;
+            auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row;
+
+            if (args.output_rows)
+            {
+                for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
+                {
+                    size_t start_j;
+                    std::tie(args.output_cols, args.input_cols, start_j,
+                             args.padding.left, args.padding.right) =
+                        get_reduced_view_for_dilation(
+                                output_width, input_width, dcol, m_args.dilation_cols,
+                                m_args.kernel_cols, m_args.stride_cols, padding.left);
+
+                    const TInput *input_col = input_row + start_j * ld_input_col;
+                    TOutput *output_col = output_row + dcol * ld_output_col;
+
+                    if (args.output_cols)
+                    {
+                        this->execute_internal(
+                            args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters,
+                            output_col, ld_output_col_d, ld_output_row_d, ld_output_batch,
+                            working_space, thread_id, n_threads);
+                    }
+                }
+            }
+        }
     }
 
-protected:
+    protected:
     virtual void execute_internal(
-        unsigned int batches,
-        unsigned int input_height,
-        unsigned int input_width,
-        unsigned int channels,
-        const PaddingValues &,
-        const void *input,
-        size_t       ld_input_col,
-        size_t       ld_input_row,
-        size_t       ld_input_batch,
-        const void *parameters,
-        unsigned int output_height,
-        unsigned int output_width,
-        void        *output,
-        size_t       ld_output_col,
-        size_t       ld_output_row,
-        size_t       ld_output_batch,
-        void        *working_space,
-        unsigned int thread_id,
-        unsigned int n_threads) const = 0;
+        const DepthwiseArgs &instance_args,
+        const void          *input,
+        size_t               ld_input_col,
+        size_t               ld_input_row,
+        size_t               ld_input_batch,
+        const void          *parameters,
+        void                *output,
+        size_t               ld_output_col,
+        size_t               ld_output_row,
+        size_t               ld_output_batch,
+        void                *working_space,
+        unsigned int         thread_id,
+        unsigned int         n_threads) const = 0;
 };
 
 template <typename TInput, typename TWeight = TInput, typename TOutput = TInput>
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
index 52963ab..fea6326 100644
--- a/src/core/NEON/kernels/assembly/depthwise_common.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,8 @@
 
 #include "arm_gemm.hpp"
 #include "common.hpp"
+#include <cstddef>
+#include <tuple>
 
 namespace arm_conv
 {
@@ -64,6 +66,9 @@
 public:
     virtual ~IDepthwiseCommon() = default;
 
+    // Get the name of the depthwise implementation
+    virtual std::string name() const = 0;
+
     // Determine the amount of storage space required for the rearranged weights
     // and bias.
     virtual size_t get_storage_size(void) const = 0;
@@ -127,5 +132,25 @@
         unsigned int n_threads) const = 0;
 };
 
+// To handle a dilation factor of D execute the kernel once for each d in
+// [0..D). Each `d` corresponds to a portion or "view" of the input and output
+// tensors. The output view corresponds to every Dth pixel starting from `d`;
+// this function computes how many pixels are covered. The input view consists
+// of an amount of before padding, every Dth pixel starting from an offset, and
+// some after padding.  This function computes the start padding, input offset,
+// number of valid input pixels, and the after padding.
+//
+// Returns
+// - Number of valid output pixels corresponding to `d`
+// - Number of valid input pixels corresponding to `d`
+// - Offset of the first pixel corresponding to `d`
+// - Amounts of before and after padding in the view for `d` (two values)
+std::tuple<size_t, size_t, size_t, size_t, size_t>
+get_reduced_view_for_dilation(
+    size_t out_size, size_t in_size,
+    size_t d, size_t dilation_factor,
+    size_t kernel_size, size_t stride,
+    size_t pad_before);
+
 } // namespace depthwise
 } // namespace arm_conv