Update Neon™ depthwise kernel

- Reduce duplication and simplify overall structure.
- Improve multi-threaded performance by sharing more data
  in lower-level caches.

Partially Resolves: COMPMID-5054
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: Iac747f39b21c540122fa75218762631c4d787911
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7449
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Andrew Mundy
Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
new file mode 100644
index 0000000..ff5098d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthfirst_driver.hpp"
+#include "interleaves/generic.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename OutputStage>
+class IPlanarStrategy  // Abstract interface for planar depthwise strategies; every member is pure virtual.
+{
+  public:
+  virtual ~IPlanarStrategy() = default;
+  virtual unsigned int get_output_rows(void) const = 0;  // Rows of output covered by one kernel call.
+  virtual arm_gemm::VLType get_vl_type(void) const = 0;  // Vector-length type; callers pass it to get_vector_length<>().
+
+  virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;  // Size of the packed parameter buffer (presumably bytes; TODO confirm).
+  virtual void pack_parameters(  // Pack the parameters into `buffer`; the layout is defined by the implementation.
+    const DepthwiseArgs &args, void *buffer,
+    const void *biases, const OutputStage &,
+    const void *weights, size_t ld_weight_col, size_t ld_weight_row
+  ) const = 0;
+};
+
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
+          typename OutputStage>
+struct PlanarKernelType;  // Declared only; a specialisation per supported OutputStage provides Type and execute().
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>  // Kernel type used when the output stage is Nothing.
+{
+  using Type = std::function<void(  // Kernel signature: takes a bias pointer and explicit activation bounds.
+    const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+    unsigned int pad_top, unsigned int valid_input_rows,
+    unsigned int pad_left, unsigned int valid_input_cols,
+    const TWeight *, const TAccum *,
+    TOutput **, const size_t *, const size_t *, unsigned int output_cols,
+    unsigned int start_channels, unsigned int valid_channels,
+    TAccum act_min, TAccum act_max
+  )>;
+
+  template <typename WorkspaceType>
+  static inline void execute(  // Call `fn`, adapting the common argument list to this kernel signature.
+    const Type fn,
+    const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+    unsigned int pad_top, unsigned int valid_input_rows,
+    unsigned int pad_left, unsigned int valid_input_cols,
+    const TWeight *weights, const TAccum *bias,
+    TOutput **outptrs, const size_t *outlds, const size_t *outvllds, unsigned int output_cols,
+    unsigned int start_channel, unsigned int valid_channels,
+    const Nothing &, const WorkspaceType *ws  // Output stage is ignored; ws must expose activation_min/activation_max.
+  )
+  {
+    fn(
+      inptr, ld_in_row, ld_in_col, ld_in_vl,
+      pad_top, valid_input_rows,
+      pad_left, valid_input_cols,
+      weights, bias,
+      outptrs, outlds, outvllds, output_cols,
+      start_channel, valid_channels,
+      ws->activation_min, ws->activation_max  // Activation bounds are read from the workspace.
+    );
+  }
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>  // Kernel type for int32_t accumulation with Requantize32.
+{
+  using Type = std::function<void(  // Kernel signature: no bias pointer or activation bounds; takes the Requantize32 instead.
+    const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+    unsigned int pad_top, unsigned int valid_input_rows,
+    unsigned int pad_left, unsigned int valid_input_cols,
+    const TWeight *,
+    TOutput **, const size_t *, const size_t *, unsigned int output_cols,
+    unsigned int start_channel, unsigned int valid_channels,
+    const arm_gemm::Requantize32 &
+  )>;
+
+  template <typename WorkspaceType>
+  static inline void execute(  // Call `fn`, adapting the common argument list to this kernel signature.
+    const Type fn,
+    const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+    unsigned int pad_top, unsigned int valid_input_rows,
+    unsigned int pad_left, unsigned int valid_input_cols,
+    const TWeight *weights, const int32_t *,  // Bias pointer is accepted but not forwarded to the kernel.
+    TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
+    unsigned int first_channel, unsigned int valid_channels,
+    const arm_gemm::Requantize32 &qp, const WorkspaceType *  // Workspace is unused by this specialisation.
+  )
+  {
+    fn(
+      inptr, ld_in_row, ld_in_col, ld_in_vl,
+      pad_top, valid_input_rows,
+      pad_left, valid_input_cols,
+      weights,
+      outptrs, outlds, outldvls, output_cols,
+      first_channel, valid_channels,
+      qp
+    );
+  }
+};
+
+
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+          typename TAccum=typename DefaultTAccum<TOutput>::Type,
+          typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class PlanarStrategy : public IPlanarStrategy<OutputStage>  // Base for concrete planar strategies; subclasses supply get_kernel().
+{
+  unsigned int m_kernel_rows, m_kernel_cols;  // Kernel dimensions; used to describe the weight packing.
+  unsigned int m_stride_rows, m_stride_cols;  // Stored by the constructor; not read within this class.
+  unsigned int m_output_rows;  // Returned by get_output_rows().
+  arm_gemm::VLType m_vl_type;  // Returned by get_vl_type() and passed to the packing arguments.
+
+  protected:
+  virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
+  {
+    // Get the kernel point to pack at the given index; return false to
+    // indicate that this index (and all greater indices) is out of range.
+    if (m_kernel_rows * m_kernel_cols <= index)
+      return false;
+
+    y = index % m_kernel_cols;  // NOTE(review): y receives the position within a row and x the row number -
+    x = index / m_kernel_cols;  // confirm this matches the convention expected by interleaves::PackingArguments.
+    return true;
+  }
+
+  virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const  // Arguments handed to the generic interleave helpers.
+  {
+    return interleaves::PackingArguments(
+      m_kernel_rows, m_kernel_cols, sizeof(TWeight),
+      false, sizeof(TAccum),  // Don't pack the bias
+      m_vl_type, sizeof(TAccum), 1,  // Accumulator depth of 1 TODO
+      [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool  // Captures `this`; presumably must not outlive the strategy - TODO confirm.
+      { return this->get_kernel_packing_point(idx, x, y); }
+    );
+  }
+
+  public:
+  PlanarStrategy(  // Records the kernel/stride geometry, output rows per call and vector-length type.
+    unsigned int kernel_rows, unsigned int kernel_cols,
+    unsigned int stride_rows, unsigned int stride_cols,
+    unsigned int output_rows,
+    arm_gemm::VLType vl_type
+  ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
+      m_stride_rows(stride_rows), m_stride_cols(stride_cols),
+      m_output_rows(output_rows), m_vl_type(vl_type)
+  {
+  }
+
+  unsigned int get_output_rows(void) const override { return m_output_rows; }
+  arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }
+
+  size_t get_storage_size(const DepthwiseArgs &args) const override  // Delegates to the generic interleave helper.
+  {
+    return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
+  }
+
+  void pack_parameters(  // Delegates to the generic interleave helper using the same packing arguments.
+    const DepthwiseArgs &args, void *buffer,
+    const void *biases, const OutputStage &,  // The output stage is not used here.
+    const void *weights, size_t ld_weight_col, size_t ld_weight_row
+  ) const override
+  {
+    interleaves::pack_parameters_generic(
+      this->get_kernel_packing_arguments(), args,
+      buffer, biases, weights, ld_weight_col, ld_weight_row
+    );
+  }
+
+  using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type;  // Callable type of the kernel.
+  virtual KernelType get_kernel(void) const = 0;  // Implemented by concrete strategies to return the kernel to run.
+};
+
+
+namespace {  // NOTE(review): anonymous namespace in a header - each including translation unit gets its own distinct types.
+
+template <typename T>
+struct OutputRowPtrsElement  // Workspace element: per-row output pointers and strides, plus a padding buffer.
+{
+  struct Workspace
+  {
+    T **output_row_ptrs;  // One pointer per output row
+    size_t *output_ld_cols;  // One column stride per output row
+    size_t *output_ld_vls;  // Stride between vectors of channels
+    T *output_padding_buffer;  // Destination for rows which must not be written to the output tensor
+  };
+
+  template <typename OutputStage>
+  static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
+  {
+    // We need one pointer and stride for each row of output, and an additional
+    // blob of memory into which padded stores can go.
+    return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
+           get_vector_length<char>(args.strategy->get_vl_type());  // Padding blob; presumably one vector length in bytes.
+  }
+
+  template <typename WorkspaceType, typename OutputStage>
+  static void *initialise(WorkspaceType *ws, void *buffer,  // Carve `buffer` up in order: row pointers, column strides, VL strides, padding.
+                          const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
+  {
+    const auto n_rows = args.strategy->get_output_rows();
+    ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
+    ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
+    ws->output_ld_vls = ws->output_ld_cols + n_rows;
+    ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
+    return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());  // NOTE(review): elements of T here vs. char count in get_element_size() - confirm they agree.
+  }
+};
+
+}  // namespace {anonymous}
+
+
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+          typename TAccum=typename DefaultTAccum<TOutput>::Type,
+          typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>  // Driver: runs a planar strategy over batches and stripes of output rows.
+{
+  using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
+  using StrategyType = IPlanarStrategy<OutputStage>;
+  using WorkspaceManager = Workspace<
+    OutputRowPtrsElement<TOutput>,
+    ActivationsElement<TAccum, OutputStage>
+  >;
+  using WorkspaceType = typename WorkspaceManager::WorkspaceType;
+
+  std::unique_ptr<StrategyType> m_strat;  // Owned strategy
+  const TAccum *m_bias;  // Caller's bias pointer, stored by pack_parameters() and passed to the kernel on execution
+  OutputStage m_os;
+
+  public:
+  DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})  // Takes ownership of `strat`.
+  : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
+  {
+  }
+
+  size_t get_storage_size(void) const override  // Forwards to the strategy.
+  {
+    return m_strat->get_storage_size(this->m_args);
+  }
+
+  void pack_parameters(  // Pack via the strategy and remember the bias pointer for execution.
+    void *buffer, const void *biases,
+    const void *weights, size_t ld_weight_col, size_t ld_weight_row
+  ) override
+  {
+    m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);  // A default OutputStage is passed, not m_os.
+    this->m_bias = reinterpret_cast<const TAccum *>(biases);  // Not copied: `biases` must stay valid while this object executes.
+    depthwise_depthfirst::stash_bias(this->m_os, biases);
+  }
+
+  size_t get_working_size(unsigned int n_threads, unsigned int) const override  // One workspace per thread; second argument unused.
+  {
+    return this->get_working_size_per_thread() * n_threads;
+  }
+
+  protected:
+  /* Compute the amount of working space required for a single thread. */
+  virtual size_t get_working_size_per_thread(void) const
+  {
+    return WorkspaceManager::get_sizeof_workspace(
+      WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
+  }
+
+  /* Initialise the working space for a thread. */
+  virtual void initialise_working_space(void *buffer) const
+  {
+    WorkspaceManager::initialise(
+      buffer,
+      WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
+    );
+  }
+
+  /* Execute the kernel for a given chunk of work. */
+  virtual void execute_kernel(
+    const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+    unsigned int pad_top, unsigned int valid_input_rows,
+    unsigned int pad_left, unsigned int valid_input_cols,
+    const TWeight *weights, const TAccum *bias,
+    TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
+    unsigned int valid_output_rows, unsigned int valid_output_cols,
+    unsigned int first_channel, unsigned int valid_channels,
+    WorkspaceType *ws
+  ) const
+  {
+    // Initialise the output pointers
+    for (auto i = 0u; i < m_strat->get_output_rows(); i++)
+    {
+      // Point at the output tensor for all valid rows; otherwise point at the
+      // padding buffer.
+      ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
+      ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;  // Zero strides keep invalid rows inside the padding buffer.
+      ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
+      outptr += ld_out_row;
+    }
+
+    // Execute the kernel. The strategy is assumed to be a PlanarStrategy; static_cast is the defined base-to-derived conversion.
+    PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
+      static_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
+      inptr, ld_in_row, ld_in_col, ld_in_vl,
+      pad_top, valid_input_rows, pad_left, valid_input_cols,
+      weights, bias,
+      ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
+      valid_output_cols, first_channel, valid_channels,
+      this->m_os, ws
+    );
+  }
+
+  void execute_internal(  // Run this thread's share of the work: every batch, every n_threads-th stripe of output rows.
+    unsigned int batches,
+    unsigned int input_height,
+    unsigned int input_width,
+    unsigned int n_input_channels,
+    const PaddingValues &padding,
+    const void *input,
+    size_t ld_input_col,
+    size_t ld_input_row,
+    size_t ld_input_batch,
+    const void *parameters,
+    unsigned int output_height,
+    unsigned int output_width,
+    void *output,
+    size_t ld_output_col,
+    size_t ld_output_row,
+    size_t ld_output_batch,
+    void *working_space,
+    unsigned int thread_id,
+    unsigned int n_threads
+  ) const override
+  {
+    // Get and initialise the working space for this thread.
+    void *thread_working_space =
+      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+    this->initialise_working_space(thread_working_space);
+    auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);
+
+    const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier;
+    const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());  // Passed below as the input and output vector stride.
+
+    // Get typed pointers
+    auto input_batch = reinterpret_cast<const TInput *>(input);
+    auto output_batch = reinterpret_cast<TOutput *>(output);
+    auto weights = reinterpret_cast<const TWeight *>(parameters);
+
+    // Iterate over batches
+    for (; batches; batches--)
+    {
+      // NOTE: Other loop orderings are possible and it would be worth
+      // investigating them.
+
+      // Within a batch, stripe threads across rows.
+      for (auto start_output_i = thread_id * m_strat->get_output_rows();
+           start_output_i < output_height;
+           start_output_i += n_threads * m_strat->get_output_rows())
+      {
+        // Determine what (if any padding) is required on the top/bottom of
+        // this row of the convolution.
+        const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top;  // Negative when the stripe starts in the top padding.
+        const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
+        const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
+        const unsigned int valid_input_rows = input_i > input_height ? 0 : input_height - input_i;  // Zero when the stripe starts past the input.
+        const unsigned int valid_output_rows = output_height - start_output_i;  // Not clamped; execute_kernel() only compares row indices against it.
+
+        auto inptr_row = input_batch + input_i*ld_input_row;
+        auto outptr_row = output_batch + start_output_i * ld_output_row;
+
+        // Execute the kernel
+        this->execute_kernel(
+          inptr_row, ld_input_row, ld_input_col, vl,
+          input_pad_top, valid_input_rows, padding.left, input_width,
+          weights, this->m_bias,
+          outptr_row, ld_output_row, ld_output_col, vl,
+          valid_output_rows, output_width,
+          0 /* first channel */, n_output_channels,
+          ws
+        );
+      }
+
+      // Update the input and output pointers to account for batch
+      input_batch += ld_input_batch;
+      output_batch += ld_output_batch;
+    }
+  }
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv