Integrate improved CPU depthwise convolution kernels

* Replace assembly kernels for depthwise convolution with more optimized
  ones.
* Add int8 assembly kernels.
* Fix implicit padding in the optimized kernels.

Resolves: COMPMID-3867, COMPMID-4361

Change-Id: I0b0867e05f61be4f368f62190d55e14d0ab3ebf2
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5622
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
new file mode 100644
index 0000000..fe635d6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class strategy>
+class DepthwiseDepthfirst : public DepthwiseCommon<typename strategy::input_type,
+                                                   typename strategy::weight_type,
+                                                   typename strategy::return_type>
+{
+  using TInput = typename strategy::input_type;
+  using TWeight = typename strategy::weight_type;
+  using TOutput = typename strategy::return_type;
+  using TAccum = typename strategy::bias_type;
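+
+  // As used below, the strategy class is expected to provide the type
+  // aliases above, a vl_type tag, compile-time tile geometry (input_rows,
+  // input_cols, output_rows, output_cols, stride_rows, stride_cols) and the
+  // direct_kernel and indirect_kernel entry points.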
+
+  size_t sizeof_input_buffer(unsigned int n_input_channels) const
+  {
+    return sizeof(TInput) * n_input_channels;
+  }
+
+  size_t sizeof_output_buffer(unsigned int n_output_channels) const
+  {
+    return sizeof(TOutput) * n_output_channels;
+  }
+
+  public:
+
+  DepthwiseDepthfirst(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
+  {
+  }
+
+  DepthwiseDepthfirst(DepthwiseDepthfirst &) = delete;
+  DepthwiseDepthfirst &operator=(DepthwiseDepthfirst &) = delete;
+
+  size_t get_storage_size(void) const override
+  {
+    // TODO What if we insert extra padding? Biases are a different size to the inputs, ...
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
+    const auto rounded_channels = arm_gemm::roundup(this->m_args.input_channels, vl);
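+    // Storage comprises one vector of biases plus one vector of weights per
+    // kernel point, for each vector-length block of channels.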
+    return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight);
+  }
+
+  void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
+  {
+    // TODO What if the kernel needs a different packing function?
+
+    // Cast the pointers
+    uint8_t *buffer = static_cast<uint8_t *>(_buffer);
+    const TAccum *biases = static_cast<const TAccum *>(_biases);
+    const TWeight *const weights = static_cast<const TWeight *>(_weights);
+
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(strategy::vl_type);
+    ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels : ld_weight_col;
+    ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+
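+    // Pack channels in blocks of one vector length: a vector of biases
+    // followed by the weights for each kernel point in turn.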
+    for (unsigned int n = 0; n < this->m_args.input_channels; n += vl)
+    {
+      const unsigned int todo = std::min(vl, this->m_args.input_channels - n);
+
+      // Copy across the correct amount of bias (or 0)
+      for (unsigned int i = 0; i < todo; i++)
+      {
+        reinterpret_cast<TAccum *>(buffer)[i] = (biases == nullptr) ? 0 : biases[n + i];
+      }
+      buffer += vl * sizeof(TAccum);
+
+      // Copy each of the weights in turn
+      auto weights_row = weights + n;
+      for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
+      {
+        auto weights_col = weights_row;
+
+        for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
+        {
+          for (unsigned int m = 0; m < todo; m++)
+          {
+            reinterpret_cast<TWeight *>(buffer)[m] = weights_col[m];
+          }
+          buffer += vl * sizeof(TWeight);
+
+          weights_col += ld_weight_col;
+        }
+
+        weights_row += ld_weight_row;
+      }
+    }
+  }
+
+  size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+  {
+    const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
+    return n_threads * (sizeof_output_buffer(n_output_channels) + sizeof_input_buffer(n_channels));
+  }
+
+  using DepthwiseCommon<typename strategy::input_type, typename strategy::weight_type, typename strategy::return_type>::execute;
+  void execute(
+    const unsigned int batches,
+    const unsigned int input_height,
+    const unsigned int input_width,
+    const unsigned int input_channels,
+    const PaddingValues &padding,
+    const void *const _input,
+    const size_t ld_input_col,
+    const size_t ld_input_row,
+    const size_t ld_input_batch,
+    const void *const parameters,
+    const unsigned int output_height,
+    const unsigned int output_width,
+    void *const _output,
+    const size_t ld_output_col,
+    const size_t ld_output_row,
+    const size_t ld_output_batch,
+    void *const _working_space,
+    const unsigned int thread_id,
+    const unsigned int n_threads
+  ) const override
+  {
+    strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+    arm_gemm::profiler prof;
+#endif
+
+    // Compute activation values
+    TAccum activation_min, activation_max;
+    if (std::numeric_limits<TAccum>::is_integer)
+    {
+      activation_min = std::numeric_limits<TAccum>::min();
+      activation_max = std::numeric_limits<TAccum>::max();
+    }
+    else
+    {
+      activation_min = static_cast<TAccum>(-std::numeric_limits<float>::infinity());
+      activation_max = static_cast<TAccum>(std::numeric_limits<float>::infinity());
+    }
+
+    switch (this->m_args.activation.type)
+    {
+      case arm_gemm::Activation::Type::BoundedReLU:
+        activation_max = static_cast<TAccum>(this->m_args.activation.param1);
+        // Fall through
+      case arm_gemm::Activation::Type::ReLU:
+        activation_min = static_cast<TAccum>(0);
+        break;
+      default:
+        break;
+    }
+
+    // Determine what portion of the work to do.
+    const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
+    const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
+    const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
+
+    // Cast input and output pointers into the right types
+    const TInput *const inptr = static_cast<const TInput *>(_input);
+    TOutput *const outptr = static_cast<TOutput *>(_output);
+
+    // Create an array for the input pointers
+    const TInput * _inptr_array[strategy::input_rows * strategy::input_cols];
+    const TInput **const inptr_array = _inptr_array;
+
+    // Create an array for the output pointers
+    TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
+    TOutput **const outptr_array = _outptr_array;
+
+    // Allocate portions of the working space
+    uint8_t *const working_space = static_cast<uint8_t *>(_working_space) + get_working_size(thread_id, input_channels);
+    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
+    TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + sizeof_output_buffer(input_channels * this->m_args.channel_multiplier));
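+
+    // The output buffer soaks up stores for out-of-range output points; the
+    // input buffer supplies zeros for out-of-range input points.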
+
+    // Initialise the input buffer
+    for (unsigned int c = 0; c < input_channels; c++)
+    {
+      input_buffer[c] = static_cast<TInput>(0);
+    }
+
+    // For each output tile, construct the requisite set of pointers and call
+    // into the kernel.
+    for (unsigned int batch = 0; batch < batches; batch++)
+    {
+      // Get batch pointers
+      const auto inptr_batch = inptr + batch * ld_input_batch;
+      const auto outptr_batch = outptr + batch * ld_output_batch;
+
+      for (int start_out_i = start_out_height;
+           start_out_i < end_out_height;
+           start_out_i += static_cast<int>(strategy::output_rows))
+      {
+        const int end_out_i = start_out_i + strategy::output_rows;
+        const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
+        const int end_in_i = start_in_i + strategy::input_rows;
+
+        // Compute top/bottom padding
+        const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
+        const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
+        const unsigned int valid_output_rows = std::min(
+          end_out_i - start_out_i,
+          static_cast<int>(output_height) - start_out_i
+        );
+
+        // Fill the input pointer array with padding values
+        for (auto index = 0u; index < strategy::input_rows * strategy::input_cols; index++)
+        {
+          inptr_array[index] = input_buffer;
+        }
+
+        for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
+        {
+          const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left;
+          const int pad_left = -std::min(0, start_in_j);
+
+          // Compute how many output tiles we can compute with the direct kernel.
+          int n_direct_tiles = 0;
+          if (!pad_top && !pad_bottom && !pad_left)
+          {
+            // Determine the maximum number of tiles we could handle.
+            n_direct_tiles = (output_width - start_out_j) / strategy::output_cols;
+
+            // Continue to reduce this number as required to avoid reading
+            // padding on the right edge.
+            int end_in_j = start_in_j + n_direct_tiles * strategy::input_cols;
+            int pad_right = std::max(0, end_in_j - static_cast<int>(input_width));
+
+            while (pad_right && n_direct_tiles)
+            {
+              n_direct_tiles--;
+              end_in_j -= strategy::input_cols;
+              pad_right = std::max(0, end_in_j - static_cast<int>(input_width));
+            }
+          }
+
+          // Use the unpadded kernel if we can, otherwise use the padded one.
+          if (n_direct_tiles)
+          {
+            auto inptr = inptr_batch + start_in_i*ld_input_row + start_in_j*ld_input_col;
+            auto outptr = outptr_batch + start_out_i*ld_output_row + start_out_j*ld_output_col;
+            start_out_j += n_direct_tiles*strategy::output_cols;
+
+#ifdef CYCLE_PROFILING
+            auto p = prof.ScopedProfiler(PROFILE_KERNEL, 0);
+#endif
+            strat.direct_kernel(1, n_direct_tiles,
+                                inptr, ld_input_row, ld_input_col,
+                                outptr, ld_output_row, ld_output_col,
+                                parameters, this->m_args.input_channels,
+                                activation_min, activation_max);
+            continue;
+          }
+
+          const int end_out_j = start_out_j + strategy::output_cols;
+          const int end_in_j = start_in_j + strategy::input_cols;
+
+          const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
+          const unsigned int valid_output_cols = std::min(
+            end_out_j - start_out_j,
+            static_cast<int>(output_width) - start_out_j
+          );
+
+          // Construct the input pointer array - fill the array with pointers to
+          // the input buffer and then fill in the required values.
+          for (auto i = pad_top; i < strategy::input_rows - pad_bottom; i++)
+          {
+            // Can skip over the left padding because we will have either the
+            // same or less than the previous tile.
+            unsigned int j = pad_left;
+            const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
+            const TInput **ptrs = inptr_array + i * strategy::input_cols + j;
+            for (; j < strategy::input_cols - pad_right; j++)
+            {
+              *(ptrs++) = colptr;
+              colptr += ld_input_col;
+            }
+            for (; j < strategy::input_cols; j++)
+            {
+              *(ptrs++) = input_buffer;
+            }
+          }
+
+          // Construct the output pointer array.
+          TOutput **outptr_pos = outptr_array;
+          for (auto i = 0u; i < valid_output_rows; i++)
+          {
+            unsigned int j = 0u;
+            TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
+            for (; j < valid_output_cols; j++)
+            {
+              *(outptr_pos++) = colptr;
+              colptr += ld_output_col;
+            }
+            for (; j < strategy::output_cols; j++)
+            {
+              *(outptr_pos++) = output_buffer;
+            }
+          }
+          for (auto i = valid_output_rows; i < strategy::output_rows; i++)
+          {
+            for (auto j = 0u; j < strategy::output_cols; j++)
+            {
+              *(outptr_pos++) = output_buffer;
+            }
+          }
+
+          start_out_j += strategy::output_cols;
+
+#ifdef CYCLE_PROFILING
+          // TODO Work number
+          auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(0));
+#endif
+          strat.indirect_kernel(inptr_array, outptr_array, parameters,
+                                this->m_args.input_channels, activation_min, activation_max);
+        }
+      }
+    }
+  }
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
new file mode 100644
index 0000000..29f37c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class Strategy, unsigned int OutputRows, unsigned int OutputCols>
+class DepthwiseDepthfirstGenericBase :
+  public DepthwiseCommon<typename Strategy::input_type,
+                         typename Strategy::weight_type,
+                         typename Strategy::return_type>
+{
+  protected:
+
+  using TInput = typename Strategy::input_type;
+  using TWeight = typename Strategy::weight_type;
+  using TOutput = typename Strategy::return_type;
+  using TAccum = typename Strategy::bias_type;
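+
+  // As used below, the Strategy class is expected to provide the type
+  // aliases above, a vl_type tag, an n_output_points constant and a
+  // kernel() entry point.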
+
+  size_t sizeof_input_ptr_array(void) const
+  {
+    return sizeof(TInput *) * this->m_args.kernel_rows * this->m_args.kernel_cols * Strategy::n_output_points;
+  }
+
+  size_t sizeof_input_buffer(unsigned int n_channels) const
+  {
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(Strategy::vl_type);
+    const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+    return sizeof(TInput) * rounded_channels;
+  }
+
+  size_t sizeof_output_buffer(unsigned int n_channels) const
+  {
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(Strategy::vl_type);
+    const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+    return sizeof(TOutput) * rounded_channels;
+  }
+
+  unsigned int input_rows(void) const
+  {
+    return this->m_args.kernel_rows + (OutputRows - 1)*this->m_args.stride_rows;
+  }
+
+  unsigned int input_cols(void) const
+  {
+    return this->m_args.kernel_cols + (OutputCols - 1)*this->m_args.stride_cols;
+  }
+
+  void execute_tiles(
+    std::function<void(const TInput *const *, TOutput *const *)> tile_fn,
+    std::function<void(TInput *, unsigned int)> initialise_input_buffer,
+    const unsigned int batches,
+    const unsigned int input_height,
+    const unsigned int input_width,
+    const unsigned int input_channels,
+    const PaddingValues &padding,
+    const void *const _input,
+    const size_t ld_input_col,
+    const size_t ld_input_row,
+    const size_t ld_input_batch,
+    const unsigned int output_height,
+    const unsigned int output_width,
+    void *const _output,
+    const size_t ld_output_col,
+    const size_t ld_output_row,
+    const size_t ld_output_batch,
+    void *const _working_space,
+    const unsigned int thread_id,
+    const unsigned int n_threads
+  ) const
+  {
+    static_assert(OutputRows * OutputCols <= Strategy::n_output_points,
+                  "Too many output points for kernel.");
+
+    // Determine what portion of the work to do.
+    const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
+    const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
+    const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
+
+    // Cast input and output pointers into the right types
+    const TInput *const inptr = static_cast<const TInput *>(_input);
+    TOutput *const outptr = static_cast<TOutput *>(_output);
+
+    // Allocate portions of the working space
+    uint8_t *const working_space = static_cast<uint8_t *>(_working_space) + this->get_working_size(thread_id, input_channels);
+    const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space);
+    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space + this->sizeof_input_ptr_array());
+    TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + this->sizeof_input_ptr_array() + this->sizeof_output_buffer(input_channels * this->m_args.channel_multiplier));
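+
+    // As in DepthwiseDepthfirst, the output buffer soaks up stores for
+    // padded output points and the input buffer supplies padding values for
+    // out-of-range input points.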
+
+    // Create an array for the output pointers
+    TOutput * _outptr_array[Strategy::n_output_points];
+    TOutput **const outptr_array = _outptr_array;
+
+    // Initialise the input buffer
+    initialise_input_buffer(input_buffer, input_channels);
+
+    // For each output tile, construct the requisite set of pointers and call
+    // into the kernel.
+    for (unsigned int batch = 0; batch < batches; batch++)
+    {
+      // Get batch pointers
+      const auto inptr_batch = inptr + batch * ld_input_batch;
+      const auto outptr_batch = outptr + batch * ld_output_batch;
+
+      for (int start_out_i = start_out_height;
+           start_out_i < end_out_height;
+           start_out_i += static_cast<int>(OutputRows))
+      {
+        const int end_out_i = std::min(start_out_i + OutputRows,
+                                       output_height);
+
+        for (int start_out_j = 0;
+             start_out_j < static_cast<int>(output_width);
+             start_out_j += static_cast<int>(OutputCols))
+        {
+          const int end_out_j = std::min(start_out_j + OutputCols,
+                                         output_width);
+
+          // Fill the pointer arrays with pointers to the input/output buffers.
+          for (auto index = 0u;
+               index < (Strategy::n_output_points * this->m_args.kernel_rows * this->m_args.kernel_cols);
+               index++)
+          {
+            inptr_array[index] = input_buffer;
+          }
+          for (auto index = 0u; index < Strategy::n_output_points; index++)
+          {
+            outptr_array[index] = output_buffer;
+          }
+
+          // Construct the pointer arrays together. Note that the input pointer
+          // array is striped. Since the array has already been filled with
+          // pointers to the padding array we merely fill in the valid points
+          // as we get to them.
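+          // The pointer for kernel point (ki, kj) and output point p lives
+          // at index p + Strategy::n_output_points * (ki * kernel_cols + kj).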
+          unsigned int output_index = 0;
+          auto outptr_row = outptr_batch + start_out_i * ld_output_row + start_out_j * ld_output_col;
+          for (auto out_i = start_out_i; out_i < end_out_i; out_i++)
+          {
+            auto outptr_col = outptr_row;
+
+            // Compute the padding for this row of tiles.
+            const int start_in_i = out_i * this->m_args.stride_rows - padding.top;
+            const int end_in_i = start_in_i + this->m_args.kernel_rows;
+            const auto pad_top = static_cast<unsigned int>(std::max<int>(0, 0 - start_in_i));
+            const auto pad_bottom = static_cast<unsigned int>(std::max<int>(0, end_in_i - input_height));
+            const unsigned int valid_rows = this->m_args.kernel_rows - pad_top - pad_bottom;
+
+            for (auto out_j = start_out_j; out_j < end_out_j; out_j++, output_index++)
+            {
+              // Compute the output pointer.
+              outptr_array[output_index] = outptr_col;
+              outptr_col += ld_output_col;
+
+              // Compute the padding for this tile.
+              const int start_in_j = out_j * this->m_args.stride_cols - padding.left;
+              const int end_in_j = start_in_j + this->m_args.kernel_cols;
+              const auto pad_left = static_cast<unsigned int>(std::max<int>(0, 0 - start_in_j));
+              const auto pad_right = static_cast<unsigned int>(std::max<int>(0, end_in_j - input_width));
+              const unsigned int valid_cols = this->m_args.kernel_cols - pad_left - pad_right;
+
+              // Hence compute the input pointers.
+              auto input_index = output_index + Strategy::n_output_points * (pad_top * this->m_args.kernel_cols + pad_left);
+              auto inptr_row = inptr_batch + (start_in_i + pad_top) * ld_input_row + (start_in_j + pad_left) * ld_input_col;
+              for (auto in_i = 0u; in_i < valid_rows; in_i++)
+              {
+                auto inptr_col = inptr_row;
+                auto input_index_col = input_index;
+
+                for (auto in_j = 0u; in_j < valid_cols; in_j++)
+                {
+                  inptr_array[input_index_col] = inptr_col;
+                  inptr_col += ld_input_col;
+                  input_index_col += Strategy::n_output_points;
+                }
+
+                inptr_row += ld_input_row;
+                input_index += Strategy::n_output_points * this->m_args.kernel_cols;
+              }
+            }
+
+            outptr_row += ld_output_row;
+          }
+
+          tile_fn(inptr_array, outptr_array);
+        }
+      }
+    }
+  }
+
+  public:
+  DepthwiseDepthfirstGenericBase(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
+  {
+  }
+
+  DepthwiseDepthfirstGenericBase(DepthwiseDepthfirstGenericBase &) = delete;
+  DepthwiseDepthfirstGenericBase &operator=(DepthwiseDepthfirstGenericBase &) = delete;
+
+  size_t get_storage_size(void) const override
+  {
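+    // Unlike DepthwiseDepthfirst, no bias is packed alongside the weights;
+    // biases are passed to the kernel separately by the derived classes.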
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(Strategy::vl_type);
+    const auto rounded_channels = arm_gemm::roundup(this->m_args.input_channels, vl);
+    return (this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight);
+  }
+
+  void pack_parameters(void *_buffer, const void *, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
+  {
+    // Cast the pointers
+    TWeight *buffer = static_cast<TWeight *>(_buffer);
+    const TWeight *const weights = static_cast<const TWeight *>(_weights);
+
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(Strategy::vl_type);
+    ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels : ld_weight_col;
+    ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+
+    for (unsigned int n = 0; n < this->m_args.input_channels; n += vl)
+    {
+      const unsigned int todo = std::min(vl, this->m_args.input_channels - n);
+
+      // Copy each of the weights in turn
+      auto weights_row = weights + n;
+      for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
+      {
+        auto weights_col = weights_row;
+
+        for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
+        {
+          for (unsigned int m = 0; m < todo; m++)
+          {
+            buffer[m] = weights_col[m];
+          }
+          buffer += vl;
+
+          weights_col += ld_weight_col;
+        }
+
+        weights_row += ld_weight_row;
+      }
+    }
+  }
+
+  size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+  {
+    const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
+    return n_threads * (sizeof_input_ptr_array() +
+                        sizeof_output_buffer(n_output_channels) +
+                        sizeof_input_buffer(n_channels));
+  }
+};
+
+template <class Strategy, unsigned int OutputRows, unsigned int OutputCols>
+class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>
+{
+  using Parent = DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>;
+  using TInput = typename Parent::TInput;
+  using TWeight = typename Parent::TWeight;
+  using TAccum = typename Parent::TAccum;
+  using TOutput = typename Parent::TOutput;
+
+  const TAccum *m_bias = nullptr;
+
+  public:
+  DepthwiseDepthfirstGeneric(const DepthwiseArgs &args) : Parent(args)
+  {
+  }
+
+  DepthwiseDepthfirstGeneric(DepthwiseDepthfirstGeneric &) = delete;
+  DepthwiseDepthfirstGeneric &operator=(DepthwiseDepthfirstGeneric &) = delete;
+
+  void pack_parameters(void *buffer, const void *bias, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+  {
+    m_bias = static_cast<const TAccum *>(bias);
+    Parent::pack_parameters(buffer, bias, weights, ld_weight_col, ld_weight_row);
+  }
+
+  using DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>::execute;
+  void execute(
+    const unsigned int batches,
+    const unsigned int input_height,
+    const unsigned int input_width,
+    const unsigned int input_channels,
+    const PaddingValues &padding,
+    const void *const _input,
+    const size_t ld_input_col,
+    const size_t ld_input_row,
+    const size_t ld_input_batch,
+    const void *const parameters,
+    const unsigned int output_height,
+    const unsigned int output_width,
+    void *const _output,
+    const size_t ld_output_col,
+    const size_t ld_output_row,
+    const size_t ld_output_batch,
+    void *const _working_space,
+    const unsigned int thread_id,
+    const unsigned int n_threads
+  ) const override
+  {
+    Strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+    arm_gemm::profiler prof;
+#endif
+
+    // Compute activation values
+    TAccum activation_min, activation_max;
+    if (std::numeric_limits<TAccum>::is_integer)
+    {
+      activation_min = std::numeric_limits<TAccum>::min();
+      activation_max = std::numeric_limits<TAccum>::max();
+    }
+    else
+    {
+      activation_min = static_cast<TAccum>(-std::numeric_limits<float>::infinity());
+      activation_max = static_cast<TAccum>(std::numeric_limits<float>::infinity());
+    }
+
+    switch (this->m_args.activation.type)
+    {
+      case arm_gemm::Activation::Type::BoundedReLU:
+        activation_max = static_cast<TAccum>(this->m_args.activation.param1);
+        // Fall through
+      case arm_gemm::Activation::Type::ReLU:
+        activation_min = static_cast<TAccum>(0);
+        break;
+      default:
+        break;
+    }
+
+    // Create a function to initialise the input buffer
+    const auto initialise_input_buffer = [] (TInput *const buffer, const unsigned int n) {
+      std::memset(buffer, 0, n * sizeof(TInput));
+    };
+
+    // Create a function to execute a tile of work
+    const auto tile_fn = [&] (const TInput *const *const inptrs, TOutput *const * const outptrs) {
+#ifdef CYCLE_PROFILING
+      auto p = prof.ScopedProfiler(
+        PROFILE_KERNEL,
+        (unsigned long) (OutputRows * OutputCols * this->m_args.kernel_rows * this->m_args.kernel_cols)
+      );
+#endif
+      strat.kernel(inptrs, outptrs, parameters, m_bias,
+                   this->m_args.kernel_rows * this->m_args.kernel_cols,
+                   this->m_args.input_channels, activation_min, activation_max);
+    };
+
+    // Call into a parent utility function to do the actual work.
+    Parent::execute_tiles(
+      tile_fn, initialise_input_buffer,
+      batches, input_height, input_width, input_channels, padding,
+      _input, ld_input_col, ld_input_row, ld_input_batch,
+      output_height, output_width,
+      _output, ld_output_col, ld_output_row, ld_output_batch,
+      _working_space, thread_id, n_threads
+    );
+  }
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp
new file mode 100644
index 0000000..656e441
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class strategy>
+class DepthwiseDepthfirstGenericWithMultiplierBase :
+  public DepthwiseCommon<typename strategy::input_type,
+                         typename strategy::weight_type,
+                         typename strategy::return_type>
+{
+  protected:
+
+  using TInput = typename strategy::input_type;
+  using TWeight = typename strategy::weight_type;
+  using TOutput = typename strategy::return_type;
+  using TAccum = typename strategy::bias_type;
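+
+  // As used below, the strategy is expected to provide the type aliases
+  // above, a vl_type tag, run-time tile geometry (output_rows(),
+  // output_cols() and output_col_regs()) and a kernel() entry point.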
+
+  unsigned int kernel_points(void) const
+  {
+    return this->m_args.kernel_rows * this->m_args.kernel_cols;
+  }
+
+  unsigned int input_rows(void) const
+  {
+    return (strategy::output_rows() - 1) * this->m_args.stride_rows + this->m_args.kernel_rows;
+  }
+
+  unsigned int input_cols(void) const
+  {
+    return (strategy::output_cols() - 1) * this->m_args.stride_cols + this->m_args.kernel_cols;
+  }
+
+  size_t sizeof_inptr_array(void) const
+  {
+    return sizeof(TInput *) * kernel_points() * strategy::output_rows();
+  }
+
+  size_t sizeof_input_samples(void) const
+  {
+    // We have a sample for each kernel point, for each point of the output array.
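+    // (16 / sizeof(TAccum)) is the number of elements which fit in one
+    // 16-byte (quad) register.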
+    return sizeof(TInput) * kernel_points() *
+                            strategy::output_rows() *
+                            strategy::output_col_regs() *
+                            (16 / sizeof(TAccum));
+  }
+
+  size_t sizeof_outptr_array(void) const
+  {
+    return sizeof(TOutput *) * strategy::output_rows() * strategy::output_cols();
+  }
+
+  size_t sizeof_output_buffer(unsigned int n_channels) const
+  {
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
+    const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+    return sizeof(TOutput) * rounded_channels;
+  }
+
+  void pack_weights(TWeight *buffer, const TWeight *weights, size_t ld_weight_col, size_t ld_weight_row) const
+  {
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(strategy::vl_type);
+    ld_weight_col = (ld_weight_col == 0) ? this->m_args.channel_multiplier * this->m_args.input_channels : ld_weight_col;
+    ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+
+    for (unsigned int in_c = 0; in_c < this->m_args.input_channels; in_c++)
+    {
+      for (unsigned int n = 0; n < this->m_args.channel_multiplier; n += vl)
+      {
+        const unsigned int out_c = in_c * this->m_args.channel_multiplier + n;
+        const unsigned int todo = std::min(vl, this->m_args.channel_multiplier - n);
+
+        // Copy each of the weights in turn
+        auto weights_row = weights + out_c;
+        for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
+        {
+          auto weights_col = weights_row;
+
+          for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
+          {
+            for (unsigned int m = 0; m < todo; m++)
+            {
+              buffer[m] = weights_col[m];
+            }
+            buffer += vl;
+
+            weights_col += ld_weight_col;
+          }
+
+          weights_row += ld_weight_row;
+        }
+      }
+    }
+  }
+
+  void execute_tiles(
+    std::function<void(const TInput **, TOutput **, const TWeight *, unsigned int, unsigned int)> tile_fn,
+    const TInput pad_value,
+    const unsigned int batches,
+    const unsigned int input_height,
+    const unsigned int input_width,
+    const unsigned int input_channels,
+    const PaddingValues &padding,
+    const void *const _input,
+    const size_t ld_input_col,
+    const size_t ld_input_row,
+    const size_t ld_input_batch,
+    const void *const parameters,
+    const unsigned int output_height,
+    const unsigned int output_width,
+    void *const _output,
+    const size_t ld_output_col,
+    const size_t ld_output_row,
+    const size_t ld_output_batch,
+    void *const _working_space,
+    const unsigned int thread_id,
+    const unsigned int n_threads
+  ) const
+  {
+#ifdef CYCLE_PROFILING
+    arm_gemm::profiler prof;
+#endif
+
+    // Determine what portion of the work to do.
+    const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
+    const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
+    const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
+
+    // Need a stride over blocks of parameters
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(strategy::vl_type);
+    const unsigned int param_stride = arm_gemm::roundup(this->m_args.channel_multiplier, vl) * kernel_points();
+
+    // Cast input and output pointers into the right types
+    const TInput *const inptr = static_cast<const TInput *>(_input);
+    TOutput *const outptr = static_cast<TOutput *>(_output);
+
+    // Allocate portions of the working space
+    uint8_t *working_space = static_cast<uint8_t *>(_working_space) +
+                             get_working_size(thread_id, input_channels);
+
+    const TInput **inptrs = reinterpret_cast<const TInput **>(working_space);
+    working_space += sizeof_inptr_array();
+
+    // To simplify the kernel, we process padded or non-NCHW-ordered input into
+    // a form which can be consumed by the kernel. This data is stored here and
+    // passed into the kernel as an array of N pointers (one per row of the
+    // input).
+    TInput *rearranged_input = reinterpret_cast<TInput *>(working_space);
+    working_space += sizeof_input_samples();
+
+    TOutput **outptr_array = reinterpret_cast<TOutput **>(working_space);
+    working_space += sizeof_outptr_array();
+
+    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
+
+    // TODO Dynamically change the input pointer array in cases where we could
+    // read directly from the input tensor; for now though assume we will
+    // always read from the sample array.
+    {
+      auto my_inptrs = inptrs;
+      auto my_input_samples = rearranged_input;
+
+      // For each kernel point; for each row of output; for each register of
+      // values containing a QUAD of source values.
+      const unsigned int quad_length = 16 / sizeof(TAccum);
+
+      for (auto p = 0u; p < kernel_points() * strategy::output_rows(); p++)
+      {
+        *(my_inptrs)++ = my_input_samples;
+        my_input_samples += arm_gemm::roundup(strategy::output_cols(), quad_length);
+      }
+    }
+
+    // For each output tile, construct the requisite set of pointers and call
+    // into the kernel.
+    for (unsigned int batch = 0; batch < batches; batch++)
+    {
+      // Get batch pointers
+      const auto inptr_batch = inptr + batch * ld_input_batch;
+      const auto outptr_batch = outptr + batch * ld_output_batch;
+
+      for (int start_out_i = start_out_height;
+           start_out_i < end_out_height;
+           start_out_i += static_cast<int>(strategy::output_rows()))
+      {
+        const int end_out_i = std::min(start_out_i + static_cast<int>(strategy::output_rows()), end_out_height);
+        const int start_in_i = start_out_i * this->m_args.stride_rows - padding.top;
+        const int end_in_i = start_in_i + input_rows();
+
+        // Compute top/bottom padding
+        const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
+        const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
+        const unsigned int valid_output_rows = std::min(
+          end_out_i - start_out_i,
+          static_cast<int>(output_height) - start_out_i
+        );
+
+        const int pad_rows = pad_top + pad_bottom;
+
+        for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
+        {
+          const int start_in_j = start_out_j * this->m_args.stride_cols - this->m_args.padding.left;
+          const int pad_left = -std::min(0, start_in_j);
+
+          const int end_out_j = start_out_j + strategy::output_cols();
+          const int end_in_j = start_in_j + input_cols();
+
+          const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
+          const unsigned int valid_output_cols = std::min(
+            end_out_j - start_out_j,
+            static_cast<int>(output_width) - start_out_j
+          );
+
+          const int pad_cols = pad_left + pad_right;
+
+          // Construct the output pointer array.
+          TOutput **outptr_pos = outptr_array;
+          for (auto i = 0u; i < valid_output_rows; i++)
+          {
+            unsigned int j = 0u;
+            TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
+            for (; j < valid_output_cols; j++)
+            {
+              *(outptr_pos++) = colptr;
+              colptr += ld_output_col;
+            }
+            for (; j < strategy::output_cols(); j++)
+            {
+              *(outptr_pos++) = output_buffer;
+            }
+          }
+          for (auto i = valid_output_rows; i < strategy::output_rows(); i++)
+          {
+            for (auto j = 0u; j < strategy::output_cols(); j++)
+            {
+              *(outptr_pos++) = output_buffer;
+            }
+          }
+
+          start_out_j += strategy::output_cols();
+
+          const TWeight *params = static_cast<const TWeight *>(parameters);
+
+          // Fill the input samples with padding. We can do this outside of
+          // the channel loop, as the position of padding isn't going to
+          // change as a function of channel.
+          for (auto i = 0u; i < kernel_points() * strategy::output_rows() * strategy::output_cols(); i++)
+          {
+            rearranged_input[i] = pad_value;
+          }
+
+          // Loop over the input channels
+          for (unsigned int in_c = 0; in_c < input_channels; in_c++)
+          {
+            auto inptr_row = inptr_batch + in_c +
+                             (start_in_i + pad_top) * ld_input_row +
+                             (start_in_j + pad_left) * ld_input_col;
+
+            // Construct the array of input samples; for each point of the
+            // kernel we provide an input value for each output point.
+            auto input_samples = rearranged_input;
+            for (auto ki = 0u; ki < this->m_args.kernel_rows; ki++)
+            {
+              for (auto kj = 0u; kj < this->m_args.kernel_cols; kj++)
+              {
+                // Copy the pointer for the input samples associated with this
+                // kernel point. Then update the main pointer to account for
+                // this point.
+                auto point_input_samples = input_samples;
+                input_samples += strategy::output_rows() * strategy::output_cols();
+
+                int ii = static_cast<int>(ki) - static_cast<int>(pad_top);
+                for (auto oi = 0u;
+                     oi < strategy::output_rows() &&
+                     ii < static_cast<int>(input_rows()) - pad_rows;
+                     oi++, ii += this->m_args.stride_rows)
+                {
+                  if (0 <= ii) // Fill in values only if this row is in range.
+                  {
+                    int ij = static_cast<int>(kj) - static_cast<int>(pad_left);
+                    for (auto oj = 0u;
+                         oj < strategy::output_cols() &&
+                         ij < static_cast<int>(input_cols()) - pad_cols;
+                         oj++, ij += this->m_args.stride_cols)
+                    {
+                      if (0 <= ij) // Sample if the point is in range.
+                      {
+                        point_input_samples[oj] = *(inptr_row + ii*ld_input_row + ij*ld_input_col);
+                      }
+                    }
+                  }
+
+                  point_input_samples += strategy::output_cols();
+                }
+              }
+            }
+
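+            // Invoke the kernel for this input channel; it computes
+            // channel_multiplier output channels' worth of results.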
+            tile_fn(inptrs, outptr_array, params, in_c, in_c*this->m_args.channel_multiplier);
+
+            // Progress the output pointers
+            TOutput **outptr_pos = outptr_array;
+            for (auto i = 0u; i < strategy::output_rows() * strategy::output_cols(); i++)
+            {
+              outptr_pos[i] += this->m_args.channel_multiplier;
+            }
+
+            // Progress the pointer into the parameters
+            params += param_stride;
+          }
+        }
+      }
+    }
+  }
+
+  public:
+  DepthwiseDepthfirstGenericWithMultiplierBase(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
+  {
+  }
+
+  DepthwiseDepthfirstGenericWithMultiplierBase(DepthwiseDepthfirstGenericWithMultiplierBase &) = delete;
+  DepthwiseDepthfirstGenericWithMultiplierBase &operator=(DepthwiseDepthfirstGenericWithMultiplierBase &) = delete;
+
+  size_t get_storage_size(void) const override
+  {
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(strategy::vl_type);
+    const auto rounded_channels = this->m_args.input_channels * arm_gemm::roundup(this->m_args.channel_multiplier, vl);
+    return kernel_points() * rounded_channels * sizeof(TWeight);
+  }
+
+  size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+  {
+    const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
+    return n_threads * (sizeof_inptr_array() +
+                        sizeof_input_samples() +
+                        sizeof_outptr_array() +
+                        sizeof_output_buffer(n_output_channels));
+  }
+};
+
+template <class strategy>
+class DepthwiseDepthfirstGenericWithMultiplier : public DepthwiseDepthfirstGenericWithMultiplierBase<strategy>
+{
+  using TInput = typename strategy::input_type;
+  using TWeight = typename strategy::weight_type;
+  using TOutput = typename strategy::return_type;
+  using TAccum = typename strategy::bias_type;
+
+  using Parent = DepthwiseDepthfirstGenericWithMultiplierBase<strategy>;
+
+  const TAccum *m_biases;  // Pointer to bias vector
+
+  public:
+  DepthwiseDepthfirstGenericWithMultiplier(const DepthwiseArgs &args)
+    : Parent(args), m_biases(nullptr)
+  {
+  }
+
+  DepthwiseDepthfirstGenericWithMultiplier(DepthwiseDepthfirstGenericWithMultiplier &) = delete;
+  DepthwiseDepthfirstGenericWithMultiplier &operator=(DepthwiseDepthfirstGenericWithMultiplier &) = delete;
+
+  void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+  {
+    m_biases = static_cast<const TAccum *>(biases);
+    Parent::pack_weights(static_cast<TWeight *>(buffer), static_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row);
+  }
+
+  using DepthwiseDepthfirstGenericWithMultiplierBase<strategy>::execute;
+  void execute(
+    const unsigned int batches,
+    const unsigned int input_height,
+    const unsigned int input_width,
+    const unsigned int input_channels,
+    const PaddingValues &padding,
+    const void *const _input,
+    const size_t ld_input_col,
+    const size_t ld_input_row,
+    const size_t ld_input_batch,
+    const void *const parameters,
+    const unsigned int output_height,
+    const unsigned int output_width,
+    void *const _output,
+    const size_t ld_output_col,
+    const size_t ld_output_row,
+    const size_t ld_output_batch,
+    void *const _working_space,
+    const unsigned int thread_id,
+    const unsigned int n_threads
+  ) const override
+  {
+    strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+    arm_gemm::profiler prof;
+#endif
+
+    // Compute activation values
+    TAccum activation_min, activation_max;
+    if (std::numeric_limits<TAccum>::is_integer)
+    {
+      activation_min = std::numeric_limits<TAccum>::min();
+      activation_max = std::numeric_limits<TAccum>::max();
+    }
+    else
+    {
+      activation_min = static_cast<TAccum>(-std::numeric_limits<float>::infinity());
+      activation_max = static_cast<TAccum>(std::numeric_limits<float>::infinity());
+    }
+
+    switch (this->m_args.activation.type)
+    {
+      case arm_gemm::Activation::Type::BoundedReLU:
+        activation_max = static_cast<TAccum>(this->m_args.activation.param1);
+        // Fall through
+      case arm_gemm::Activation::Type::ReLU:
+        activation_min = static_cast<TAccum>(0);
+        break;
+      default:
+        break;
+    }
+
+    // Get a function to call for each point of the output
+    auto tile_fn = [&] (const TInput **inptrs,
+                        TOutput **outptrs,
+                        const TWeight *weights,
+                        const unsigned int,
+                        const unsigned int start_output_channel) {
+#ifdef CYCLE_PROFILING
+      auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows() * strategy::output_cols() * this->m_args.channel_multiplier * this->m_args.kernel_rows * this->m_args.kernel_cols));
+#endif
+      strat.kernel(
+        inptrs, outptrs, weights,
+        m_biases ? m_biases + start_output_channel : nullptr,
+        this->kernel_points(), this->m_args.channel_multiplier,
+        activation_min, activation_max
+      );
+    };
+
+    Parent::execute_tiles(
+      tile_fn, 0.0f,
+      batches, input_height, input_width, input_channels, padding,
+      _input, ld_input_col, ld_input_row, ld_input_batch,
+      parameters,
+      output_height, output_width,
+      _output, ld_output_col, ld_output_row, ld_output_batch,
+      _working_space, thread_id, n_threads
+    );
+  }
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp
new file mode 100644
index 0000000..d42382e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+#include "depthwise_depthfirst_generic_multiplier.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class strategy>
+class DepthwiseDepthfirstGenericWithMultiplierQuantized : public DepthwiseDepthfirstGenericWithMultiplierBase<strategy>
+{
+  using TInput = typename strategy::input_type;
+  using TWeight = typename strategy::weight_type;
+  using TOutput = typename strategy::return_type;
+  using TAccum = typename strategy::bias_type;
+
+  using Parent = DepthwiseDepthfirstGenericWithMultiplierBase<strategy>;
+
+  arm_gemm::Requantize32 m_qp;
+
+  public:
+  DepthwiseDepthfirstGenericWithMultiplierQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp)
+    : Parent(args), m_qp(qp)
+  {
+  }
+
+  DepthwiseDepthfirstGenericWithMultiplierQuantized(DepthwiseDepthfirstGenericWithMultiplierQuantized &) = delete;
+  DepthwiseDepthfirstGenericWithMultiplierQuantized &operator=(DepthwiseDepthfirstGenericWithMultiplierQuantized &) = delete;
+
+  void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+  {
+    m_qp.bias = static_cast<const TAccum *>(biases);
+    Parent::pack_weights(static_cast<TWeight *>(buffer), static_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row);
+  }
+
+  using Parent::execute;
+  void execute(
+    const unsigned int batches,
+    const unsigned int input_height,
+    const unsigned int input_width,
+    const unsigned int input_channels,
+    const PaddingValues &padding,
+    const void *const _input,
+    const size_t ld_input_col,
+    const size_t ld_input_row,
+    const size_t ld_input_batch,
+    const void *const parameters,
+    const unsigned int output_height,
+    const unsigned int output_width,
+    void *const _output,
+    const size_t ld_output_col,
+    const size_t ld_output_row,
+    const size_t ld_output_batch,
+    void *const _working_space,
+    const unsigned int thread_id,
+    const unsigned int n_threads
+  ) const override
+  {
+    strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+    arm_gemm::profiler prof;
+#endif
+
+    // Get a function to call for each point of the output
+    auto tile_fn = [&] (const TInput **inptrs,
+                        TOutput **outptrs,
+                        const TWeight *weights,
+                        const unsigned int,
+                        const unsigned int start_output_channel) {
+#ifdef CYCLE_PROFILING
+      auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows() * strategy::output_cols() * this->m_args.channel_multiplier * this->m_args.kernel_rows * this->m_args.kernel_cols));
+#endif
+      strat.kernel(
+        inptrs, outptrs, weights,
+        m_qp.bias == nullptr ? nullptr : m_qp.bias + start_output_channel,
+        this->kernel_points(),
+        this->m_args.channel_multiplier,
+        m_qp.per_channel_left_shifts == nullptr ? nullptr : m_qp.per_channel_left_shifts + start_output_channel,
+        m_qp.per_channel_muls == nullptr ? nullptr : m_qp.per_channel_muls + start_output_channel,
+        m_qp.per_channel_right_shifts == nullptr ? nullptr : m_qp.per_channel_right_shifts + start_output_channel,
+        m_qp
+      );
+    };
+
+    Parent::execute_tiles(
+      tile_fn, m_qp.a_offset,
+      batches, input_height, input_width, input_channels, padding,
+      _input, ld_input_col, ld_input_row, ld_input_batch,
+      parameters,
+      output_height, output_width,
+      _output, ld_output_col, ld_output_row, ld_output_batch,
+      _working_space, thread_id, n_threads
+    );
+  }
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp
new file mode 100644
index 0000000..cfb0d4b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise_depthfirst_generic.hpp"
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class Strategy, unsigned OutputRows, unsigned int OutputCols>
+class DepthwiseDepthfirstGenericQuantized : public DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>
+{
+  using Parent = DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>;
+  using TInput = typename Parent::TInput;
+  using TAccum = typename Parent::TAccum;
+  using TOutput = typename Parent::TOutput;
+
+  Requantize32 m_qp;
+
+  public:
+  DepthwiseDepthfirstGenericQuantized(const DepthwiseArgs &args, const Requantize32 &qp)
+    : Parent(args), m_qp(qp)
+  {
+  }
+
+  DepthwiseDepthfirstGenericQuantized(DepthwiseDepthfirstGenericQuantized &) = delete;
+  DepthwiseDepthfirstGenericQuantized &operator=(DepthwiseDepthfirstGenericQuantized &) = delete;
+
+  void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+  {
+    m_qp.bias = static_cast<const TAccum *>(biases);
+    Parent::pack_parameters(buffer, biases, weights, ld_weight_col, ld_weight_row);
+  }
+
+  using DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>::execute;
+  void execute(
+    const unsigned int batches,
+    const unsigned int input_height,
+    const unsigned int input_width,
+    const unsigned int input_channels,
+    const PaddingValues &padding,
+    const void *const _input,
+    const size_t ld_input_col,
+    const size_t ld_input_row,
+    const size_t ld_input_batch,
+    const void *const parameters,
+    const unsigned int output_height,
+    const unsigned int output_width,
+    void *const _output,
+    const size_t ld_output_col,
+    const size_t ld_output_row,
+    const size_t ld_output_batch,
+    void *const _working_space,
+    const unsigned int thread_id,
+    const unsigned int n_threads
+  ) const override
+  {
+    Strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+    arm_gemm::profiler prof;
+#endif
+
+    // Create a function to initialise the input buffer
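+    // Padding elements are set to the input zero-point (a_offset) so they
+    // contribute zero once the offset is removed; note that std::memset fills
+    // byte-by-byte, which is correct provided TInput is an 8-bit type (as it
+    // is for these quantized kernels).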
+    const auto initialise_input_buffer = [this] (TInput *const buffer, const unsigned int n) {
+      std::memset(buffer, static_cast<TInput>(m_qp.a_offset), n * sizeof(TInput));
+    };
+
+    // Create a function to execute a tile of work
+    const auto tile_fn = [&] (const TInput *const *const inptrs, TOutput *const * const outptrs) {
+#ifdef CYCLE_PROFILING
+      auto p = prof.ScopedProfiler(
+        PROFILE_KERNEL,
+        (unsigned long) (OutputRows * OutputCols * this->m_args.kernel_rows * this->m_args.kernel_cols)
+      );
+#endif
+      strat.kernel(inptrs, outptrs, parameters, m_qp,
+                   this->m_args.kernel_rows * this->m_args.kernel_cols,
+                   this->m_args.input_channels);
+    };
+
+    // Call into a parent utility function to do the actual work.
+    Parent::execute_tiles(
+      tile_fn, initialise_input_buffer,
+      batches, input_height, input_width, input_channels, padding,
+      _input, ld_input_col, ld_input_row, ld_input_batch,
+      output_height, output_width,
+      _output, ld_output_col, ld_output_row, ld_output_batch,
+      _working_space, thread_id, n_threads
+    );
+  }
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
new file mode 100644
index 0000000..7c64e0b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace common
+{
+  template <typename strategy, typename F>
+  void depthwise_multiplier_execute(
+    const F execute_tile,
+    typename strategy::input_type pad_value,
+    const DepthwiseArgs &args,
+    const unsigned int batches,
+    const unsigned int input_height,
+    const unsigned int input_width,
+    const unsigned int input_channels,
+    const PaddingValues &padding,
+    const void *const _input,
+    const size_t ld_input_col,
+    const size_t ld_input_row,
+    const size_t ld_input_batch,
+    const void *const parameters,
+    const size_t param_stride,
+    const unsigned int output_height,
+    const unsigned int output_width,
+    void *const _output,
+    const size_t ld_output_col,
+    const size_t ld_output_row,
+    const size_t ld_output_batch,
+    void *const _working_space,
+    const unsigned int thread_id,
+    const unsigned int n_threads
+  )
+  {
+    using TInput = typename strategy::input_type;
+    using TOutput = typename strategy::return_type;
+
+    // Determine what portion of the work to do.
+    const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
+    const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
+    const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
+
+    // Cast input and output pointers into the right types
+    const TInput *const inptr = static_cast<const TInput *>(_input);
+    TOutput *const outptr = static_cast<TOutput *>(_output);
+
+    // To simplify the kernel, we process padded or non-NCHW-ordered input into
+    // a form which can be consumed by the kernel. This data is stored here and
+    // passed into the kernel as an array of N pointers (one per row of the
+    // input).
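+    // Each 128-bit quad holds 16 / sizeof(TInput) elements, so each buffer
+    // row is input_col_quads quads wide.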
+    TInput rearranged_input[strategy::input_rows][strategy::input_col_quads * (16 / sizeof(TInput))];
+    const TInput *inptrs[strategy::input_rows];
+
+    // Create an array for the output pointers
+    TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
+    TOutput **const outptr_array = _outptr_array;
+
+    // Allocate portions of the working space
+    uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
+    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
+
+    // For each output tile, construct the requisite set of pointers and call
+    // into the kernel.
+    for (unsigned int batch = 0; batch < batches; batch++)
+    {
+      // Get batch pointers
+      const auto inptr_batch = inptr + batch * ld_input_batch;
+      const auto outptr_batch = outptr + batch * ld_output_batch;
+
+      for (int start_out_i = start_out_height;
+           start_out_i < end_out_height;
+           start_out_i += static_cast<int>(strategy::output_rows))
+      {
+        const int end_out_i = start_out_i + strategy::output_rows;
+        const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
+        const int end_in_i = start_in_i + strategy::input_rows;
+
+        // Compute top/bottom padding
+        const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
+        const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
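+        // For example, if start_in_i is -1 (the tile starts one row above the
+        // image) then pad_top is 1; rows hanging off the bottom of the image
+        // are counted by pad_bottom in the same way.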
+        const unsigned int valid_output_rows = std::min(
+          end_out_i - start_out_i,
+          static_cast<int>(output_height) - start_out_i
+        );
+
+        for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
+        {
+          const int start_in_j = start_out_j * strategy::stride_cols - args.padding.left;
+          const int pad_left = -std::min(0, start_in_j);
+
+          const int end_out_j = start_out_j + strategy::output_cols;
+          const int end_in_j = start_in_j + strategy::input_cols;
+
+          const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
+          const unsigned int valid_output_cols = std::min(
+            end_out_j - start_out_j,
+            static_cast<int>(output_width) - start_out_j
+          );
+
+          // Construct the output pointer array.
+          TOutput **outptr_pos = outptr_array;
+          for (auto i = 0u; i < valid_output_rows; i++)
+          {
+            unsigned int j = 0u;
+            TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
+            for (; j < valid_output_cols; j++)
+            {
+              *(outptr_pos++) = colptr;
+              colptr += ld_output_col;
+            }
+            for (; j < strategy::output_cols; j++)
+            {
+              *(outptr_pos++) = output_buffer;
+            }
+          }
+          for (auto i = valid_output_rows; i < strategy::output_rows; i++)
+          {
+            for (auto j = 0u; j < strategy::output_cols; j++)
+            {
+              *(outptr_pos++) = output_buffer;
+            }
+          }
+
+          start_out_j += strategy::output_cols;
+
+          const uint8_t *params = static_cast<const uint8_t *>(parameters);
+
+          // Loop over the input channels
+          for (unsigned int in_c = 0; in_c < input_channels; in_c++)
+          {
+            // Construct the input array: first fill it with padding values and
+            // then fill in the correct values.
+            for (unsigned int i = 0; i < strategy::input_rows; i++)
+            {
+              for (unsigned int j = 0;
+                   j < (16 / sizeof(TInput)) * strategy::input_col_quads; j++)
+              {
+                rearranged_input[i][j] = pad_value;
+              }
+              inptrs[i] = rearranged_input[i];
+            }
+
+            auto inptr_row = inptr_batch + in_c +
+                             (start_in_i + pad_top) * ld_input_row +
+                             (start_in_j + pad_left) * ld_input_col;
+            if (ld_input_col == 1 && !pad_left &&
+                start_in_j + (16 / sizeof(TInput)) * strategy::input_col_quads < input_width)
+            {
+              // The input tensor is already in NCHW format, and we're reading
+              // an unpadded section of it - allow the kernel to read it
+              // directly.
+              for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
+              {
+                inptrs[i] = inptr_row;
+                inptr_row += ld_input_row;
+              }
+            }
+            else
+            {
+              // Either the input tensor isn't in NCHW format, or we're reading
+              // a padded section. Copy the relevant portion of the input here
+              // and allow the kernel to read this.
+              for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
+              {
+                auto inptr_col = inptr_row;
+                for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
+                {
+                  rearranged_input[i][j] = *inptr_col;
+                  inptr_col += ld_input_col;
+                }
+                inptr_row += ld_input_row;
+              }
+            }
+
+            execute_tile(inptrs, outptr_array, params);
+
+            // Progress the output pointers
+            TOutput **outptr_pos = outptr_array;
+            for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
+            {
+              outptr_pos[i] += args.channel_multiplier;
+            }
+
+            // Progress the pointer into the parameters
+            params += param_stride;
+          }
+        }
+      }
+    }
+  }
+}  // namespace common
+
+template <class strategy>
+class DepthwiseDepthfirstWithMultiplier :
+  public DepthwiseCommon<typename strategy::input_type,
+                         typename strategy::weight_type,
+                         typename strategy::return_type>
+{
+  using TInput = typename strategy::input_type;
+  using TWeight = typename strategy::weight_type;
+  using TOutput = typename strategy::return_type;
+  using TAccum = typename strategy::bias_type;
+
+  size_t sizeof_output_buffer(unsigned int n_channels) const
+  {
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
+    const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+    return sizeof(TOutput) * rounded_channels;
+  }
+
+  public:
+  DepthwiseDepthfirstWithMultiplier(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
+  {
+  }
+
+  DepthwiseDepthfirstWithMultiplier(DepthwiseDepthfirstWithMultiplier &) = delete;
+  DepthwiseDepthfirstWithMultiplier &operator=(DepthwiseDepthfirstWithMultiplier &) = delete;
+
+  size_t get_storage_size(void) const override
+  {
+    // TODO What if we insert extra padding? Biases are a different size to the inputs, ...
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
+    const auto rounded_channels = this->m_args.input_channels * arm_gemm::roundup(this->m_args.channel_multiplier, vl);
+    return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight);
+  }
+
+  void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
+  {
+    // TODO What if the kernel needs a different packing function?
+
+    // Cast the pointers
+    float *buffer = static_cast<float *>(_buffer);
+    const float *biases = static_cast<const float *>(_biases);
+    const float *const weights = static_cast<const float *>(_weights);
+
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
+    ld_weight_col = (ld_weight_col == 0) ? this->m_args.channel_multiplier * this->m_args.input_channels : ld_weight_col;
+    ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+
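+    // The packed buffer holds, for each group of (up to) vl output channels:
+    // one vector of biases followed by one vector of weights per kernel
+    // point, in row-major kernel order.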
+    for (unsigned int in_c = 0; in_c < this->m_args.input_channels; in_c++)
+    {
+      for (unsigned int n = 0; n < this->m_args.channel_multiplier; n += vl)
+      {
+        const unsigned int out_c = in_c * this->m_args.channel_multiplier + n;
+        const unsigned int todo = std::min(vl, this->m_args.channel_multiplier - n);
+
+        // Copy across the correct amount of bias (or 0)
+        for (unsigned int i = 0; i < todo; i++)
+        {
+          buffer[i] = (biases == nullptr) ? 0 : biases[out_c + i];
+        }
+        buffer += vl;
+
+        // Copy each of the weights in turn
+        auto weights_row = weights + out_c;
+        for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
+        {
+          auto weights_col = weights_row;
+
+          for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
+          {
+            for (unsigned int m = 0; m < todo; m++)
+            {
+              buffer[m] = weights_col[m];
+            }
+            buffer += vl;
+
+            weights_col += ld_weight_col;
+          }
+
+          weights_row += ld_weight_row;
+        }
+      }
+    }
+  }
+
+  size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+  {
+    const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
+    return n_threads * sizeof_output_buffer(n_output_channels);
+  }
+
+  using DepthwiseCommon<typename strategy::input_type, typename strategy::weight_type, typename strategy::return_type>::execute;
+  void execute(
+    const unsigned int batches,
+    const unsigned int input_height,
+    const unsigned int input_width,
+    const unsigned int input_channels,
+    const PaddingValues &padding,
+    const void *const _input,
+    const size_t ld_input_col,
+    const size_t ld_input_row,
+    const size_t ld_input_batch,
+    const void *const parameters,
+    const unsigned int output_height,
+    const unsigned int output_width,
+    void *const _output,
+    const size_t ld_output_col,
+    const size_t ld_output_row,
+    const size_t ld_output_batch,
+    void *const _working_space,
+    const unsigned int thread_id,
+    const unsigned int n_threads
+  ) const override
+  {
+    strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+    arm_gemm::profiler prof;
+#endif
+
+    // Compute activation values
+    TAccum activation_min = std::numeric_limits<TAccum>::has_infinity ? -std::numeric_limits<TAccum>::infinity() : std::numeric_limits<TAccum>::min();
+    TAccum activation_max = std::numeric_limits<TAccum>::has_infinity ? std::numeric_limits<TAccum>::infinity() : std::numeric_limits<TAccum>::max();
+
+    switch (this->m_args.activation.type)
+    {
+      case arm_gemm::Activation::Type::BoundedReLU:
+        activation_max = static_cast<TAccum>(this->m_args.activation.param1);
+        // Fall through
+      case arm_gemm::Activation::Type::ReLU:
+        activation_min = static_cast<TAccum>(0);
+        break;
+      default:
+        break;
+    }
+
+    // Determine what portion of the work to do.
+    const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
+    const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
+    const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
+
+    // Need a stride over blocks of parameters
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
+    const unsigned int param_stride =
+      arm_gemm::roundup(this->m_args.channel_multiplier, vl) *
+      (sizeof(TAccum) + sizeof(TWeight) * strategy::kernel_rows * strategy::kernel_cols);
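+    // i.e. the distance between the packed parameters of consecutive input
+    // channels: one vector of biases plus one vector of weights per kernel
+    // point, for each group of vl output channels (see pack_parameters above).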
+
+    // Cast input and output pointers into the right types
+    const TInput *const inptr = static_cast<const TInput *>(_input);
+    TOutput *const outptr = static_cast<TOutput *>(_output);
+
+    // To simplify the kernel, we process padded or non-NCHW-ordered input into
+    // a form which can be consumed by the kernel. This data is stored here and
+    // passed into the kernel as an array of N pointers (one per row of the
+    // input).
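+    // The factor of 4 below assumes a 32-bit TInput (four elements per
+    // 128-bit quad).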
+    TInput rearranged_input[strategy::input_rows][strategy::input_col_quads * 4];
+    const TInput *inptrs[strategy::input_rows];
+
+    // Create an array for the output pointers
+    TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
+    TOutput **const outptr_array = _outptr_array;
+
+    // Allocate portions of the working space
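+    // (Passing thread_id in place of n_threads yields this thread's byte
+    // offset into the shared working space.)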
+    uint8_t *const working_space = static_cast<uint8_t *>(_working_space) + get_working_size(thread_id, input_channels);
+    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
+
+    // For each output tile, construct the requisite set of pointers and call
+    // into the kernel.
+    for (unsigned int batch = 0; batch < batches; batch++)
+    {
+      // Get batch pointers
+      const auto inptr_batch = inptr + batch * ld_input_batch;
+      const auto outptr_batch = outptr + batch * ld_output_batch;
+
+      for (int start_out_i = start_out_height;
+           start_out_i < end_out_height;
+           start_out_i += static_cast<int>(strategy::output_rows))
+      {
+        const int end_out_i = start_out_i + strategy::output_rows;
+        const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
+        const int end_in_i = start_in_i + strategy::input_rows;
+
+        // Compute top/bottom padding
+        const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
+        const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
+        const unsigned int valid_output_rows = std::min(
+          end_out_i - start_out_i,
+          static_cast<int>(output_height) - start_out_i
+        );
+
+        for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
+        {
+          const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left;
+          const int pad_left = -std::min(0, start_in_j);
+
+          const int end_out_j = start_out_j + strategy::output_cols;
+          const int end_in_j = start_in_j + strategy::input_cols;
+
+          const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
+          const unsigned int valid_output_cols = std::min(
+            end_out_j - start_out_j,
+            static_cast<int>(output_width) - start_out_j
+          );
+
+          // Construct the output pointer array.
+          TOutput **outptr_pos = outptr_array;
+          for (auto i = 0u; i < valid_output_rows; i++)
+          {
+            unsigned int j = 0u;
+            TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
+            for (; j < valid_output_cols; j++)
+            {
+              *(outptr_pos++) = colptr;
+              colptr += ld_output_col;
+            }
+            for (; j < strategy::output_cols; j++)
+            {
+              *(outptr_pos++) = output_buffer;
+            }
+          }
+          for (auto i = valid_output_rows; i < strategy::output_rows; i++)
+          {
+            for (auto j = 0u; j < strategy::output_cols; j++)
+            {
+              *(outptr_pos++) = output_buffer;
+            }
+          }
+
+          start_out_j += strategy::output_cols;
+
+          const uint8_t *params = static_cast<const uint8_t *>(parameters);
+
+          // Loop over the input channels
+          for (unsigned int in_c = 0; in_c < input_channels; in_c++)
+          {
+            // Construct the input array: first fill it with padding values and
+            // then fill in the correct values.
+            for (unsigned int i = 0; i < strategy::input_rows; i++)
+            {
+              for (unsigned int j = 0; j < 4 * strategy::input_col_quads; j++)
+              {
+                rearranged_input[i][j] = static_cast<TInput>(0);
+              }
+              inptrs[i] = rearranged_input[i];
+            }
+
+            auto inptr_row = inptr_batch + in_c +
+                             (start_in_i + pad_top) * ld_input_row +
+                             (start_in_j + pad_left) * ld_input_col;
+            if (ld_input_col == 1 && !pad_left &&
+                start_in_j + 4 * strategy::input_col_quads < input_width)
+            {
+              // The input tensor is already in NCHW format, and we're reading
+              // an unpadded section of it - allow the kernel to read it
+              // directly.
+              for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
+              {
+                inptrs[i] = inptr_row;
+                inptr_row += ld_input_row;
+              }
+            }
+            else
+            {
+              // Either the input tensor isn't in NCHW format, or we're reading
+              // a padded section. Copy the relevant portion of the input here
+              // and allow the kernel to read this.
+              for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
+              {
+                auto inptr_col = inptr_row;
+                for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
+                {
+                  rearranged_input[i][j] = *inptr_col;
+                  inptr_col += ld_input_col;
+                }
+                inptr_row += ld_input_row;
+              }
+            }
+
+            {
+#ifdef CYCLE_PROFILING
+              auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.channel_multiplier * strategy::kernel_rows * strategy::kernel_cols));
+#endif
+              strat.kernel(
+                inptrs, outptr_array, params,
+                this->m_args.channel_multiplier,
+                activation_min, activation_max
+              );
+            }
+
+            // Progress the output pointers
+            TOutput **outptr_pos = outptr_array;
+            for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
+            {
+              outptr_pos[i] += this->m_args.channel_multiplier;
+            }
+
+            // Progress the pointer into the parameters
+            params += param_stride;
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp
new file mode 100644
index 0000000..07ce0d3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise_depthfirst_multiplier.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class strategy>
+class DepthwiseDepthfirstWithMultiplierQuantized :
+  public DepthwiseCommon<typename strategy::input_type,
+                         typename strategy::weight_type,
+                         typename strategy::return_type>
+{
+  using Parent = DepthwiseCommon<typename strategy::input_type,
+                                 typename strategy::weight_type,
+                                 typename strategy::return_type>;
+  using TInput = typename strategy::input_type;
+  using TWeight = typename strategy::weight_type;
+  using TOutput = typename strategy::return_type;
+
+  const arm_gemm::Requantize32 m_qp;
+
+  size_t sizeof_output_buffer(unsigned int n_channels) const
+  {
+    const unsigned int vl = arm_gemm::utils::get_vector_length<typename strategy::return_type>(strategy::vl_type);
+    const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+    return sizeof(typename strategy::return_type) * rounded_channels;
+  }
+
+  public:
+  DepthwiseDepthfirstWithMultiplierQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp)
+    : Parent(args), m_qp(qp)
+  {
+  }
+
+  DepthwiseDepthfirstWithMultiplierQuantized(DepthwiseDepthfirstWithMultiplierQuantized &) = delete;
+  DepthwiseDepthfirstWithMultiplierQuantized &operator=(DepthwiseDepthfirstWithMultiplierQuantized &) = delete;
+
+  size_t get_storage_size(void) const override
+  {
+    // We produce VL<int32_t> channels at a time; for each of these blocks of
+    // channels we store a vector of biases, the weights (interleaved into a
+    // dot-product-friendly layout) and the requantisation parameters.
+    const unsigned int iter_length =
+      arm_gemm::utils::get_vector_length<int32_t>(strategy::vl_type);
+    const unsigned int n_iters =
+      this->m_args.input_channels * arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length);
+
+    // Compute the number of four-element dot-product blocks needed to store
+    // each kernel row.
+    const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u);
+
+    return n_iters * iter_length * (
+        sizeof(int32_t) +  // Bias
+        4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(TWeight) +  // Weights
+        2 * sizeof(int32_t)  // Requantisation parameters
+    );
+  }
+
+  // We'll want an optimised version of this, but for now a C++ implementation
+  // is probably sufficient.
+  void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
+  {
+    auto buffer = static_cast<uint8_t *>(_buffer);
+    auto biases = static_cast<const int32_t *>(_biases);
+    auto weights = static_cast<const TWeight *>(_weights);
+    auto requant_muls = m_qp.per_channel_muls;
+    auto requant_shifts = m_qp.per_channel_right_shifts;
+
+    const unsigned int iter_length =
+      arm_gemm::utils::get_vector_length<int32_t>(strategy::vl_type);
+    const unsigned int n_iters_per_input_channel =
+      arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length);
+
+    const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u);
+
+    const size_t iter_stride = iter_length * (
+        sizeof(int32_t) +  // Bias
+        4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(int8_t) +  // Weights
+        2 * sizeof(int32_t)  // Requantisation parameters
+    );
+
+    ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels * this->m_args.channel_multiplier : ld_weight_col;
+    ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+
+    for (unsigned int input_channel = 0; input_channel < this->m_args.input_channels; input_channel++)
+    {
+      auto buffer_input_channel = buffer + input_channel * n_iters_per_input_channel * iter_stride;
+      auto weights_input_channel = weights + input_channel * this->m_args.channel_multiplier;
+
+      for (unsigned int iter = 0; iter < n_iters_per_input_channel; iter++)
+      {
+        // Get a pointer to the start of this portion of the buffer, and from
+        // it derive pointers to the bias, weight and requantisation portions
+        // of this frame.
+        auto buffer_base = buffer_input_channel + iter_stride * iter;
+        auto buffer_biases = reinterpret_cast<int32_t *>(buffer_base);
+        auto buffer_weights = buffer_base + sizeof(int32_t) * iter_length;
+        auto buffer_requant_mul = reinterpret_cast<int32_t *>(
+          buffer_weights + strategy::kernel_rows * n_dots_per_kernel_row * 4 * iter_length);
+        auto buffer_requant_shift = buffer_requant_mul + iter_length;
+        auto weights_base = weights_input_channel + iter * iter_length;
+
+        // Hence work through the data for this iteration, on a
+        // channel-by-channel basis.
+        const auto this_iter_length = std::min<unsigned int>(
+          iter_length, this->m_args.channel_multiplier - iter * iter_length
+        );
+        for (unsigned int i = 0; i < this_iter_length; i++)
+        {
+          auto weights_channel = weights_base + i;
+
+          // Read the bias value; we modify it as we read the weights.
+          auto bias_value = biases == nullptr ? 0 : *(biases++);
+          int32_t elements_sum = 0;
+
+          // Read through the kernel; for each row, marshal together as many dot
+          // product terms as are required.
+          for (unsigned int ki = 0; ki < strategy::kernel_rows; ki++)
+          {
+            auto buffer_row = buffer_weights + i*4 + ki * 4 * n_dots_per_kernel_row * iter_length;
+            auto weights_row = weights_channel + ki * ld_weight_row;
+
+            unsigned int kj = 0;
+            for (; kj < strategy::kernel_cols; kj++)
+            {
+              // Determine which element we're writing to
+              const auto dot = kj / 4;
+              const auto elem = kj % 4;
+
+              // Copy the value; include in the sum
+              const auto val = weights_row[kj * ld_weight_col];
+              buffer_row[dot * 4 * iter_length + elem] = val;
+              elements_sum += val;
+            }
+            for (; kj < 4 * n_dots_per_kernel_row; kj++)
+            {
+              const auto dot = kj / 4;
+              const auto elem = kj % 4;
+              buffer_row[dot * 4 * iter_length + elem] = 0;
+            }
+
+            buffer_row += 4 * n_dots_per_kernel_row * iter_length;
+          }
+
+          // Write back the bias and offset values
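+          // Expanding sum((a - a_off) * (b - b_off)) gives correction terms
+          // -a_off * sum(b) and +K * a_off * b_off (K = kernel_rows *
+          // kernel_cols) which depend only on the weights, so they are folded
+          // into the bias here; the input-dependent -b_off * sum(a) term is
+          // presumably handled by the kernel itself.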
+          *(buffer_biases++) =
+            bias_value - m_qp.a_offset * elements_sum +
+            strategy::kernel_rows * strategy::kernel_cols * m_qp.a_offset * m_qp.b_offset;
+
+          // Write out the requantisation parameters
+          *(buffer_requant_mul++) = m_qp.per_channel_requant ? *(requant_muls++) : m_qp.per_layer_mul;
+          *(buffer_requant_shift++) = m_qp.per_channel_requant ? *(requant_shifts++) : m_qp.per_layer_right_shift;
+        }
+      }
+    }
+  }
+
+  size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+  {
+    const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
+    return n_threads * sizeof_output_buffer(n_output_channels);
+  }
+
+  using Parent::execute;
+  void execute(
+    const unsigned int batches,
+    const unsigned int input_height,
+    const unsigned int input_width,
+    const unsigned int input_channels,
+    const PaddingValues &padding,
+    const void *const _input,
+    const size_t ld_input_col,
+    const size_t ld_input_row,
+    const size_t ld_input_batch,
+    const void *const parameters,
+    const unsigned int output_height,
+    const unsigned int output_width,
+    void *const _output,
+    const size_t ld_output_col,
+    const size_t ld_output_row,
+    const size_t ld_output_batch,
+    void *const _working_space,
+    const unsigned int thread_id,
+    const unsigned int n_threads
+  ) const override
+  {
+    strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+    arm_gemm::profiler prof;
+#endif
+
+    auto executefn = [strat, this] (
+      const TInput *const *const inptrs,
+      TOutput *const *const outptr_array,
+      const void *const params
+    ) {
+      strat.kernel(inptrs, outptr_array, params, this->m_args.channel_multiplier, m_qp);
+    };
+
+    // Get working space for this thread
+    uint8_t *const working_space = static_cast<uint8_t *>(_working_space) + get_working_size(1, input_channels) * thread_id;
+
+    // Determine the stride across blocks of parameters
+    const unsigned int iter_length =
+      arm_gemm::utils::get_vector_length<int32_t>(strategy::vl_type);
+    const unsigned int n_iters_per_input_channel = arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length);
+    const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u);
+    const size_t param_stride = n_iters_per_input_channel * iter_length * (
+        sizeof(int32_t) +  // Bias
+        4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(int8_t) +  // Weights
+        2 * sizeof(int32_t)  // Requantisation parameters
+    );
+
+    common::depthwise_multiplier_execute<strategy>(
+      executefn, m_qp.a_offset, this->m_args,
+      batches, input_height, input_width, input_channels, padding,
+      _input, ld_input_col, ld_input_row, ld_input_batch,
+      parameters, param_stride,
+      output_height, output_width,
+      _output, ld_output_col, ld_output_row, ld_output_batch,
+      working_space, thread_id, n_threads
+    );
+  }
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp
new file mode 100644
index 0000000..f97569e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+
+// We have two sets of quantized kernels: those which use the dot-product
+// instructions and which require the biases and quantisation parameters to be
+// ravelled into the weights/parameter array, and those which use the MLAL
+// instructions and which consume separate bias and quantisation parameter
+// arrays. The following code adapts these two sets of kernels to use the same
+// API, allowing the same driver loop to call them both.
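+//
+// Kernel calls are normalised to the unravelled signature (the adapter simply
+// discards the separate bias and requantisation arguments, which ravelled
+// kernels read from the packed parameter array instead), while packing
+// functions are normalised to the ravelled signature.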
+
+template <typename TIn, typename TWeight, typename TOut>
+using UnravelledKernFn = std::function<void(unsigned int, const TIn *const *, const TWeight *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, TOut *const *)>;
+
+template <typename TIn, typename TOut>
+using RavelledKernFn = std::function<void(const TIn *const *, TOut *const *, const void *, uint64_t, const arm_gemm::Requantize32 &)>;
+
+template <typename TIn, typename TWeight, typename TOut>
+const UnravelledKernFn<TIn, TWeight, TOut> get_unified_kernel(const UnravelledKernFn<TIn, TWeight, TOut> &f) { return f; }
+
+template <typename TIn, typename TWeight, typename TOut>
+const UnravelledKernFn<TIn, TWeight, TOut> get_unified_kernel(const RavelledKernFn<TIn, TOut> &f)
+{
+  return [f] (const unsigned int n_channels,
+              const TIn *const *const inptrs,
+              const TWeight *const weights,
+              const int32_t *,  // Bias (ravelled)
+              const arm_gemm::Requantize32 &qp,
+              const int32_t *,  // Requantisation muls (ravelled)
+              const int32_t *,  // Requantisation shifts (ravelled)
+              TOut *const *const outptrs) {
+    return f(inptrs, outptrs, weights, n_channels, qp);
+  };
+}
+
+template <typename T>
+using UnravelledPackingFn = std::function<void(unsigned int, void *, const T *, size_t, size_t)>;
+
+template <typename T>
+using RavelledPackingFn = std::function<void(unsigned int, void *, const int32_t *, const T *, const arm_gemm::Requantize32 &, size_t, size_t)>;
+
+template <typename T>
+const RavelledPackingFn<T> get_unified_packer(const UnravelledPackingFn<T> &f)
+{
+  return [f] (const unsigned int n_channels,
+              void *buffer,
+              const int32_t *,  // Bias
+              const T *weights,
+              const arm_gemm::Requantize32 &,
+              size_t ld_weight_col,
+              size_t ld_weight_row)
+  {
+    return f(n_channels, buffer, weights, ld_weight_col, ld_weight_row);
+  };
+}
+
+template <typename T>
+const RavelledPackingFn<T> get_unified_packer(const RavelledPackingFn<T> &f) { return f; }
+
+template <typename T>
+constexpr bool requires_unravelled_bias_and_quant_params(const UnravelledPackingFn<T> &) { return true; }
+
+template <typename T>
+constexpr bool requires_unravelled_bias_and_quant_params(const RavelledPackingFn<T> &) { return false; }
+
+template <class strategy>
+constexpr bool strategy_requires_unravelled_bias_and_quant_params(void)
+{
+  return requires_unravelled_bias_and_quant_params<typename strategy::weight_type>(strategy::pack_parameters);
+}
+
+}  // namespace
+
+template <class strategy>
+class DepthwiseDepthfirstQuantized :
+  public DepthwiseCommon<typename strategy::input_type,
+                         typename strategy::weight_type,
+                         typename strategy::return_type>
+{
+  using TInput = typename strategy::input_type;
+  using TWeight = typename strategy::weight_type;
+  using TOutput = typename strategy::return_type;
+  using TAccum = typename strategy::bias_type;
+
+  arm_gemm::Requantize32 m_qp;
+
+  size_t sizeof_input_buffer(unsigned int n_channels) const
+  {
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
+    const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+    return sizeof(TInput) * rounded_channels;
+  }
+
+  size_t sizeof_output_buffer(unsigned int n_channels) const
+  {
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
+    const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+    return sizeof(TOutput) * rounded_channels;
+  }
+
+  size_t sizeof_bias_buffer(unsigned int n_channels) const
+  {
+    if (strategy_requires_unravelled_bias_and_quant_params<strategy>())
+    {
+      return (m_qp.bias == nullptr) ? sizeof(TAccum) * n_channels : 0;
+    }
+
+    return 0;
+  }
+
+  size_t sizeof_requant_mul_buffer(unsigned int n_channels) const
+  {
+    if (strategy_requires_unravelled_bias_and_quant_params<strategy>())
+    {
+      return m_qp.per_channel_requant ? 0 : sizeof(int32_t) * n_channels;
+    }
+
+    return 0;
+  }
+
+  size_t sizeof_requant_shift_buffer(unsigned int n_channels) const
+  {
+    if (strategy_requires_unravelled_bias_and_quant_params<strategy>())
+    {
+      return m_qp.per_channel_requant ? 0 : sizeof(int32_t) * n_channels;
+    }
+
+    return 0;
+  }
+
+  public:
+  DepthwiseDepthfirstQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp)
+    : DepthwiseCommon<TInput, TWeight, TOutput>(args), m_qp(qp)
+  {
+  }
+
+  DepthwiseDepthfirstQuantized(DepthwiseDepthfirstQuantized &) = delete;
+  DepthwiseDepthfirstQuantized &operator=(DepthwiseDepthfirstQuantized &) = delete;
+
+  size_t get_storage_size(void) const override
+  {
+    return strategy::get_packed_size(this->m_args);
+  }
+
+  void pack_parameters(void *buffer, const void *const bias, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+  {
+    if (strategy_requires_unravelled_bias_and_quant_params<strategy>())
+    {
+      m_qp.bias = static_cast<const int32_t *>(bias);
+    }
+
+    get_unified_packer<TWeight>(strategy::pack_parameters)(
+      this->m_args.input_channels,
+      buffer,
+      static_cast<const int32_t *>(bias),
+      reinterpret_cast<const TWeight *>(weights),
+      m_qp,
+      ld_weight_col,
+      ld_weight_row
+    );
+  }
+
+  size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+  {
+    const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
+    return n_threads * (
+      sizeof_output_buffer(n_output_channels) +
+      sizeof_input_buffer(n_channels) +
+      sizeof_bias_buffer(n_channels) +
+      sizeof_requant_mul_buffer(n_channels) +
+      sizeof_requant_shift_buffer(n_channels)
+    );
+  }
+
+  using DepthwiseCommon<typename strategy::input_type, typename strategy::weight_type, typename strategy::return_type>::execute;
+  void execute(
+    const unsigned int batches,
+    const unsigned int input_height,
+    const unsigned int input_width,
+    const unsigned int input_channels,
+    const PaddingValues &padding,
+    const void *const _input,
+    const size_t ld_input_col,
+    const size_t ld_input_row,
+    const size_t ld_input_batch,
+    const void *const parameters,
+    const unsigned int output_height,
+    const unsigned int output_width,
+    void *const _output,
+    const size_t ld_output_col,
+    const size_t ld_output_row,
+    const size_t ld_output_batch,
+    void *_working_space,
+    const unsigned int thread_id,
+    const unsigned int n_threads
+  ) const override
+  {
+    strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+    arm_gemm::profiler prof;
+#endif
+    // Get a unified API for the kernel function
+    auto kernel = get_unified_kernel<TInput, TWeight, TOutput>(strat.kernel);
+
+    // Determine what portion of the work to do.
+    const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
+    const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
+    const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
+
+    // Cast input and output pointers into the right types
+    const TInput *const inptr = static_cast<const TInput *>(_input);
+    TOutput *const outptr = static_cast<TOutput *>(_output);
+
+    // Create an array for the input pointers
+    const TInput * _inptr_array[strategy::input_rows * strategy::input_cols];
+    const TInput **const inptr_array = _inptr_array;
+
+    // Create an array for the output pointers
+    TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
+    TOutput **const outptr_array = _outptr_array;
+
+    // Allocate portions of the working space
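+    // (Passing thread_id in place of n_threads yields this thread's byte
+    // offset into the shared working space.)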
+    uint8_t *working_space = static_cast<uint8_t *>(_working_space) + get_working_size(thread_id, input_channels);
+
+    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
+    working_space += sizeof_output_buffer(input_channels * this->m_args.channel_multiplier);
+
+    TInput *const input_buffer = reinterpret_cast<TInput *>(working_space);
+    working_space += sizeof_input_buffer(input_channels);
+
+    const int32_t *const bias_ptr = (m_qp.bias == nullptr) ? reinterpret_cast<int32_t *>(working_space)
+                                                           : m_qp.bias;
+    working_space += sizeof_bias_buffer(input_channels * this->m_args.channel_multiplier);
+
+    const int32_t *const requant_mul_vec = !m_qp.per_channel_requant ? reinterpret_cast<int32_t *>(working_space)
+                                                                     : m_qp.per_channel_muls;
+    working_space += sizeof_requant_mul_buffer(input_channels * this->m_args.channel_multiplier);
+
+    const int32_t *const requant_shift_vec = !m_qp.per_channel_requant ? reinterpret_cast<int32_t *>(working_space)
+                                                                       : m_qp.per_channel_right_shifts;
+
+    if (strategy_requires_unravelled_bias_and_quant_params<strategy>())
+    {
+      // Initialise the bias buffer
+      if (m_qp.bias == nullptr)
+      {
+        for (unsigned int c = 0; c < input_channels * this->m_args.channel_multiplier; c++)
+        {
+          const_cast<int32_t *>(bias_ptr)[c] = 0;
+        }
+      }
+
+      // Initialise the requantisation parameters
+      if (!m_qp.per_channel_requant)
+      {
+        for (unsigned int c = 0; c < input_channels * this->m_args.channel_multiplier; c++)
+        {
+          const_cast<int32_t *>(requant_mul_vec)[c] = m_qp.per_layer_mul;
+          const_cast<int32_t *>(requant_shift_vec)[c] = m_qp.per_layer_right_shift;
+        }
+      }
+    }
+
+    // Initialise the input buffer
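+    // (Filled with the input zero-point so padded elements decode to zero.)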
+    for (unsigned int c = 0; c < input_channels; c++)
+    {
+      input_buffer[c] = static_cast<TInput>(m_qp.a_offset);
+    }
+
+    // For each output tile, construct the requisite set of pointers and call
+    // into the kernel.
+    for (unsigned int batch = 0; batch < batches; batch++)
+    {
+      // Get batch pointers
+      const auto inptr_batch = inptr + batch * ld_input_batch;
+      const auto outptr_batch = outptr + batch * ld_output_batch;
+
+      for (int start_out_i = start_out_height;
+           start_out_i < end_out_height;
+           start_out_i += static_cast<int>(strategy::output_rows))
+      {
+        const int end_out_i = start_out_i + strategy::output_rows;
+        const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
+        const int end_in_i = start_in_i + strategy::input_rows;
+
+        // Compute top/bottom padding
+        const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
+        const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
+        const unsigned int valid_output_rows = std::min(
+          end_out_i - start_out_i,
+          static_cast<int>(output_height) - start_out_i
+        );
+
+        // Fill the input pointer array with padding values
+        for (auto index = 0u; index < strategy::input_rows * strategy::input_cols; index++)
+        {
+          inptr_array[index] = input_buffer;
+        }
+
+        for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
+        {
+          const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left;
+          const int pad_left = -std::min(0, start_in_j);
+
+          const int end_out_j = start_out_j + strategy::output_cols;
+          const int end_in_j = start_in_j + strategy::input_cols;
+
+          const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
+          const unsigned int valid_output_cols = std::min(
+            end_out_j - start_out_j,
+            static_cast<int>(output_width) - start_out_j
+          );
+
+          // Construct the input pointer array: fill it with pointers to the
+          // input buffer and then fill in the required values.
+          for (auto i = pad_top; i < strategy::input_rows - pad_bottom; i++)
+          {
+            // We can skip the left-padding entries: they were already filled
+            // with the input-buffer pointer above, and the left padding never
+            // grows as we move right across the row.
+            unsigned int j = pad_left;
+            const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
+            const TInput **ptrs = inptr_array + i * strategy::input_cols + j;
+            for (; j < strategy::input_cols - pad_right; j++)
+            {
+              *(ptrs++) = colptr;
+              colptr += ld_input_col;
+            }
+            for (; j < strategy::input_cols; j++)
+            {
+              *(ptrs++) = input_buffer;
+            }
+          }
+
+          // Construct the output pointer array.
+          TOutput **outptr_pos = outptr_array;
+          for (auto i = 0u; i < valid_output_rows; i++)
+          {
+            unsigned int j = 0u;
+            TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
+            for (; j < valid_output_cols; j++)
+            {
+              *(outptr_pos++) = colptr;
+              colptr += ld_output_col;
+            }
+            for (; j < strategy::output_cols; j++)
+            {
+              *(outptr_pos++) = output_buffer;
+            }
+          }
+          for (auto i = valid_output_rows; i < strategy::output_rows; i++)
+          {
+            for (auto j = 0u; j < strategy::output_cols; j++)
+            {
+              *(outptr_pos++) = output_buffer;
+            }
+          }
+
+          start_out_j += strategy::output_cols;
+
+#ifdef CYCLE_PROFILING
+          // TODO Work number
+          auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.kernel_rows * this->m_args.kernel_cols));
+#endif
+          kernel(
+            this->m_args.input_channels,
+            inptr_array,
+            reinterpret_cast<const TWeight *>(parameters),
+            bias_ptr, m_qp, requant_mul_vec, requant_shift_vec,
+            outptr_array
+          );
+        }
+      }
+    }
+  }
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
new file mode 100644
index 0000000..fdb36fc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_depthfirst_generic_multiplier.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE)
+#include "kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif  // defined(__ARM_FEATURE_SVE)
+#include "kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif  // defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+  template <class Strategy>
+  unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+  {
+    // First-pass estimate: the number of output pixels (rounded up to whole
+    // tiles) multiplied by the number of vector-width channel passes required.
+    return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+           arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+           arm_gemm::iceildiv(
+            (long unsigned) args.input_channels * args.channel_multiplier,
+            arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+          );
+  }
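+  // As a rough illustration (values assumed, not taken from the kernel
+  // sources): with a strategy producing 4x4 output tiles and a 128-bit
+  // vector (8 fp16 lanes), a 56x56 output with 64 channels would score
+  // roundup(56, 4) * roundup(56, 4) * iceildiv(64, 8) = 56 * 56 * 8 = 25088.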
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
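+  // These are ranking sentinels rather than genuine cycle counts: the
+  // maximum value marks a kernel as a last resort, while a zero estimate
+  // causes find_implementation to select the kernel outright.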
+  unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
+  {
+    return std::numeric_limits<unsigned int>::max();
+  }
+
+  unsigned int not_preferred_if_no_multiplier(const DepthwiseArgs &args, const Nothing &)
+  {
+    return args.channel_multiplier > 1 ? 0 : std::numeric_limits<unsigned int>::max();
+  }
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+}  // namespace
+
+#if defined(__ARM_FP16_ARGS)
+
+static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = {
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE)
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+    constraint(is_supported<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+      return new DepthwiseDepthfirst<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+    constraint(is_supported<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+      return new DepthwiseDepthfirst<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+    constraint(is_supported<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+      return new DepthwiseDepthfirst<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+    constraint(is_supported<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+      return new DepthwiseDepthfirst<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+    constraint(is_supported<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+      return new DepthwiseDepthfirst<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>(args);
+    },
+  },
+#endif  // defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+    constraint(is_supported<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+      return new DepthwiseDepthfirst<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+    constraint(is_supported<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+      return new DepthwiseDepthfirst<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+    constraint(is_supported<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+      return new DepthwiseDepthfirst<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+    constraint(is_supported<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+      return new DepthwiseDepthfirst<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+    constraint(is_supported<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+      return new DepthwiseDepthfirst<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp16_nhwc_generic_output3x3_mla_depthfirst",
+    constraint(has_no_channel_multiplier),
+    not_preferred,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+      return new DepthwiseDepthfirstGeneric<a64_fp16_nhwc_generic_output9_mla_depthfirst, 3, 3>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+    nullptr,
+    not_preferred_if_no_multiplier,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+      return new DepthwiseDepthfirstGenericWithMultiplier<a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args);
+    },
+  },
+#endif  // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif  // defined(__aarch64__)
+  { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr },  // End of list
+};
+
+template <>
+const DepthwiseImplementation<__fp16> *depthwise_implementation_list()
+{
+  return depthwise_fp16_methods;
+}
+
+template UniqueDepthwiseCommon<__fp16> depthwise(const DepthwiseArgs &, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<__fp16>(const DepthwiseArgs &, const Nothing &);
+
+#endif  // defined(__ARM_FP16_ARGS)
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
new file mode 100644
index 0000000..aea750a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_depthfirst_generic_multiplier.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE)
+#include "kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif  // defined(__ARM_FEATURE_SVE)
+#include "kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif  // defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+  template <class Strategy>
+  unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+  {
+    // First-order cycle estimate: the number of output pixels (rounded up to
+    // whole output tiles) multiplied by the number of vector-length channel
+    // blocks to be computed.
+    return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+           arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+           arm_gemm::iceildiv(
+            static_cast<unsigned long>(args.input_channels) * args.channel_multiplier,
+            arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+          );
+  }
+
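+  // As in depthwise_fp16.cpp, these are ranking sentinels rather than
+  // genuine cycle counts.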
+  unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
+  {
+    return std::numeric_limits<unsigned int>::max();
+  }
+
+  unsigned int not_preferred_if_no_multiplier(const DepthwiseArgs &args, const Nothing &)
+  {
+    return args.channel_multiplier > 1 ? 0 : std::numeric_limits<unsigned int>::max();
+  }
+}  // namespace
+
+static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE)
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+    constraint(is_supported<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirst<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+    constraint(is_supported<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirst<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+    constraint(is_supported<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirst<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+    constraint(is_supported<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirst<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
+    constraint(is_supported<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirst<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp32_nhwc_generic_output3x3_mla_depthfirst",
+    constraint(has_no_channel_multiplier),
+    not_preferred,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirstGeneric<sve_fp32_nhwc_generic_output9_mla_depthfirst, 3, 3>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
+    constraint(is_supported<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>),
+    not_preferred_if_no_multiplier,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirstWithMultiplier<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
+    constraint(is_supported<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>),
+    not_preferred_if_no_multiplier,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirstWithMultiplier<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+    nullptr,
+    not_preferred_if_no_multiplier,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirstGenericWithMultiplier<sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args);
+    },
+  },
+#endif  // defined(__ARM_FEATURE_SVE)
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+    constraint(is_supported<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirst<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+    constraint(is_supported<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirst<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+    constraint(is_supported<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirst<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+    constraint(is_supported<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirst<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
+    constraint(is_supported<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirst<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp32_nhwc_generic_output3x3_mla_depthfirst",
+    constraint(has_no_channel_multiplier),
+    not_preferred,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirstGeneric<a64_fp32_nhwc_generic_output9_mla_depthfirst, 3, 3>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
+    constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>),
+    not_preferred_if_no_multiplier,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirstWithMultiplier<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
+    constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>),
+    not_preferred_if_no_multiplier,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirstWithMultiplier<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>(args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+    nullptr,
+    not_preferred_if_no_multiplier,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      return new DepthwiseDepthfirstGenericWithMultiplier<a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args);
+    },
+  },
+#endif  // defined(__aarch64__)
+  { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr },  // End of list
+};
+
+template <>
+const DepthwiseImplementation<float> *depthwise_implementation_list()
+{
+  return depthwise_fp32_methods;
+}
+
+template UniqueDepthwiseCommon<float> depthwise(const DepthwiseArgs &, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<float>(const DepthwiseArgs &, const Nothing &);
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
new file mode 100644
index 0000000..1d52b56
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <vector>
+
+using arm_gemm::Nothing;
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+struct DepthwiseImplementation
+{
+  const DepthwiseMethod method;
+  const char *name;
+  std::function<bool(const DepthwiseArgs &, const OutputStage &)> is_supported;
+  std::function<uint64_t(const DepthwiseArgs &, const OutputStage &)> cycle_estimate;
+  std::function<DepthwiseCommon<TInput, TWeight, TOutput> *(const DepthwiseArgs &, const OutputStage &)> initialise;
+
+  bool get_is_supported(const DepthwiseArgs &args, const OutputStage &os) const
+  {
+    return (is_supported == nullptr) ? true : is_supported(args, os);
+  }
+
+  uint64_t get_cycle_estimate(const DepthwiseArgs &args, const OutputStage &os) const
+  {
+    return (cycle_estimate == nullptr) ? 0 : cycle_estimate(args, os);
+  }
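+  // A missing estimator reports zero cost; find_implementation treats a
+  // zero estimate as "select this implementation immediately".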
+
+  DepthwiseCommon<TInput, TWeight, TOutput> *get_instance(const DepthwiseArgs &args, const OutputStage &os) const
+  {
+    return initialise(args, os);
+  }
+};
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *depthwise_implementation_list();
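+
+// Each supported type combination (fp32, fp16, s8q, u8q, u8s8u8q) supplies a
+// specialisation of the declaration above, returning its table of candidate
+// implementations terminated by a DepthwiseMethod::DEFAULT sentinel entry.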
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+bool find_implementation(
+  const DepthwiseArgs &args,
+  const OutputStage &os,
+  const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> * &selected
+)
+{
+  selected = nullptr;
+  uint64_t best_cycle_estimate = UINT64_MAX;
+
+  const auto *impl = depthwise_implementation_list<TInput, TWeight, TOutput, OutputStage>();
+  for (; impl->method != DepthwiseMethod::DEFAULT; impl++)
+  {
+    const bool has_cfg = (args.config != nullptr);
+    const auto &cfg = args.config;
+
+    if (
+      !impl->get_is_supported(args, os) ||  // Problem is unsupported
+      (has_cfg && cfg->method != DepthwiseMethod::DEFAULT && cfg->method != impl->method) ||
+      (has_cfg && cfg->filter != "" && !std::strstr(impl->name, cfg->filter.c_str()))
+    )
+    {
+      continue;
+    }
+
+    const auto cycle_estimate = impl->get_cycle_estimate(args, os);
+
+    if (cycle_estimate == 0)
+    {
+      selected = impl;
+      break;
+    }
+
+    if (selected == nullptr || cycle_estimate < best_cycle_estimate)
+    {
+      selected = impl;
+      best_cycle_estimate = cycle_estimate;
+    }
+  }
+
+  return (selected != nullptr);
+}
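+
+// A minimal usage sketch (hypothetical values, assuming a populated
+// DepthwiseArgs `args`):
+//
+//   const DepthwiseImplementation<float> *impl = nullptr;
+//   if (find_implementation<float>(args, Nothing(), impl))
+//   {
+//     auto *conv = impl->get_instance(args, Nothing());  // caller owns this
+//   }
+//
+// The depthwise() helper below performs this lookup and wraps the result in
+// a UniqueDepthwiseCommon.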
+
+template <typename TInput, typename TWeight, typename TOutput, class OutputStage>
+std::vector<KernelDescription> get_compatible_kernels(const DepthwiseArgs &args, const OutputStage &os)
+{
+  std::vector<KernelDescription> kerns;
+
+  // Find the default implementation so we can flag it accordingly
+  const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *default_impl;
+  find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, default_impl);
+
+  for (auto impl = depthwise_implementation_list<TInput, TWeight, TOutput, OutputStage>();
+       impl->method != DepthwiseMethod::DEFAULT; impl++)
+  {
+    if (!impl->get_is_supported(args, os))
+    {
+      continue;
+    }
+
+    kerns.emplace_back(
+      impl->method, impl->name, impl == default_impl,
+      impl->get_cycle_estimate(args, os)
+    );
+  }
+
+  return kerns;
+}
+
+template <typename TInput, typename TWeight, typename TOutput, class OutputStage>
+UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &args, const OutputStage &os)
+{
+  const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr;
+  const bool success = find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, impl);
+  return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr);
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
new file mode 100644
index 0000000..b4814be
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Utilities for constructing functions which constrain which kernels are
+ * selected for a given depthwise problem.
+ *
+ * It is expected that this will be included in the files which list the
+ * available kernels. To avoid multiple definitions, an anonymous namespace is
+ * used.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "depthwise.hpp"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+namespace
+{
+
+template <class OutputStage>
+using ConstraintFn = std::function<bool(const DepthwiseArgs &, const OutputStage &)>;
+
+using GenericConstraintFn = std::function<bool(const DepthwiseArgs &, const void *)>;
+
+GenericConstraintFn make_constraint(const GenericConstraintFn &f) __attribute__ ((unused));
+GenericConstraintFn make_constraint(const GenericConstraintFn &f)
+{
+  return f;
+}
+
+template <typename ... Fs>
+GenericConstraintFn make_constraint(const GenericConstraintFn &f, Fs ... fs)
+{
+  return [f, fs...] (const DepthwiseArgs &args, const void *os) -> bool {
+    return f(args, os) && make_constraint(fs...)(args, os);
+  };
+}
+
+template <typename OutputStage=Nothing, typename ... Fs>
+ConstraintFn<OutputStage> constraint(Fs ... fs)
+{
+  return [fs...] (const DepthwiseArgs &args, const OutputStage &os) -> bool {
+    return make_constraint(fs...)(args, &os);
+  };
+}
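+
+// For example, the kernel method tables combine predicates as
+//
+//   constraint(is_supported<SomeStrategy>, has_no_channel_multiplier)
+//
+// producing a single std::function that is true only when every listed
+// predicate accepts the problem; the OutputStage is forwarded to the generic
+// predicates as an opaque const void pointer.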
+
+// Some useful constraints
+template <class Strategy>
+bool is_supported(const DepthwiseArgs &args, const void *)
+{
+  return ((args.kernel_rows == Strategy::kernel_rows) &&
+          (args.kernel_cols == Strategy::kernel_cols) &&
+          (args.stride_rows == Strategy::stride_rows) &&
+          (args.stride_cols == Strategy::stride_cols));
+}
+
+bool cpu_has_dot_product(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_dot_product(const DepthwiseArgs &args, const void *)
+{
+  return args.cpu_info->has_dotprod();
+}
+
+bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *)
+{
+  return args.channel_multiplier == 1;
+}
+
+bool qp_has_no_left_shift(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+bool qp_has_no_left_shift(const DepthwiseArgs &, const void *_qp)
+{
+  const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+  return qp->per_channel_requant ?
+    (qp->per_channel_left_shifts == nullptr) :
+    (qp->per_layer_left_shift == 0);
+}
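+
+// (Presumably the kernels guarded by this predicate implement only the
+// rounding right-shift path of requantization, so problems that need a
+// per-layer or per-channel left shift must be rejected.)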
+
+}  // namespace
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
new file mode 100644
index 0000000..40370fe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst_quantized.hpp"
+#include "depthwise_depthfirst_generic_quantized.hpp"
+#include "depthwise_depthfirst_multiplier_quantized.hpp"
+#include "depthwise_depthfirst_generic_multiplier_quantized.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#include "kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#endif  // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#include "kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif  // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+
+bool qp_weights_are_symmetric(const DepthwiseArgs &, const void *_qp)
+{
+  const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+  return qp->b_offset == 0;
+}
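+
+// A weight (B-matrix) offset of zero indicates symmetrically quantized
+// weights, which the "s8qs" kernel variants in the method table below
+// require.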
+
+}  // namespace
+
+static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depthwise_s8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
+    constraint<Requantize32>(is_supported<sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift,
+                             qp_weights_are_symmetric),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstQuantized<sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+    constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+    constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstWithMultiplierQuantized<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+    constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstWithMultiplierQuantized<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+    },
+  },
+#endif  // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
+    constraint<Requantize32>(is_supported<a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_weights_are_symmetric,
+                             qp_has_no_left_shift,
+                             cpu_has_dot_product),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstQuantized<a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+    constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift,
+                             cpu_has_dot_product),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_s8q_nhwc_generic_output3x3_mla_depthfirst",
+    constraint<Requantize32>(has_no_channel_multiplier),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstGenericQuantized<a64_s8q_nhwc_generic_output9_mla_depthfirst, 3, 3>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+    constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+                             qp_has_no_left_shift,
+                             cpu_has_dot_product),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstWithMultiplierQuantized<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+    constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+                             qp_has_no_left_shift,
+                             cpu_has_dot_product),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstWithMultiplierQuantized<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+    nullptr,
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+      return new DepthwiseDepthfirstGenericWithMultiplierQuantized<a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args, qp);
+    },
+  },
+#endif  // defined(__aarch64__)
+  { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr },  // End of list
+};
+
+template <>
+const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> *depthwise_implementation_list()
+{
+  return depthwise_s8q_methods;
+}
+
+template UniqueDepthwiseCommon<int8_t, int8_t, int8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, int8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
new file mode 100644
index 0000000..3e190d2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst_quantized.hpp"
+#include "depthwise_depthfirst_generic_quantized.hpp"
+#include "depthwise_depthfirst_multiplier_quantized.hpp"
+#include "depthwise_depthfirst_generic_multiplier_quantized.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#endif  // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif  // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+    constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+    constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstWithMultiplierQuantized<sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+    constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstWithMultiplierQuantized<sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+    },
+  },
+#endif  // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+    constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+                             cpu_has_dot_product,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8q_nhwc_generic_output3x3_mla_depthfirst",
+    constraint<Requantize32>(has_no_channel_multiplier),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstGenericQuantized<a64_u8q_nhwc_generic_output9_mla_depthfirst, 3, 3>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+    constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+                             cpu_has_dot_product,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstWithMultiplierQuantized<a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+    constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+                             cpu_has_dot_product,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstWithMultiplierQuantized<a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+    nullptr,
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+      return new DepthwiseDepthfirstGenericWithMultiplierQuantized<a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args, qp);
+    },
+  },
+#endif  // defined(__aarch64__)
+  { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr },  // End of list
+};
+
+template <>
+const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> *depthwise_implementation_list()
+{
+  return depthwise_u8q_methods;
+}
+
+template UniqueDepthwiseCommon<uint8_t, uint8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
new file mode 100644
index 0000000..537a7c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst_quantized.hpp"
+#include "depthwise_depthfirst_generic_quantized.hpp"
+#include "depthwise_depthfirst_multiplier_quantized.hpp"
+#include "depthwise_depthfirst_generic_multiplier_quantized.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#include "kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif  // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#include "kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif  // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> depthwise_u8s8u8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+#endif  // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+    constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+                             has_no_channel_multiplier,
+                             qp_has_no_left_shift),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+      return new DepthwiseDepthfirstQuantized<a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8s8u8q_nhwc_generic_output3x3_mla_depthfirst",
+    constraint<Requantize32>(has_no_channel_multiplier),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+      return new DepthwiseDepthfirstGenericQuantized<a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst, 3, 3>(args, qp);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+    nullptr,
+    nullptr,
+    [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+      return new DepthwiseDepthfirstGenericWithMultiplierQuantized<a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args, qp);
+    },
+  },
+#endif  // defined(__aarch64__)
+  { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr },  // End of list
+};
+
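+// Note (editorial): the entries above are tried in order, so the specialised
+// tiled kernels take priority over the generic fallbacks nearer the end of
+// the list; the DEFAULT entry terminates the search.
+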
+template <>
+const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> *depthwise_implementation_list()
+{
+  return depthwise_u8s8u8q_methods;
+}
+
+template UniqueDepthwiseCommon<uint8_t, int8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, int8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp
new file mode 100644
index 0000000..6c5ef23
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include <cstdint>
+#include <cstring>
+
+using namespace arm_gemm;
+
+size_t generic_get_packed_size(
+  const VLType vec_type,
+  const unsigned int acc_depth,
+  const unsigned int kernel_rows,
+  const unsigned int kernel_cols,
+  const unsigned int n_input_channels
+)
+{
+  const auto per_iter = acc_depth * arm_gemm::utils::get_vector_length<int32_t>(vec_type);
+  return arm_gemm::roundup((long unsigned int) n_input_channels, per_iter) * kernel_rows * kernel_cols * sizeof(int8_t);
+}
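+
+// Worked example (illustrative): with Advanced SIMD (VLType::None) a vector
+// holds four int32_t values, so acc_depth = 2 gives per_iter = 8; packing a
+// 3x3 kernel over 20 input channels then reserves
+// roundup(20, 8) * 3 * 3 * sizeof(int8_t) = 216 bytes.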
+
+void generic_pack(
+  const VLType vec_type,
+  const unsigned int acc_depth,
+  const unsigned int kernel_rows,
+  const unsigned int kernel_cols,
+  const unsigned int n_channels,
+  void *_outptr,
+  const void *_weights,
+  size_t ld_weight_col,
+  size_t ld_weight_row
+)
+{
+  int8_t *outptr = reinterpret_cast<int8_t *>(_outptr);
+  const int8_t *weights = reinterpret_cast<const int8_t *>(_weights);
+
+  // Get the strides
+  ld_weight_col = (ld_weight_col == 0) ? n_channels * sizeof(int8_t) : ld_weight_col;
+  ld_weight_row = (ld_weight_row == 0) ? kernel_cols * ld_weight_col : ld_weight_row;
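+  // (A stride of zero selects the dense HWC layout: kernel columns lie
+  // n_channels bytes apart and kernel rows kernel_cols columns apart.)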
+
+  // Pack into per-iter chunks.
+  const auto per_iter = acc_depth * arm_gemm::utils::get_vector_length<int32_t>(vec_type);
+  for (unsigned int c = 0; c < n_channels; c += per_iter)
+  {
+    auto weight_row = weights + c;
+    const auto to_copy = std::min<unsigned int>(per_iter, n_channels - c);
+
+    for (unsigned int i = 0; i < kernel_rows; i++)
+    {
+      auto weight_col = weight_row;
+
+      for (unsigned int j = 0; j < kernel_cols; j++)
+      {
+        memcpy(outptr, weight_col, to_copy);
+        outptr += per_iter;
+        weight_col += ld_weight_col;
+      }
+
+      weight_row += ld_weight_row;
+    }
+  }
+}
+
+
+#define ADD_IMPLEMENTATION(ARCH, TYPENAME, TYPE, VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS) \
+struct interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla \
+{ \
+  static size_t get_packed_size(const DepthwiseArgs &args); \
+  static void pack_parameters( \
+    unsigned int n_channels, void *outptr, \
+    const TYPE *weights, size_t ld_weight_col, size_t ld_weight_row \
+  ); \
+}; \
+\
+size_t interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla::get_packed_size(const DepthwiseArgs &args) \
+{ \
+  return generic_get_packed_size(VLType::VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS, args.input_channels); \
+} \
+\
+void interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla::pack_parameters(unsigned int n_channels, void *outptr, \
+                            const TYPE *weights, size_t ld_weight_col, size_t ld_weight_row) \
+{ \
+  generic_pack(VLType::VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS, n_channels, outptr, weights, ld_weight_col, ld_weight_row); \
+}
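+
+// For example, ADD_IMPLEMENTATION(a64, s8q, int8_t, None, 2, 3, 3) defines
+// interleave_a64_s8q_3x3_mla, whose two methods forward to the generic
+// helpers above with VLType::None and an accumulation depth of two.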
+
+
+namespace arm_conv {
+namespace depthwise {
+
+#if defined(__ARM_FEATURE_SVE)
+
+ADD_IMPLEMENTATION(sve, s8q, int8_t, SVE, 2, 3, 3)
+ADD_IMPLEMENTATION(sve, s8q, int8_t, SVE, 2, 5, 5)
+ADD_IMPLEMENTATION(sve, u8q, uint8_t, SVE, 2, 3, 3)
+ADD_IMPLEMENTATION(sve, u8q, uint8_t, SVE, 2, 5, 5)
+
+#endif  // defined(__ARM_FEATURE_SVE)
+
+ADD_IMPLEMENTATION(a64, s8q, int8_t, None, 2, 3, 3)
+ADD_IMPLEMENTATION(a64, s8q, int8_t, None, 2, 5, 5)
+ADD_IMPLEMENTATION(a64, u8q, uint8_t, None, 2, 3, 3)
+ADD_IMPLEMENTATION(a64, u8q, uint8_t, None, 2, 5, 5)
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
new file mode 100644
index 0000000..3d3447b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_a64_s8q_3x3_dot
+{
+  static size_t get_packed_size(const DepthwiseArgs &);
+  static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_a64_s8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+  // We store seven vectors of packed data for every vector-of-int32s' worth
+  // of channels, rounding the number of such groups up to a multiple of four.
+  const unsigned int n = arm_gemm::roundup(
+    arm_gemm::iceildiv((long unsigned int) args.input_channels,
+                       get_vector_length<int32_t>(arm_gemm::VLType::None)), 4lu
+  );
+  return n * 7 * get_vector_length<int8_t>(arm_gemm::VLType::None);
+}
+
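+// Packs each group of four channels as: a vector of bias values corrected
+// for the input and weight offsets, three vectors of weights zipped ready
+// for the dot-product instruction (one per kernel row, padded with a zero
+// byte), and a vector each of requantisation multipliers and shifts.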
+void interleave_a64_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+  __asm__ __volatile__(
+    "movi v0.16b, #0x0\n"
+    "cmp %x[ld_weight_col], XZR\n"
+    "movi v31.16b, #0x1\n"
+    "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+    "movi v16.4s, #0x9\n"
+    "mov x19, #0x3\n"
+    "cmp %x[ld_weight_row], XZR\n"
+    "mul x19, %x[ld_weight_col], x19\n"
+    "csel %x[ld_weight_row], %x[ld_weight_row], x19, NE\n"
+    "add x24, %x[weights], %x[ld_weight_row]\n"
+    "add x23, x24, %x[ld_weight_row]\n"
+    "add x22, %x[ld_weight_col], %x[ld_weight_col]\n"
+    "lsr x20, %x[n_channels], #0x2\n"
+    "mov x21, #0x0\n"
+    "add x19, %x[qp], %[offsetof_input_offset]\n"
+    "ld1r { v30.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_weights_offset]\n"
+    "ld1r { v29.4s }, [x19]\n"
+    "mul v29.4s, v29.4s, v30.4s\n"
+    "add x19, %x[qp], %[offsetof_per_layer_mul]\n"
+    "ld1r { v28.4s }, [x19]\n"
+    "mul v29.4s, v29.4s, v16.4s\n"
+    "add x19, %x[qp], %[offsetof_per_layer_right_shift]\n"
+    "ld1r { v27.4s }, [x19]\n"
+    "cbz x20, 4f\n"
+    "1:"  // Loop
+    "movi v26.4s, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "ldr q26, [%x[bias], x21]\n"
+    "2:"  // Loop: Skip bias load
+    "movi v25.4s, #0x0\n"
+    "ldr s24, [%x[weights], #0x0]\n"
+    "ldr s23, [%x[weights], %x[ld_weight_col]]\n"
+    "zip1 v23.16b, v23.16b, v0.16b\n"
+    "ldr s21, [%x[weights], x22]\n"
+    "add %x[weights], %x[weights], #0x4\n"
+    "zip1 v21.16b, v24.16b, v21.16b\n"
+    "ldr s22, [x24, #0x0]\n"
+    "ldr s20, [x24, %x[ld_weight_col]]\n"
+    "zip1 v21.16b, v21.16b, v23.16b\n"
+    "ldr s18, [x24, x22]\n"
+    ".inst 0x4e9597f9  // sdot v25.4s, v31.16b, v21.16b\n"
+    "add x24, x24, #0x4\n"
+    "zip1 v20.16b, v20.16b, v0.16b\n"
+    "ldr s19, [x23, #0x0]\n"
+    "ldr s17, [x23, %x[ld_weight_col]]\n"
+    "zip1 v18.16b, v22.16b, v18.16b\n"
+    "ldr s16, [x23, x22]\n"
+    "zip1 v18.16b, v18.16b, v20.16b\n"
+    "add x23, x23, #0x4\n"
+    ".inst 0x4e9297f9  // sdot v25.4s, v31.16b, v18.16b\n"
+    "zip1 v17.16b, v17.16b, v0.16b\n"
+    "zip1 v16.16b, v19.16b, v16.16b\n"
+    "zip1 v16.16b, v16.16b, v17.16b\n"
+    ".inst 0x4e9097f9  // sdot v25.4s, v31.16b, v16.16b\n"
+    "mls v26.4s, v25.4s, v30.4s\n"
+    "add v26.4s, v26.4s, v29.4s\n"
+    "str q26, [%x[outptr], #0x0]\n"
+    "str q21, [%x[outptr], #0x10]\n"
+    "str q18, [%x[outptr], #0x20]\n"
+    "str q16, [%x[outptr], #0x30]\n"
+    "add %x[outptr], %x[outptr], #0x40\n"
+    "cbz %x[rq_mul_perchannel], 3f\n"
+    "ldr q28, [%x[rq_mul_perchannel], x21]\n"
+    "ldr q27, [%x[rq_shift_perchannel], x21]\n"
+    "3:"  // Loop: Quantisation parameters: Store
+    "str q28, [%x[outptr], #0x0]\n"
+    "add x21, x21, #0x10\n"
+    "str q27, [%x[outptr], #0x10]\n"
+    "subs x20, x20, #0x1\n"
+    "add %x[outptr], %x[outptr], #0x20\n"
+    "bgt 1b\n"
+    "tst %x[n_channels], #0x3\n"
+    "beq 13f\n"
+    "4:"  // Oddments
+    "movi v26.4s, #0x0\n"
+    "cbz %x[bias], 7f\n"
+    "add %x[bias], %x[bias], x21\n"
+    "tbz %x[n_channels], #1, 5f\n"
+    "ld1 { v26.d }[0], [%x[bias]], #0x8\n"
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v26.s }[2], [%x[bias]], #0x4\n"
+    "b 6f\n"
+    "5:"  // Oddments: Load bias: Bit 1: Unset
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v26.s }[0], [%x[bias]], #0x4\n"
+    "6:"  // Oddments: Load bias: Bit 1: End
+
+    "7:"  // Oddments: Skip bias load
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v24.h }[0], [%x[weights]]\n"
+    "ld1 { v22.h }[0], [x24]\n"
+    "add x20, %x[weights], %x[ld_weight_col]\n"
+    "ld1 { v19.h }[0], [x23]\n"
+    "add x19, %x[weights], x22\n"
+    "ld1 { v23.h }[0], [x20]\n"
+    "add %x[weights], %x[weights], #0x2\n"
+    "ld1 { v21.h }[0], [x19]\n"
+    "add x20, x24, %x[ld_weight_col]\n"
+    "add x19, x24, x22\n"
+    "ld1 { v20.h }[0], [x20]\n"
+    "ld1 { v18.h }[0], [x19]\n"
+    "add x24, x24, #0x2\n"
+    "add x19, x23, %x[ld_weight_col]\n"
+    "ld1 { v17.h }[0], [x19]\n"
+    "add x19, x23, x22\n"
+    "ld1 { v16.h }[0], [x19]\n"
+    "add x23, x23, #0x2\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v24.b }[2], [%x[weights]]\n"
+    "ld1 { v22.b }[2], [x24]\n"
+    "add x20, %x[weights], %x[ld_weight_col]\n"
+    "ld1 { v19.b }[2], [x23]\n"
+    "add x19, %x[weights], x22\n"
+    "ld1 { v23.b }[2], [x20]\n"
+    "add %x[weights], %x[weights], #0x1\n"
+    "ld1 { v21.b }[2], [x19]\n"
+    "add x20, x24, %x[ld_weight_col]\n"
+    "add x19, x24, x22\n"
+    "ld1 { v20.b }[2], [x20]\n"
+    "ld1 { v18.b }[2], [x19]\n"
+    "add x20, x23, %x[ld_weight_col]\n"
+    "add x19, x23, x22\n"
+    "ld1 { v17.b }[2], [x20]\n"
+    "ld1 { v16.b }[2], [x19]\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load weights: Bit 1: Unset
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v24.b }[0], [%x[weights]]\n"
+    "ld1 { v22.b }[0], [x24]\n"
+    "add x20, %x[weights], %x[ld_weight_col]\n"
+    "ld1 { v19.b }[0], [x23]\n"
+    "add x19, %x[weights], x22\n"
+    "ld1 { v23.b }[0], [x20]\n"
+    "add %x[weights], %x[weights], #0x1\n"
+    "ld1 { v21.b }[0], [x19]\n"
+    "add x20, x24, %x[ld_weight_col]\n"
+    "add x19, x24, x22\n"
+    "ld1 { v20.b }[0], [x20]\n"
+    "ld1 { v18.b }[0], [x19]\n"
+    "add x20, x23, %x[ld_weight_col]\n"
+    "add x19, x23, x22\n"
+    "ld1 { v17.b }[0], [x20]\n"
+    "ld1 { v16.b }[0], [x19]\n"
+    "9:"  // Oddments: Load weights: Bit 1: End
+    "zip1 v21.16b, v24.16b, v21.16b\n"
+    "zip1 v23.16b, v23.16b, v0.16b\n"
+    "zip1 v18.16b, v22.16b, v18.16b\n"
+    "zip1 v20.16b, v20.16b, v0.16b\n"
+    "zip1 v16.16b, v19.16b, v16.16b\n"
+    "zip1 v17.16b, v17.16b, v0.16b\n"
+    "zip1 v21.16b, v21.16b, v23.16b\n"
+    "zip1 v18.16b, v18.16b, v20.16b\n"
+    "zip1 v16.16b, v16.16b, v17.16b\n"
+    "movi v25.4s, #0x0\n"
+    ".inst 0x4e9597f9  // sdot v25.4s, v31.16b, v21.16b\n"
+    ".inst 0x4e9297f9  // sdot v25.4s, v31.16b, v18.16b\n"
+    ".inst 0x4e9097f9  // sdot v25.4s, v31.16b, v16.16b\n"
+    "mls v26.4s, v25.4s, v30.4s\n"
+    "add v26.4s, v26.4s, v29.4s\n"
+    "str q26, [%x[outptr], #0x0]\n"
+    "str q21, [%x[outptr], #0x10]\n"
+    "str q18, [%x[outptr], #0x20]\n"
+    "str q16, [%x[outptr], #0x30]\n"
+    "add %x[outptr], %x[outptr], #0x40\n"
+    "cbz %x[rq_mul_perchannel], 12f\n"
+    "add x20, %x[rq_mul_perchannel], x21\n"
+    "add x19, %x[rq_shift_perchannel], x21\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v28.d }[0], [x20], #0x8\n"
+    "ld1 { v27.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v28.s }[2], [x20], #0x4\n"
+    "ld1 { v27.s }[2], [x19], #0x4\n"
+    "b 11f\n"
+    "10:"  // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v28.s }[0], [x20], #0x4\n"
+    "ld1 { v27.s }[0], [x19], #0x4\n"
+    "11:"  // Oddments: Quantisation parameters: Load quant params: Bit 1: End
+
+    "12:"  // Oddments: Quantisation parameters: Store
+    "str q28, [%x[outptr], #0x0]\n"
+    "str q27, [%x[outptr], #0x10]\n"
+    "add %x[outptr], %x[outptr], #0x20\n"
+    "13:"  // End
+
+    : [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+    : [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+    : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
new file mode 100644
index 0000000..a725dca
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_a64_u8q_3x3_dot
+{
+  static size_t get_packed_size(const DepthwiseArgs &);
+  static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_a64_u8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+  // We store seven vectors of packed data for every vector-of-int32s' worth
+  // of channels, rounding the number of such groups up to a multiple of four.
+  const unsigned int n = arm_gemm::roundup(
+    arm_gemm::iceildiv((long unsigned int) args.input_channels,
+                       get_vector_length<int32_t>(arm_gemm::VLType::None)), 4lu
+  );
+  return n * 7 * get_vector_length<uint8_t>(arm_gemm::VLType::None);
+}
+
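+// This mirrors interleave_a64_s8q_3x3_dot, substituting udot for sdot to
+// accumulate unsigned weights; the packed layout is identical.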
+void interleave_a64_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+  __asm__ __volatile__(
+    "movi v0.16b, #0x0\n"
+    "cmp %x[ld_weight_col], XZR\n"
+    "movi v31.16b, #0x1\n"
+    "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+    "movi v16.4s, #0x9\n"
+    "mov x19, #0x3\n"
+    "cmp %x[ld_weight_row], XZR\n"
+    "mul x19, %x[ld_weight_col], x19\n"
+    "csel %x[ld_weight_row], %x[ld_weight_row], x19, NE\n"
+    "add x24, %x[weights], %x[ld_weight_row]\n"
+    "add x23, x24, %x[ld_weight_row]\n"
+    "add x22, %x[ld_weight_col], %x[ld_weight_col]\n"
+    "lsr x20, %x[n_channels], #0x2\n"
+    "mov x21, #0x0\n"
+    "add x19, %x[qp], %[offsetof_input_offset]\n"
+    "ld1r { v30.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_weights_offset]\n"
+    "ld1r { v29.4s }, [x19]\n"
+    "mul v29.4s, v29.4s, v30.4s\n"
+    "add x19, %x[qp], %[offsetof_per_layer_mul]\n"
+    "ld1r { v28.4s }, [x19]\n"
+    "mul v29.4s, v29.4s, v16.4s\n"
+    "add x19, %x[qp], %[offsetof_per_layer_right_shift]\n"
+    "ld1r { v27.4s }, [x19]\n"
+    "cbz x20, 4f\n"
+    "1:"  // Loop
+    "movi v26.4s, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "ldr q26, [%x[bias], x21]\n"
+    "2:"  // Loop: Skip bias load
+    "movi v25.4s, #0x0\n"
+    "ldr s24, [%x[weights], #0x0]\n"
+    "ldr s23, [%x[weights], %x[ld_weight_col]]\n"
+    "zip1 v23.16b, v23.16b, v0.16b\n"
+    "ldr s21, [%x[weights], x22]\n"
+    "add %x[weights], %x[weights], #0x4\n"
+    "zip1 v21.16b, v24.16b, v21.16b\n"
+    "ldr s22, [x24, #0x0]\n"
+    "ldr s20, [x24, %x[ld_weight_col]]\n"
+    "zip1 v21.16b, v21.16b, v23.16b\n"
+    "ldr s18, [x24, x22]\n"
+    ".inst 0x6e9597f9  // udot v25.4s, v31.16b, v21.16b\n"
+    "add x24, x24, #0x4\n"
+    "zip1 v20.16b, v20.16b, v0.16b\n"
+    "ldr s19, [x23, #0x0]\n"
+    "ldr s17, [x23, %x[ld_weight_col]]\n"
+    "zip1 v18.16b, v22.16b, v18.16b\n"
+    "ldr s16, [x23, x22]\n"
+    "zip1 v18.16b, v18.16b, v20.16b\n"
+    "add x23, x23, #0x4\n"
+    ".inst 0x6e9297f9  // udot v25.4s, v31.16b, v18.16b\n"
+    "zip1 v17.16b, v17.16b, v0.16b\n"
+    "zip1 v16.16b, v19.16b, v16.16b\n"
+    "zip1 v16.16b, v16.16b, v17.16b\n"
+    ".inst 0x6e9097f9  // udot v25.4s, v31.16b, v16.16b\n"
+    "mls v26.4s, v25.4s, v30.4s\n"
+    "add v26.4s, v26.4s, v29.4s\n"
+    "str q26, [%x[outptr], #0x0]\n"
+    "str q21, [%x[outptr], #0x10]\n"
+    "str q18, [%x[outptr], #0x20]\n"
+    "str q16, [%x[outptr], #0x30]\n"
+    "add %x[outptr], %x[outptr], #0x40\n"
+    "cbz %x[rq_mul_perchannel], 3f\n"
+    "ldr q28, [%x[rq_mul_perchannel], x21]\n"
+    "ldr q27, [%x[rq_shift_perchannel], x21]\n"
+    "3:"  // Loop: Quantisation parameters: Store
+    "str q28, [%x[outptr], #0x0]\n"
+    "add x21, x21, #0x10\n"
+    "str q27, [%x[outptr], #0x10]\n"
+    "subs x20, x20, #0x1\n"
+    "add %x[outptr], %x[outptr], #0x20\n"
+    "bgt 1b\n"
+    "tst %x[n_channels], #0x3\n"
+    "beq 13f\n"
+    "4:"  // Oddments
+    "movi v26.4s, #0x0\n"
+    "cbz %x[bias], 7f\n"
+    "add %x[bias], %x[bias], x21\n"
+    "tbz %x[n_channels], #1, 5f\n"
+    "ld1 { v26.d }[0], [%x[bias]], #0x8\n"
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v26.s }[2], [%x[bias]], #0x4\n"
+    "b 6f\n"
+    "5:"  // Oddments: Load bias: Bit 1: Unset
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v26.s }[0], [%x[bias]], #0x4\n"
+    "6:"  // Oddments: Load bias: Bit 1: End
+
+    "7:"  // Oddments: Skip bias load
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v24.h }[0], [%x[weights]]\n"
+    "ld1 { v22.h }[0], [x24]\n"
+    "add x20, %x[weights], %x[ld_weight_col]\n"
+    "ld1 { v19.h }[0], [x23]\n"
+    "add x19, %x[weights], x22\n"
+    "ld1 { v23.h }[0], [x20]\n"
+    "add %x[weights], %x[weights], #0x2\n"
+    "ld1 { v21.h }[0], [x19]\n"
+    "add x20, x24, %x[ld_weight_col]\n"
+    "add x19, x24, x22\n"
+    "ld1 { v20.h }[0], [x20]\n"
+    "ld1 { v18.h }[0], [x19]\n"
+    "add x24, x24, #0x2\n"
+    "add x19, x23, %x[ld_weight_col]\n"
+    "ld1 { v17.h }[0], [x19]\n"
+    "add x19, x23, x22\n"
+    "ld1 { v16.h }[0], [x19]\n"
+    "add x23, x23, #0x2\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v24.b }[2], [%x[weights]]\n"
+    "ld1 { v22.b }[2], [x24]\n"
+    "add x20, %x[weights], %x[ld_weight_col]\n"
+    "ld1 { v19.b }[2], [x23]\n"
+    "add x19, %x[weights], x22\n"
+    "ld1 { v23.b }[2], [x20]\n"
+    "add %x[weights], %x[weights], #0x1\n"
+    "ld1 { v21.b }[2], [x19]\n"
+    "add x20, x24, %x[ld_weight_col]\n"
+    "add x19, x24, x22\n"
+    "ld1 { v20.b }[2], [x20]\n"
+    "ld1 { v18.b }[2], [x19]\n"
+    "add x20, x23, %x[ld_weight_col]\n"
+    "add x19, x23, x22\n"
+    "ld1 { v17.b }[2], [x20]\n"
+    "ld1 { v16.b }[2], [x19]\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load weights: Bit 1: Unset
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v24.b }[0], [%x[weights]]\n"
+    "ld1 { v22.b }[0], [x24]\n"
+    "add x20, %x[weights], %x[ld_weight_col]\n"
+    "ld1 { v19.b }[0], [x23]\n"
+    "add x19, %x[weights], x22\n"
+    "ld1 { v23.b }[0], [x20]\n"
+    "add %x[weights], %x[weights], #0x1\n"
+    "ld1 { v21.b }[0], [x19]\n"
+    "add x20, x24, %x[ld_weight_col]\n"
+    "add x19, x24, x22\n"
+    "ld1 { v20.b }[0], [x20]\n"
+    "ld1 { v18.b }[0], [x19]\n"
+    "add x20, x23, %x[ld_weight_col]\n"
+    "add x19, x23, x22\n"
+    "ld1 { v17.b }[0], [x20]\n"
+    "ld1 { v16.b }[0], [x19]\n"
+    "9:"  // Oddments: Load weights: Bit 1: End
+    "zip1 v21.16b, v24.16b, v21.16b\n"
+    "zip1 v23.16b, v23.16b, v0.16b\n"
+    "zip1 v18.16b, v22.16b, v18.16b\n"
+    "zip1 v20.16b, v20.16b, v0.16b\n"
+    "zip1 v16.16b, v19.16b, v16.16b\n"
+    "zip1 v17.16b, v17.16b, v0.16b\n"
+    "zip1 v21.16b, v21.16b, v23.16b\n"
+    "zip1 v18.16b, v18.16b, v20.16b\n"
+    "zip1 v16.16b, v16.16b, v17.16b\n"
+    "movi v25.4s, #0x0\n"
+    ".inst 0x6e9597f9  // udot v25.4s, v31.16b, v21.16b\n"
+    ".inst 0x6e9297f9  // udot v25.4s, v31.16b, v18.16b\n"
+    ".inst 0x6e9097f9  // udot v25.4s, v31.16b, v16.16b\n"
+    "mls v26.4s, v25.4s, v30.4s\n"
+    "add v26.4s, v26.4s, v29.4s\n"
+    "str q26, [%x[outptr], #0x0]\n"
+    "str q21, [%x[outptr], #0x10]\n"
+    "str q18, [%x[outptr], #0x20]\n"
+    "str q16, [%x[outptr], #0x30]\n"
+    "add %x[outptr], %x[outptr], #0x40\n"
+    "cbz %x[rq_mul_perchannel], 12f\n"
+    "add x20, %x[rq_mul_perchannel], x21\n"
+    "add x19, %x[rq_shift_perchannel], x21\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v28.d }[0], [x20], #0x8\n"
+    "ld1 { v27.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v28.s }[2], [x20], #0x4\n"
+    "ld1 { v27.s }[2], [x19], #0x4\n"
+    "b 11f\n"
+    "10:"  // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v28.s }[0], [x20], #0x4\n"
+    "ld1 { v27.s }[0], [x19], #0x4\n"
+    "11:"  // Oddments: Quantisation parameters: Load quant params: Bit 1: End
+
+    "12:"  // Oddments: Quantisation parameters: Store
+    "str q28, [%x[outptr], #0x0]\n"
+    "str q27, [%x[outptr], #0x10]\n"
+    "add %x[outptr], %x[outptr], #0x20\n"
+    "13:"  // End
+
+    : [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+    : [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+    : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
new file mode 100644
index 0000000..41f0495
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
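+// Declarations of the parameter-packing helpers defined in the
+// per-architecture sources alongside this header.
+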
+#if defined(__ARM_FEATURE_SVE)
+
+class interleave_sve_u8q_3x3_dot
+{
+  public:
+    static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+    static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_sve_s8q_3x3_dot
+{
+  public:
+    static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+    static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_sve_u8q_3x3_mla
+{
+  public:
+    static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
+    static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_sve_s8q_3x3_mla
+{
+  public:
+    static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
+    static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_sve_u8q_5x5_mla
+{
+  public:
+    static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
+    static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_sve_s8q_5x5_mla
+{
+  public:
+    static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
+    static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+#endif  // defined(__ARM_FEATURE_SVE)
+
+class interleave_a64_u8q_3x3_dot
+{
+  public:
+    static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+    static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_a64_s8q_3x3_dot
+{
+  public:
+    static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+    static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_a64_u8q_3x3_mla
+{
+  public:
+    static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
+    static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_a64_s8q_3x3_mla
+{
+  public:
+    static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
+    static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_a64_u8q_5x5_mla
+{
+  public:
+    static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
+    static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_a64_s8q_5x5_mla
+{
+  public:
+    static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
+    static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
new file mode 100644
index 0000000..ea0c35b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_sve_s8q_3x3_dot
+{
+  static size_t get_packed_size(const DepthwiseArgs &);
+  static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_sve_s8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+  // We store seven vectors of packed data for every vector-of-int32s' worth
+  // of channels, rounding the number of such groups up to a multiple of four.
+  const unsigned int n = arm_gemm::roundup(
+    arm_gemm::iceildiv((long unsigned int) args.input_channels,
+                       get_vector_length<int32_t>(arm_gemm::VLType::SVE)), 4lu
+  );
+  return n * 7 * get_vector_length<int8_t>(arm_gemm::VLType::SVE);
+}
+
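+// Packs the same bias/weights/requantisation layout as the A64 routine, but
+// processes a whole SVE vector of channels per iteration under predicate
+// control, so no separate oddment tail is required.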
+void interleave_sve_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+  __asm__ __volatile__(
+    "mov z30.b, #0x0\n"
+    "ptrue p2.b\n"
+    "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
+    "mov z28.b, #0x1\n"
+    "cmp %x[ld_weight_col], XZR\n"
+    "mov z16.s, #0x9\n"
+    "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+    "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+    "mul z27.s, p2/M, z27.s, z29.s\n"
+    "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+    "mov x19, #0x3\n"
+    "mul z27.s, p2/M, z27.s, z16.s\n"
+    "ld1rw { z25.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+    "mul x19, %x[ld_weight_col], x19\n"
+    "cmp %x[ld_weight_row], XZR\n"
+    "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+    "csel %x[ld_weight_row], %x[ld_weight_row], x19, NE\n"
+    "add x22, %x[weights], %x[ld_weight_row]\n"
+    "add x21, x22, %x[ld_weight_row]\n"
+    "whilelt p1.s, XZR, %x[n_channels]\n"
+    "mov x20, #0x0\n"
+    "pfalse p8.b\n"
+    "cbz %x[bias], 1f\n"
+    "ptrue p8.s\n"
+    "1:"  // No bias
+
+    "2:"  // Loop
+    "mov z24.s, #0x0\n"
+    "cntp x19, p2, p1.s\n"
+    "and p0.b, p2/Z, p8.b, p1.b\n"
+    "ld1w { z23.s }, p0/Z, [%x[bias], x20, LSL #2]\n"
+    "whilelt p0.b, XZR, x19\n"
+    "ld1b { z17.b }, p0/Z, [%x[weights]]\n"
+    "ld1b { z16.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+    "zip1 z18.b, z16.b, z30.b\n"
+    "ld1b { z16.b }, p0/Z, [%x[weights], x23]\n"
+    "add %x[weights], %x[weights], x19\n"
+    "zip1 z16.b, z17.b, z16.b\n"
+    "ld1b { z22.b }, p0/Z, [x22]\n"
+    "ld1b { z17.b }, p0/Z, [x22, %x[ld_weight_col]]\n"
+    "zip1 z21.b, z16.b, z18.b\n"
+    "ld1b { z16.b }, p0/Z, [x22, x23]\n"
+    "sdot z24.s, z28.b, z21.b\n"
+    "add x22, x22, x19\n"
+    "zip1 z18.b, z17.b, z30.b\n"
+    "ld1b { z20.b }, p0/Z, [x21]\n"
+    "ld1b { z19.b }, p0/Z, [x21, %x[ld_weight_col]]\n"
+    "zip1 z17.b, z22.b, z16.b\n"
+    "ld1b { z16.b }, p0/Z, [x21, x23]\n"
+    "zip1 z18.b, z17.b, z18.b\n"
+    "add x21, x21, x19\n"
+    "zip1 z17.b, z19.b, z30.b\n"
+    "sdot z24.s, z28.b, z18.b\n"
+    "zip1 z16.b, z20.b, z16.b\n"
+    "zip1 z16.b, z16.b, z17.b\n"
+    "sdot z24.s, z28.b, z16.b\n"
+    "mls z23.s, p2/M, z24.s, z29.s\n"
+    "add z23.s, z23.s, z27.s\n"
+    "st1w { z23.s }, p2, [%x[outptr]]\n"
+    "st1b { z21.b }, p2, [%x[outptr], #1, MUL VL]\n"
+    "st1b { z18.b }, p2, [%x[outptr], #2, MUL VL]\n"
+    "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+    "addvl %x[outptr], %x[outptr], #4\n"
+    "cbz %x[rq_mul_perchannel], 3f\n"
+    "ld1w { z26.s }, p1/Z, [%x[rq_mul_perchannel], x20, LSL #2]\n"
+    "ld1w { z25.s }, p1/Z, [%x[rq_shift_perchannel], x20, LSL #2]\n"
+    "3:"  // Loop: Quantisation parameters: Store
+    "st1w { z26.s }, p2, [%x[outptr]]\n"
+    "incw x20\n"
+    "st1w { z25.s }, p2, [%x[outptr], #1, MUL VL]\n"
+    "whilelt p1.s, x20, %x[n_channels]\n"
+    "addvl %x[outptr], %x[outptr], #2\n"
+    "b.any 2b\n"
+    : [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+    : [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+    : "cc", "memory", "p0", "p1", "p2", "p8", "x19", "x20", "x21", "x22", "x23", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
new file mode 100644
index 0000000..edd32a4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_sve_u8q_3x3_dot
+{
+  static size_t get_packed_size(const DepthwiseArgs &);
+  static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_sve_u8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+  // We store seven vectors of packed data for every vector-of-int32s' worth
+  // of channels, rounding the number of such groups up to a multiple of four.
+  const unsigned int n = arm_gemm::roundup(
+    arm_gemm::iceildiv((long unsigned int) args.input_channels,
+                       get_vector_length<int32_t>(arm_gemm::VLType::SVE)), 4lu
+  );
+  return n * 7 * get_vector_length<uint8_t>(arm_gemm::VLType::SVE);
+}
+
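+// The unsigned twin of interleave_sve_s8q_3x3_dot above, using udot in
+// place of sdot.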
+void interleave_sve_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+  __asm__ __volatile__(
+    "mov z30.b, #0x0\n"
+    "ptrue p2.b\n"
+    "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
+    "mov z28.b, #0x1\n"
+    "cmp %x[ld_weight_col], XZR\n"
+    "mov z16.s, #0x9\n"
+    "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+    "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+    "mul z27.s, p2/M, z27.s, z29.s\n"
+    "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+    "mov x19, #0x3\n"
+    "mul z27.s, p2/M, z27.s, z16.s\n"
+    "ld1rw { z25.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+    "mul x19, %x[ld_weight_col], x19\n"
+    "cmp %x[ld_weight_row], XZR\n"
+    "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+    "csel %x[ld_weight_row], %x[ld_weight_row], x19, NE\n"
+    "add x22, %x[weights], %x[ld_weight_row]\n"
+    "add x21, x22, %x[ld_weight_row]\n"
+    "whilelt p1.s, XZR, %x[n_channels]\n"
+    "mov x20, #0x0\n"
+    "pfalse p8.b\n"
+    "cbz %x[bias], 1f\n"
+    "ptrue p8.s\n"
+    "1:"  // No bias
+
+    "2:"  // Loop
+    "mov z24.s, #0x0\n"
+    "cntp x19, p2, p1.s\n"
+    "and p0.b, p2/Z, p8.b, p1.b\n"
+    "ld1w { z23.s }, p0/Z, [%x[bias], x20, LSL #2]\n"
+    "whilelt p0.b, XZR, x19\n"
+    "ld1b { z17.b }, p0/Z, [%x[weights]]\n"
+    "ld1b { z16.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+    "zip1 z18.b, z16.b, z30.b\n"
+    "ld1b { z16.b }, p0/Z, [%x[weights], x23]\n"
+    "add %x[weights], %x[weights], x19\n"
+    "zip1 z16.b, z17.b, z16.b\n"
+    "ld1b { z22.b }, p0/Z, [x22]\n"
+    "ld1b { z17.b }, p0/Z, [x22, %x[ld_weight_col]]\n"
+    "zip1 z21.b, z16.b, z18.b\n"
+    "ld1b { z16.b }, p0/Z, [x22, x23]\n"
+    "udot z24.s, z28.b, z21.b\n"
+    "add x22, x22, x19\n"
+    "zip1 z18.b, z17.b, z30.b\n"
+    "ld1b { z20.b }, p0/Z, [x21]\n"
+    "ld1b { z19.b }, p0/Z, [x21, %x[ld_weight_col]]\n"
+    "zip1 z17.b, z22.b, z16.b\n"
+    "ld1b { z16.b }, p0/Z, [x21, x23]\n"
+    "zip1 z18.b, z17.b, z18.b\n"
+    "add x21, x21, x19\n"
+    "zip1 z17.b, z19.b, z30.b\n"
+    "udot z24.s, z28.b, z18.b\n"
+    "zip1 z16.b, z20.b, z16.b\n"
+    "zip1 z16.b, z16.b, z17.b\n"
+    "udot z24.s, z28.b, z16.b\n"
+    "mls z23.s, p2/M, z24.s, z29.s\n"
+    "add z23.s, z23.s, z27.s\n"
+    "st1w { z23.s }, p2, [%x[outptr]]\n"
+    "st1b { z21.b }, p2, [%x[outptr], #1, MUL VL]\n"
+    "st1b { z18.b }, p2, [%x[outptr], #2, MUL VL]\n"
+    "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+    "addvl %x[outptr], %x[outptr], #4\n"
+    "cbz %x[rq_mul_perchannel], 3f\n"
+    "ld1w { z26.s }, p1/Z, [%x[rq_mul_perchannel], x20, LSL #2]\n"
+    "ld1w { z25.s }, p1/Z, [%x[rq_shift_perchannel], x20, LSL #2]\n"
+    "3:"  // Loop: Quantisation parameters: Store
+    "st1w { z26.s }, p2, [%x[outptr]]\n"
+    "incw x20\n"
+    "st1w { z25.s }, p2, [%x[outptr], #1, MUL VL]\n"
+    "whilelt p1.s, x20, %x[n_channels]\n"
+    "addvl %x[outptr], %x[outptr], #2\n"
+    "b.any 2b\n"
+    : [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+    : [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+    : "cc", "memory", "p0", "p1", "p2", "p8", "x19", "x20", "x21", "x22", "x23", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..bb43d57
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
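+  // (For a 3x3 kernel at unit stride, a 2x2 output tile consumes a 4x4
+  // input patch: input extent = output extent + kernel extent - 1.)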
+
+  indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+  a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..99f4601
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,528 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
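+  // The assembly below iterates over the (tile_i, tile_j) grid of 2x2 output
+  // tiles, processing eight fp16 channels per pass of the channel loop with
+  // a separately handled tail.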
+  __asm__ __volatile__(
+    "mov x17, #0x0\n"
+    "mov x16, #0x0\n"
+    "1:"  // Tile loop
+    "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x25, #0x2\n"
+    "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x15, #0x2\n"
+    "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x24, %x[params_struct], %[offsetof_args_min]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "add x21, %x[params_struct], %[offsetof_args_max]\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mov x22, #0x0\n"
+    "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x17, x23\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x16, x13, x19\n" // offset += tile_j * ld_input_col
+    "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
+    "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x12, x12, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ld1r { v18.8h }, [x24]\n"
+    "add x9, x12, x23, LSL #1\n"
+    "ld1r { v17.8h }, [x21]\n"
+    "add x28, x9, x23, LSL #1\n"
+    "lsl x13, x13, #0x1\n"
+    "add x27, x28, x23, LSL #1\n"
+    "add x26, x13, x13\n"
+    "add x25, x26, x13\n"
+    "mul x19, x17, x20\n" // offset = tile_i * ld_output_row
+    "madd x19, x16, x11, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x15\n" // offset *= output_tile_size
+    "add x10, x10, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "add x24, x10, x20, LSL #1\n"
+    "lsl x11, x11, #0x1\n"
+    "mov x21, #0x10\n" // cntb _, ALL, #1
+    "sub x20, XZR, x21\n"
+    "lsr x19, %x[n_channels], #0x3\n"
+    "cbz x19, 4f\n"
+    "ldr q16, [x14, #0x0]\n"
+    "ldr q0, [x14, #0x10]\n"
+    "cmp x21, x19, LSL #4\n"
+    "ldr q1, [x14, #0x20]\n"
+    "ldr q2, [x14, #0x30]\n"
+    "ldr q3, [x14, #0x40]\n"
+    "ldr q4, [x14, #0x50]\n"
+    "ldr q5, [x14, #0x60]\n"
+    "ldr q6, [x14, #0x70]\n"
+    "ldr q7, [x14, #0x80]\n"
+    "ldr q8, [x14, #0x90]\n"
+    "add x14, x14, #0xa0\n"
+    "ldr q9, [x9, x13]\n"
+    "ld1 { v10.8h }, [x12]\n"
+    "ldr q11, [x12, x25]\n"
+    "ldr q12, [x9, x26]\n"
+    "ldr q13, [x28, x13]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+    "add x20, x20, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+    "add x22, x22, #0x10\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+    "add x21, x21, #0x10\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+    "ld1 { v9.8h }, [x27]\n"
+    "cmp x21, x19, LSL #4\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x28, x26]\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x27, x25]\n"
+    "fmla v29.8h, v2.8h, v12.8h\n"
+    "ldr q16, [x14, #0x0]\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "fmla v31.8h, v5.8h, v12.8h\n"
+    "fmla v30.8h, v4.8h, v12.8h\n"
+    "ldr q12, [x12, x13]\n"
+    "fmla v29.8h, v6.8h, v9.8h\n"
+    "ldr q9, [x12, x26]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v28.8h, v3.8h, v13.8h\n"
+    "fmla v31.8h, v7.8h, v13.8h\n"
+    "fmla v30.8h, v6.8h, v13.8h\n"
+    "fmla v29.8h, v4.8h, v13.8h\n"
+    "fmla v28.8h, v8.8h, v11.8h\n"
+    "ld1 { v11.8h }, [x9]\n"
+    "fmla v31.8h, v1.8h, v12.8h\n"
+    "fmla v30.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x9, x25]\n"
+    "add x9, x9, #0x10\n"
+    "fmla v29.8h, v5.8h, v10.8h\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "ldr q4, [x14, #0x50]\n"
+    "fmla v31.8h, v2.8h, v9.8h\n"
+    "fmla v30.8h, v1.8h, v9.8h\n"
+    "ld1 { v9.8h }, [x28]\n"
+    "ldr q1, [x14, #0x20]\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "ldr q0, [x14, #0x10]\n"
+    "fmla v28.8h, v2.8h, v12.8h\n"
+    "ldr q2, [x14, #0x30]\n"
+    "fmla v31.8h, v8.8h, v10.8h\n"
+    "fmla v30.8h, v7.8h, v10.8h\n"
+    "ldr q10, [x28, x25]\n"
+    "add x28, x28, #0x10\n"
+    "fmla v29.8h, v3.8h, v9.8h\n"
+    "ldr q13, [x28, x13]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x27, x13]\n"
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "ldr q12, [x27, x26]\n"
+    "add x27, x27, #0x10\n"
+    "fmla v28.8h, v5.8h, v10.8h\n"
+    "ldr q3, [x14, #0x40]\n"
+    "ldr q5, [x14, #0x60]\n"
+    "fmla v31.8h, v6.8h, v9.8h\n"
+    "ldr q9, [x9, x13]\n"
+    "fmla v30.8h, v8.8h, v10.8h\n"
+    "ld1 { v10.8h }, [x12]\n"
+    "fmla v29.8h, v7.8h, v11.8h\n"
+    "fmla v28.8h, v6.8h, v11.8h\n"
+    "ldr q11, [x12, x25]\n"
+    "ldr q6, [x14, #0x70]\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmla v29.8h, v8.8h, v12.8h\n"
+    "ldr q8, [x14, #0x90]\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "ldr q12, [x9, x26]\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "ldr q7, [x14, #0x80]\n"
+    "add x14, x14, #0xa0\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "st1 { v31.8h }, [x10]\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "str q30, [x10, x11]\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "st1 { v29.8h }, [x24]\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "add x10, x10, #0x10\n"
+    "str q28, [x24, x11]\n"
+    "add x24, x24, #0x10\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+    "ld1 { v9.8h }, [x27]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x28, x26]\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x27, x25]\n"
+    "fmla v29.8h, v2.8h, v12.8h\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "fmla v31.8h, v5.8h, v12.8h\n"
+    "fmla v30.8h, v4.8h, v12.8h\n"
+    "ldr q12, [x12, x13]\n"
+    "fmla v29.8h, v6.8h, v9.8h\n"
+    "ldr q9, [x12, x26]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v28.8h, v3.8h, v13.8h\n"
+    "fmla v31.8h, v7.8h, v13.8h\n"
+    "fmla v30.8h, v6.8h, v13.8h\n"
+    "fmla v29.8h, v4.8h, v13.8h\n"
+    "fmla v28.8h, v8.8h, v11.8h\n"
+    "ld1 { v11.8h }, [x9]\n"
+    "fmla v31.8h, v1.8h, v12.8h\n"
+    "fmla v30.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x9, x25]\n"
+    "add x9, x9, #0x10\n"
+    "fmla v29.8h, v5.8h, v10.8h\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "fmla v31.8h, v2.8h, v9.8h\n"
+    "fmla v30.8h, v1.8h, v9.8h\n"
+    "ld1 { v9.8h }, [x28]\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "fmla v28.8h, v2.8h, v12.8h\n"
+    "fmla v31.8h, v8.8h, v10.8h\n"
+    "fmla v30.8h, v7.8h, v10.8h\n"
+    "ldr q10, [x28, x25]\n"
+    "add x28, x28, #0x10\n"
+    "fmla v29.8h, v3.8h, v9.8h\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x27, x13]\n"
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "ldr q12, [x27, x26]\n"
+    "add x27, x27, #0x10\n"
+    "fmla v28.8h, v5.8h, v10.8h\n"
+    "fmla v31.8h, v6.8h, v9.8h\n"
+    "fmla v30.8h, v8.8h, v10.8h\n"
+    "fmla v29.8h, v7.8h, v11.8h\n"
+    "fmla v28.8h, v6.8h, v11.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmla v29.8h, v8.8h, v12.8h\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "st1 { v31.8h }, [x10]\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "str q30, [x10, x11]\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "add x10, x10, #0x10\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "st1 { v29.8h }, [x24]\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "str q28, [x24, x11]\n"
+    "add x24, x24, #0x10\n"
+    "4:"  // Tile loop: Oddments
+    "tst %x[n_channels], #0x1\n"
+    "beq 31f\n"
+    "ldr q16, [x14, #0x0]\n"
+    "ldr q0, [x14, #0x10]\n"
+    "add x23, x9, x13\n"
+    "ldr q1, [x14, #0x20]\n"
+    "add x22, x12, XZR\n"
+    "ldr q2, [x14, #0x30]\n"
+    "add x21, x12, x25\n"
+    "ldr q3, [x14, #0x40]\n"
+    "add x20, x9, x26\n"
+    "ldr q4, [x14, #0x50]\n"
+    "add x19, x28, x13\n"
+    "ldr q5, [x14, #0x60]\n"
+    "ldr q6, [x14, #0x70]\n"
+    "ldr q7, [x14, #0x80]\n"
+    "ldr q8, [x14, #0x90]\n"
+    "tbz %x[n_channels], #1, 5f\n"
+    "ldr s9, [x23], #0x4\n"
+    "ldr s10, [x22], #0x4\n"
+    "ldr s11, [x21], #0x4\n"
+    "ldr s12, [x20], #0x4\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v9.h }[2], [x23]\n"
+    "ld1 { v10.h }[2], [x22]\n"
+    "ld1 { v11.h }[2], [x21]\n"
+    "ld1 { v12.h }[2], [x20]\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 6f\n"
+    "5:"  // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+    "ldr h9, [x23, #0x0]\n"
+    "ldr h10, [x22, #0x0]\n"
+    "ldr h11, [x21, #0x0]\n"
+    "ldr h12, [x20, #0x0]\n"
+    "ldr h13, [x19, #0x0]\n"
+    "6:"  // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+    "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+    "add x19, x27, XZR\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v12.8h\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "fmla v31.8h, v5.8h, v12.8h\n"
+    "fmla v30.8h, v4.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ldr s9, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v9.h }[2], [x19]\n"
+    "b 8f\n"
+    "7:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+    "ldr h9, [x19, #0x0]\n"
+    "8:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+    "fmla v29.8h, v6.8h, v9.8h\n"
+    "add x19, x27, x25\n"
+    "fmla v31.8h, v7.8h, v13.8h\n"
+    "fmla v30.8h, v6.8h, v13.8h\n"
+    "fmla v28.8h, v3.8h, v13.8h\n"
+    "fmla v29.8h, v4.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 10f\n"
+    "9:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "10:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+    "fmla v28.8h, v8.8h, v11.8h\n"
+    "add x19, x12, x13\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 12f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 12f\n"
+    "11:"  // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "12:"  // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+    "fmla v31.8h, v1.8h, v12.8h\n"
+    "add x19, x12, x26\n"
+    "fmla v30.8h, v0.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr s9, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v9.h }[2], [x19]\n"
+    "b 14f\n"
+    "13:"  // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+    "ldr h9, [x19, #0x0]\n"
+    "14:"  // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+    "fmla v31.8h, v2.8h, v9.8h\n"
+    "add x19, x28, x26\n"
+    "fmla v30.8h, v1.8h, v9.8h\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 16f\n"
+    "15:"  // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "16:"  // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+    "fmla v31.8h, v8.8h, v10.8h\n"
+    "add x19, x9, XZR\n"
+    "fmla v30.8h, v7.8h, v10.8h\n"
+    "fmla v29.8h, v5.8h, v10.8h\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 17f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 18f\n"
+    "17:"  // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "18:"  // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "add x19, x9, x25\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 19f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 20f\n"
+    "19:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "20:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "add x19, x28, XZR\n"
+    "fmla v28.8h, v2.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 21f\n"
+    "ldr s9, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 22f\n"
+    "ld1 { v9.h }[2], [x19]\n"
+    "b 22f\n"
+    "21:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+    "ldr h9, [x19, #0x0]\n"
+    "22:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+    "fmla v31.8h, v6.8h, v9.8h\n"
+    "add x19, x28, x25\n"
+    "fmla v29.8h, v3.8h, v9.8h\n"
+    "tbz %x[n_channels], #1, 23f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 24f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 24f\n"
+    "23:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "24:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+    "fmla v30.8h, v8.8h, v10.8h\n"
+    "add x19, x27, x13\n"
+    "fmla v28.8h, v5.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 25f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 26f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 26f\n"
+    "25:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "26:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+    "fmla v29.8h, v7.8h, v11.8h\n"
+    "add x19, x27, x26\n"
+    "fmla v28.8h, v6.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 27f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 28f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 28f\n"
+    "27:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "28:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+    "fmla v29.8h, v8.8h, v12.8h\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "tbz %x[n_channels], #1, 29f\n"
+    "mov x19, x10\n"
+    "st1 { v31.s }[0], [x19], x11\n"
+    "add x10, x10, #0x4\n"
+    "st1 { v30.s }[0], [x19]\n"
+    "mov x19, x24\n"
+    "st1 { v29.s }[0], [x19], x11\n"
+    "add x24, x24, #0x4\n"
+    "st1 { v28.s }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 30f\n"
+    "mov x20, x10\n"
+    "st1 { v31.h }[2], [x20], x11\n"
+    "mov x19, x24\n"
+    "st1 { v30.h }[2], [x20]\n"
+    "st1 { v29.h }[2], [x19], x11\n"
+    "st1 { v28.h }[2], [x19]\n"
+    "b 30f\n"
+    "29:"  // Tile loop: Oddments: Store: Bit 1: Unset
+    "mov x20, x10\n"
+    "st1 { v31.h }[0], [x20], x11\n"
+    "mov x19, x24\n"
+    "st1 { v30.h }[0], [x20]\n"
+    "st1 { v29.h }[0], [x19], x11\n"
+    "st1 { v28.h }[0], [x19]\n"
+    "30:"  // Tile loop: Oddments: Store: Bit 1: End
+
+    "31:"  // Tile loop: End
+    "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x17, #0x1\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x16, x16, #0x1\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x16, x19\n"
+    "csel x16, x16, XZR, LT\n"
+    "csel x17, x17, x21, LT\n"
+    "cmp x17, x20\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
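
Note on the kernel above: the tile loop walks an n_tile_rows x n_tile_cols grid, the channel loop retires eight fp16 lanes per iteration (`lsr x19, %x[n_channels], #0x3`), and the oddment path after label 4 handles the tail channels. As a reading aid, a minimal scalar sketch of the same arithmetic follows; it assumes hypothetical unpacked `bias`/`weights` arrays in place of the interleaved `params` blob and ignores vectorisation and oddment handling, so it illustrates the computation rather than reproducing the shipped kernel.

    #include <cstdint>

    // Scalar reference for a 3x3, stride-1, fp16 depthwise pass over a grid of
    // 2x2 output tiles (NHWC, strides in elements). The bias/weights layout is
    // an assumption for illustration, not the real packed parameter format.
    static void depthwise_3x3_s1_2x2_ref(
      unsigned int n_tile_rows, unsigned int n_tile_cols,
      const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col,
      __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col,
      const __fp16 *bias, const __fp16 *weights,  // weights[(ki*3 + kj)*n_channels + c]
      unsigned int n_channels, __fp16 act_min, __fp16 act_max)
    {
      for (unsigned int ti = 0; ti < n_tile_rows; ti++)
      for (unsigned int tj = 0; tj < n_tile_cols; tj++)
      for (unsigned int oi = 0; oi < 2; oi++)      // 2x2 output tile
      for (unsigned int oj = 0; oj < 2; oj++)
      for (unsigned int c = 0; c < n_channels; c++)
      {
        __fp16 acc = bias[c];
        for (unsigned int ki = 0; ki < 3; ki++)    // 3x3 filter window
        for (unsigned int kj = 0; kj < 3; kj++)
          acc += inptr[(2*ti + oi + ki)*ld_input_row + (2*tj + oj + kj)*ld_input_col + c]
               * weights[(ki*3 + kj)*n_channels + c];
        acc = acc < act_min ? act_min : acc;       // the fmax/fmin activation clamp
        acc = acc > act_max ? act_max : acc;
        outptr[(2*ti + oi)*ld_output_row + (2*tj + oj)*ld_output_col + c] = acc;
      }
    }

Each assembly iteration performs the same 36 fused multiply-adds (9 taps for each of the 4 outputs, accumulated in v28-v31), but across eight channels at once in the .8h vector lanes.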
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..af83238
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,515 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[16];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[5];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[3];
+      inptrs[3] = input_ptrs[6];
+      inptrs[4] = input_ptrs[9];
+      inptrs[5] = input_ptrs[12];
+      inptrs[6] = input_ptrs[15];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[2];
+      inptrs[9] = input_ptrs[10];
+      inptrs[10] = input_ptrs[4];
+      inptrs[11] = input_ptrs[7];
+      inptrs[12] = input_ptrs[8];
+      inptrs[13] = input_ptrs[11];
+      inptrs[14] = input_ptrs[13];
+      inptrs[15] = input_ptrs[14];
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "add x19, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v18.8h }, [x20]\n"
+    "ld1r { v17.8h }, [x19]\n"
+    "mov x14, #0x0\n"
+    "ldp x13, x12, [x21, #0x0]\n"
+    "mov x11, #0x10\n" // cntb _, ALL, #1
+    "ldp x10, x9, [x21, #0x10]\n"
+    "sub x28, XZR, x11\n"
+    "lsr x27, %x[n_channels], #0x3\n"
+    "cbz x27, 3f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "cmp x11, x27, LSL #4\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    "ldr x22, [x16, #0x20]\n"
+    "ldr q9, [x26, x14]\n"
+    "ldr q10, [x25, x14]\n"
+    "ldr q11, [x24, x14]\n"
+    "ldr q12, [x23, x14]\n"
+    "ldr q13, [x22, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+    "ldr x21, [x16, #0x28]\n"
+    "add x28, x28, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+    "ldr x20, [x16, #0x30]\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+    "ldr x19, [x16, #0x38]\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+    "ldr q9, [x21, x14]\n"
+    "ldr x26, [x16, #0x40]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr x25, [x16, #0x48]\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v12.8h\n"
+    "ldr q11, [x20, x14]\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "ldr q10, [x25, x14]\n"
+    "ldr x24, [x16, #0x50]\n"
+    "fmla v31.8h, v5.8h, v12.8h\n"
+    "ldr x23, [x16, #0x58]\n"
+    "fmla v30.8h, v4.8h, v12.8h\n"
+    "fmla v29.8h, v6.8h, v9.8h\n"
+    "ldr q12, [x19, x14]\n"
+    "fmla v28.8h, v3.8h, v13.8h\n"
+    "ldr q9, [x26, x14]\n"
+    "ldr x22, [x16, #0x60]\n"
+    "fmla v31.8h, v7.8h, v13.8h\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v30.8h, v6.8h, v13.8h\n"
+    "fmla v29.8h, v4.8h, v13.8h\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v28.8h, v8.8h, v11.8h\n"
+    "ldr q11, [x24, x14]\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v31.8h, v1.8h, v12.8h\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    "fmla v30.8h, v0.8h, v12.8h\n"
+    "fmla v29.8h, v5.8h, v10.8h\n"
+    "ldr q12, [x23, x14]\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    "ldr q16, [x15, #0x0]\n"
+    "fmla v31.8h, v2.8h, v9.8h\n"
+    "ldr q4, [x15, #0x50]\n"
+    "fmla v30.8h, v1.8h, v9.8h\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "ldr q9, [x22, x14]\n"
+    "fmla v28.8h, v2.8h, v12.8h\n"
+    "ldr x22, [x16, #0x20]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "fmla v31.8h, v8.8h, v10.8h\n"
+    "ldr q1, [x15, #0x20]\n"
+    "fmla v30.8h, v7.8h, v10.8h\n"
+    "ldr q10, [x21, x14]\n"
+    "fmla v29.8h, v3.8h, v9.8h\n"
+    "ldr q13, [x22, x11]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x20, x14]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "fmla v28.8h, v5.8h, v10.8h\n"
+    "ldr q12, [x19, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.8h, v6.8h, v9.8h\n"
+    "ldr q9, [x26, x11]\n"
+    "fmla v29.8h, v7.8h, v11.8h\n"
+    "ldr q3, [x15, #0x40]\n"
+    "fmla v30.8h, v8.8h, v10.8h\n"
+    "ldr q10, [x25, x11]\n"
+    "fmla v28.8h, v6.8h, v11.8h\n"
+    "ldr q11, [x24, x11]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "fmla v29.8h, v8.8h, v12.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "ldr q6, [x15, #0x70]\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "ldr q8, [x15, #0x90]\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "ldr q12, [x23, x11]\n"
+    "add x11, x11, #0x10\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "ldr q7, [x15, #0x80]\n"
+    "cmp x11, x27, LSL #4\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "str q31, [x13, x28]\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "add x15, x15, #0xa0\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "str q30, [x12, x28]\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "str q29, [x10, x28]\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "str q28, [x9, x28]\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+    "ldr x21, [x16, #0x28]\n"
+    "add x28, x28, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+    "ldr x20, [x16, #0x30]\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+    "ldr x19, [x16, #0x38]\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+    "ldr q9, [x21, x14]\n"
+    "ldr x26, [x16, #0x40]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr x25, [x16, #0x48]\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v12.8h\n"
+    "ldr q11, [x20, x14]\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "ldr q10, [x25, x14]\n"
+    "ldr x24, [x16, #0x50]\n"
+    "fmla v31.8h, v5.8h, v12.8h\n"
+    "ldr x23, [x16, #0x58]\n"
+    "fmla v30.8h, v4.8h, v12.8h\n"
+    "fmla v29.8h, v6.8h, v9.8h\n"
+    "ldr q12, [x19, x14]\n"
+    "fmla v28.8h, v3.8h, v13.8h\n"
+    "ldr q9, [x26, x14]\n"
+    "ldr x22, [x16, #0x60]\n"
+    "fmla v31.8h, v7.8h, v13.8h\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v30.8h, v6.8h, v13.8h\n"
+    "fmla v29.8h, v4.8h, v13.8h\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v28.8h, v8.8h, v11.8h\n"
+    "ldr q11, [x24, x14]\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v31.8h, v1.8h, v12.8h\n"
+    "fmla v30.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x23, x14]\n"
+    "fmla v29.8h, v5.8h, v10.8h\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "fmla v31.8h, v2.8h, v9.8h\n"
+    "fmla v30.8h, v1.8h, v9.8h\n"
+    "ldr q9, [x22, x14]\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "fmla v28.8h, v2.8h, v12.8h\n"
+    "fmla v31.8h, v8.8h, v10.8h\n"
+    "fmla v30.8h, v7.8h, v10.8h\n"
+    "ldr q10, [x21, x14]\n"
+    "fmla v29.8h, v3.8h, v9.8h\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x20, x14]\n"
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "ldr q12, [x19, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v28.8h, v5.8h, v10.8h\n"
+    "fmla v31.8h, v6.8h, v9.8h\n"
+    "fmla v29.8h, v7.8h, v11.8h\n"
+    "fmla v30.8h, v8.8h, v10.8h\n"
+    "fmla v28.8h, v6.8h, v11.8h\n"
+    "fmla v29.8h, v8.8h, v12.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "str q31, [x13, x28]\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "str q30, [x12, x28]\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "str q29, [x10, x28]\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "str q28, [x9, x28]\n"
+    "3:"  // Oddments
+    "tst %x[n_channels], #0x1\n"
+    "beq 30f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "mov x28, x14\n"
+    "ldr q1, [x15, #0x20]\n"
+    "add x13, x13, x28\n"
+    "ldr q2, [x15, #0x30]\n"
+    "add x12, x12, x28\n"
+    "ldr q3, [x15, #0x40]\n"
+    "add x10, x10, x28\n"
+    "ldr q4, [x15, #0x50]\n"
+    "add x9, x9, x28\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "ldr x26, [x16, #0x0]\n"
+    "ldr x25, [x16, #0x8]\n"
+    "add x26, x26, x14\n"
+    "ldr x24, [x16, #0x10]\n"
+    "ldr x23, [x16, #0x18]\n"
+    "add x25, x25, x14\n"
+    "ldr x22, [x16, #0x20]\n"
+    "add x24, x24, x14\n"
+    "add x23, x23, x14\n"
+    "add x22, x22, x14\n"
+    "tbz %x[n_channels], #1, 4f\n"
+    "ld1 { v9.s }[0], [x26], #0x4\n"
+    "ld1 { v10.s }[0], [x25], #0x4\n"
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "ld1 { v12.s }[0], [x23], #0x4\n"
+    "ld1 { v13.s }[0], [x22], #0x4\n"
+    "tbz %x[n_channels], #0, 5f\n"
+    "ld1 { v9.h }[2], [x26], #0x2\n"
+    "ld1 { v10.h }[2], [x25], #0x2\n"
+    "ld1 { v11.h }[2], [x24], #0x2\n"
+    "ld1 { v12.h }[2], [x23], #0x2\n"
+    "ld1 { v13.h }[2], [x22], #0x2\n"
+    "b 5f\n"
+    "4:"  // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+    "ld1 { v9.h }[0], [x26], #0x2\n"
+    "ld1 { v10.h }[0], [x25], #0x2\n"
+    "ld1 { v11.h }[0], [x24], #0x2\n"
+    "ld1 { v12.h }[0], [x23], #0x2\n"
+    "ld1 { v13.h }[0], [x22], #0x2\n"
+    "5:"  // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+    "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+    "ldr x21, [x16, #0x28]\n"
+    "add x21, x21, x14\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v12.8h\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "fmla v31.8h, v5.8h, v12.8h\n"
+    "fmla v30.8h, v4.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v9.s }[0], [x21], #0x4\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v9.h }[2], [x21], #0x2\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load input (3, 0): Bit 1: Unset
+    "ld1 { v9.h }[0], [x21], #0x2\n"
+    "7:"  // Oddments: Load input (3, 0): Bit 1: End
+    "fmla v29.8h, v6.8h, v9.8h\n"
+    "ldr x20, [x16, #0x30]\n"
+    "fmla v31.8h, v7.8h, v13.8h\n"
+    "add x20, x20, x14\n"
+    "fmla v30.8h, v6.8h, v13.8h\n"
+    "fmla v28.8h, v3.8h, v13.8h\n"
+    "fmla v29.8h, v4.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v11.h }[2], [x20], #0x2\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load input (3, 3): Bit 1: Unset
+    "ld1 { v11.h }[0], [x20], #0x2\n"
+    "9:"  // Oddments: Load input (3, 3): Bit 1: End
+    "fmla v28.8h, v8.8h, v11.8h\n"
+    "ldr x19, [x16, #0x38]\n"
+    "add x19, x19, x14\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v12.s }[0], [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v12.h }[2], [x19], #0x2\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load input (0, 1): Bit 1: Unset
+    "ld1 { v12.h }[0], [x19], #0x2\n"
+    "11:"  // Oddments: Load input (0, 1): Bit 1: End
+    "fmla v31.8h, v1.8h, v12.8h\n"
+    "ldr x26, [x16, #0x40]\n"
+    "fmla v30.8h, v0.8h, v12.8h\n"
+    "add x26, x26, x14\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ld1 { v9.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v9.h }[2], [x26], #0x2\n"
+    "b 13f\n"
+    "12:"  // Oddments: Load input (0, 2): Bit 1: Unset
+    "ld1 { v9.h }[0], [x26], #0x2\n"
+    "13:"  // Oddments: Load input (0, 2): Bit 1: End
+    "fmla v31.8h, v2.8h, v9.8h\n"
+    "ldr x25, [x16, #0x48]\n"
+    "fmla v30.8h, v1.8h, v9.8h\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 14f\n"
+    "ld1 { v10.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 15f\n"
+    "ld1 { v10.h }[2], [x25], #0x2\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load input (2, 2): Bit 1: Unset
+    "ld1 { v10.h }[0], [x25], #0x2\n"
+    "15:"  // Oddments: Load input (2, 2): Bit 1: End
+    "fmla v31.8h, v8.8h, v10.8h\n"
+    "ldr x24, [x16, #0x50]\n"
+    "fmla v30.8h, v7.8h, v10.8h\n"
+    "add x24, x24, x14\n"
+    "fmla v29.8h, v5.8h, v10.8h\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 17f\n"
+    "ld1 { v11.h }[2], [x24], #0x2\n"
+    "b 17f\n"
+    "16:"  // Oddments: Load input (1, 0): Bit 1: Unset
+    "ld1 { v11.h }[0], [x24], #0x2\n"
+    "17:"  // Oddments: Load input (1, 0): Bit 1: End
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr x23, [x16, #0x58]\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "add x23, x23, x14\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v12.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v12.h }[2], [x23], #0x2\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load input (1, 3): Bit 1: Unset
+    "ld1 { v12.h }[0], [x23], #0x2\n"
+    "19:"  // Oddments: Load input (1, 3): Bit 1: End
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "ldr x22, [x16, #0x60]\n"
+    "fmla v28.8h, v2.8h, v12.8h\n"
+    "add x22, x22, x14\n"
+    "tbz %x[n_channels], #1, 20f\n"
+    "ld1 { v9.s }[0], [x22], #0x4\n"
+    "tbz %x[n_channels], #0, 21f\n"
+    "ld1 { v9.h }[2], [x22], #0x2\n"
+    "b 21f\n"
+    "20:"  // Oddments: Load input (2, 0): Bit 1: Unset
+    "ld1 { v9.h }[0], [x22], #0x2\n"
+    "21:"  // Oddments: Load input (2, 0): Bit 1: End
+    "fmla v31.8h, v6.8h, v9.8h\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v29.8h, v3.8h, v9.8h\n"
+    "add x21, x21, x14\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "ld1 { v10.s }[0], [x21], #0x4\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "ld1 { v10.h }[2], [x21], #0x2\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load input (2, 3): Bit 1: Unset
+    "ld1 { v10.h }[0], [x21], #0x2\n"
+    "23:"  // Oddments: Load input (2, 3): Bit 1: End
+    "fmla v30.8h, v8.8h, v10.8h\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v28.8h, v5.8h, v10.8h\n"
+    "add x20, x20, x14\n"
+    "tbz %x[n_channels], #1, 24f\n"
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "tbz %x[n_channels], #0, 25f\n"
+    "ld1 { v11.h }[2], [x20], #0x2\n"
+    "b 25f\n"
+    "24:"  // Oddments: Load input (3, 1): Bit 1: Unset
+    "ld1 { v11.h }[0], [x20], #0x2\n"
+    "25:"  // Oddments: Load input (3, 1): Bit 1: End
+    "fmla v29.8h, v7.8h, v11.8h\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v28.8h, v6.8h, v11.8h\n"
+    "add x19, x19, x14\n"
+    "tbz %x[n_channels], #1, 26f\n"
+    "ld1 { v12.s }[0], [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 27f\n"
+    "ld1 { v12.h }[2], [x19], #0x2\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load input (3, 2): Bit 1: Unset
+    "ld1 { v12.h }[0], [x19], #0x2\n"
+    "27:"  // Oddments: Load input (3, 2): Bit 1: End
+    "fmla v29.8h, v8.8h, v12.8h\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "tbz %x[n_channels], #1, 28f\n"
+    "st1 { v31.s }[0], [x13], #0x4\n"
+    "st1 { v30.s }[0], [x12], #0x4\n"
+    "st1 { v29.s }[0], [x10], #0x4\n"
+    "st1 { v28.s }[0], [x9], #0x4\n"
+    "tbz %x[n_channels], #0, 29f\n"
+    "st1 { v31.h }[2], [x13], #0x2\n"
+    "st1 { v30.h }[2], [x12], #0x2\n"
+    "st1 { v29.h }[2], [x10], #0x2\n"
+    "st1 { v28.h }[2], [x9], #0x2\n"
+    "b 29f\n"
+    "28:"  // Oddments: Store: Bit 1: Unset
+    "st1 { v31.h }[0], [x13], #0x2\n"
+    "st1 { v30.h }[0], [x12], #0x2\n"
+    "st1 { v29.h }[0], [x10], #0x2\n"
+    "st1 { v28.h }[0], [x9], #0x2\n"
+    "29:"  // Oddments: Store: Bit 1: End
+
+    "30:"  // End
+
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
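
The indirect variant above swaps strided addressing for an explicit pointer table: the shuffled `inptrs` built in the Args constructor is the row-major 4x4 input patch (the receptive field of a 2x2 output tile under a 3x3, stride-1 filter) reordered into the sequence the assembly consumes, while `outptrs` names the four output points directly, so the caller can, for example, point padded positions at a zeroed buffer instead of real input rows. A scalar sketch of that contract, again with hypothetical unpacked `bias`/`weights` standing in for `params`:

    // Scalar reference for the indirect form: input_ptrs[i*4 + j] addresses
    // point (i, j) of the 4x4 patch, outptrs[oi*2 + oj] the 2x2 outputs.
    static void depthwise_3x3_s1_2x2_indirect_ref(
      const __fp16 *const *input_ptrs,  // 16 pointers, row-major 4x4 patch
      __fp16 *const *outptrs,           // 4 pointers, row-major 2x2 tile
      const __fp16 *bias, const __fp16 *weights,  // assumed unpacked, as before
      unsigned int n_channels, __fp16 act_min, __fp16 act_max)
    {
      for (unsigned int oi = 0; oi < 2; oi++)
      for (unsigned int oj = 0; oj < 2; oj++)
      for (unsigned int c = 0; c < n_channels; c++)
      {
        __fp16 acc = bias[c];
        for (unsigned int ki = 0; ki < 3; ki++)
        for (unsigned int kj = 0; kj < 3; kj++)
          acc += input_ptrs[(oi + ki)*4 + (oj + kj)][c]
               * weights[(ki*3 + kj)*n_channels + c];
        acc = acc < act_min ? act_min : acc;
        acc = acc > act_max ? act_max : acc;
        outptrs[oi*2 + oj][c] = acc;
      }
    }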
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000..90db870
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 3;
+  constexpr static unsigned int output_cols = 3;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
+  indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+  a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
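
The constants in this strategy struct are tied together by the usual convolution geometry: an output tile of output_rows rows under a kernel_rows-tap, stride_rows-stride filter needs stride_rows * (output_rows - 1) + kernel_rows input rows, hence the 5x5 input patch for this 3x3-output kernel. A compile-time sketch of that relation (assuming the struct is in scope, e.g. inside arm_conv::depthwise):

    // Receptive-field arithmetic for the strategy's tile geometry.
    using strat = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst;
    static_assert(strat::stride_rows * (strat::output_rows - 1) + strat::kernel_rows == strat::input_rows,
                  "input rows must cover the output tile's receptive field");
    static_assert(strat::stride_cols * (strat::output_cols - 1) + strat::kernel_cols == strat::input_cols,
                  "input cols must cover the output tile's receptive field");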
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..3bdd544
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,829 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
+    "mov x7, #0x0\n"
+    "mov x8, #0x0\n"
+    "1:"  // Tile loop
+    "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x26, #0x3\n"
+    "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x25, #0x3\n"
+    "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x24, %x[params_struct], %[offsetof_args_min]\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "add x21, %x[params_struct], %[offsetof_args_max]\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mov x23, #0x0\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x7, x22\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x8, x16, x19\n" // offset += tile_j * ld_input_col
+    "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+    "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x15, x15, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ld1r { v18.8h }, [x24]\n"
+    "add x12, x15, x22, LSL #1\n"
+    "ld1r { v17.8h }, [x21]\n"
+    "add x11, x12, x22, LSL #1\n"
+    "lsl x16, x16, #0x1\n"
+    "add x10, x11, x22, LSL #1\n"
+    "add x9, x10, x22, LSL #1\n"
+    "add x28, x16, x16\n"
+    "add x27, x28, x16\n"
+    "add x26, x27, x16\n"
+    "mul x19, x7, x20\n" // offset = tile_i * ld_output_row
+    "madd x19, x8, x14, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x25\n" // offset *= output_tile_size
+    "add x13, x13, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "add x25, x13, x20, LSL #1\n"
+    "add x24, x25, x20, LSL #1\n"
+    "lsl x14, x14, #0x1\n"
+    "add x22, x14, x14\n"
+    "mov x21, #0x10\n" // cntb _, ALL, #1
+    "sub x20, XZR, x21\n"
+    "lsr x19, %x[n_channels], #0x3\n"
+    "cbz x19, 4f\n"
+    "ldr q16, [x17, #0x0]\n"
+    "ldr q0, [x17, #0x10]\n"
+    "cmp x21, x19, LSL #4\n"
+    "ldr q1, [x17, #0x20]\n"
+    "ldr q2, [x17, #0x30]\n"
+    "ldr q3, [x17, #0x40]\n"
+    "ldr q4, [x17, #0x50]\n"
+    "ldr q5, [x17, #0x60]\n"
+    "ldr q6, [x17, #0x70]\n"
+    "ldr q7, [x17, #0x80]\n"
+    "ldr q8, [x17, #0x90]\n"
+    "add x17, x17, #0xa0\n"
+    "ldr q9, [x11, x28]\n"
+    "ld1 { v10.8h }, [x15]\n"
+    "ldr q11, [x15, x26]\n"
+    "ld1 { v12.8h }, [x9]\n"
+    "ldr q13, [x12, x28]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "add x20, x20, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "add x23, x23, #0x10\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "add x21, x21, #0x10\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+    "cmp x21, x19, LSL #4\n"
+    "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+    "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+    "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+    "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+    "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+    "ldr q16, [x17, #0x0]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x11, x27]\n"
+    "fmla v29.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x11, x16]\n"
+    "fmla v25.8h, v6.8h, v12.8h\n"
+    "ldr q12, [x9, x26]\n"
+    "fmla v30.8h, v4.8h, v13.8h\n"
+    "fmla v31.8h, v5.8h, v13.8h\n"
+    "fmla v29.8h, v3.8h, v13.8h\n"
+    "fmla v28.8h, v2.8h, v13.8h\n"
+    "fmla v27.8h, v1.8h, v13.8h\n"
+    "fmla v26.8h, v0.8h, v13.8h\n"
+    "ldr q13, [x15, x16]\n"
+    "fmla v23.8h, v8.8h, v12.8h\n"
+    "ldr q12, [x15, x27]\n"
+    "fmla v31.8h, v7.8h, v11.8h\n"
+    "fmla v30.8h, v6.8h, v11.8h\n"
+    "fmla v28.8h, v4.8h, v11.8h\n"
+    "fmla v27.8h, v3.8h, v11.8h\n"
+    "fmla v25.8h, v1.8h, v11.8h\n"
+    "fmla v24.8h, v0.8h, v11.8h\n"
+    "ld1 { v11.8h }, [x12]\n"
+    "fmla v31.8h, v1.8h, v13.8h\n"
+    "fmla v30.8h, v0.8h, v13.8h\n"
+    "ldr q13, [x12, x26]\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "fmla v27.8h, v5.8h, v10.8h\n"
+    "fmla v26.8h, v4.8h, v10.8h\n"
+    "fmla v30.8h, v2.8h, v12.8h\n"
+    "ld1 { v12.8h }, [x10]\n"
+    "fmla v29.8h, v7.8h, v10.8h\n"
+    "fmla v24.8h, v2.8h, v10.8h\n"
+    "fmla v23.8h, v1.8h, v10.8h\n"
+    "fmla v30.8h, v8.8h, v10.8h\n"
+    "ldr q10, [x10, x28]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "fmla v28.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x10, x26]\n"
+    "fmla v29.8h, v5.8h, v13.8h\n"
+    "fmla v26.8h, v2.8h, v13.8h\n"
+    "ldr q13, [x9, x16]\n"
+    "fmla v25.8h, v3.8h, v12.8h\n"
+    "fmla v28.8h, v6.8h, v12.8h\n"
+    "ldr q12, [x12, x16]\n"
+    "fmla v27.8h, v7.8h, v10.8h\n"
+    "fmla v26.8h, v6.8h, v10.8h\n"
+    "fmla v25.8h, v5.8h, v10.8h\n"
+    "fmla v28.8h, v8.8h, v10.8h\n"
+    "fmla v24.8h, v4.8h, v10.8h\n"
+    "fmla v23.8h, v3.8h, v10.8h\n"
+    "fmla v26.8h, v8.8h, v11.8h\n"
+    "fmla v25.8h, v7.8h, v13.8h\n"
+    "fmla v24.8h, v6.8h, v13.8h\n"
+    "ldr q13, [x9, x27]\n"
+    "fmla v23.8h, v5.8h, v11.8h\n"
+    "ldr q11, [x12, x27]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "fmla v27.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x10, x16]\n"
+    "fmla v29.8h, v4.8h, v11.8h\n"
+    "fmla v30.8h, v5.8h, v11.8h\n"
+    "fmla v26.8h, v1.8h, v11.8h\n"
+    "fmla v27.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x15, x28]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v24.8h, v8.8h, v13.8h\n"
+    "ld1 { v10.8h }, [x15]\n"
+    "fmla v23.8h, v7.8h, v13.8h\n"
+    "ldr q13, [x10, x27]\n"
+    "add x10, x10, #0x10\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmla v27.8h, v6.8h, v12.8h\n"
+    "fmla v25.8h, v4.8h, v12.8h\n"
+    "fmla v24.8h, v3.8h, v12.8h\n"
+    "ld1 { v12.8h }, [x11]\n"
+    "fmla v31.8h, v2.8h, v11.8h\n"
+    "fmla v30.8h, v1.8h, v11.8h\n"
+    "ldr q1, [x17, #0x20]\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x11, x26]\n"
+    "add x11, x11, #0x10\n"
+    "fmla v27.8h, v8.8h, v13.8h\n"
+    "ldr q9, [x11, x28]\n"
+    "fmla v26.8h, v7.8h, v13.8h\n"
+    "fmla v24.8h, v5.8h, v13.8h\n"
+    "fmla v23.8h, v4.8h, v13.8h\n"
+    "ldr q13, [x9, x28]\n"
+    "add x9, x9, #0x10\n"
+    "fmla v31.8h, v6.8h, v12.8h\n"
+    "ldr q4, [x17, #0x50]\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "ldr q3, [x17, #0x40]\n"
+    "fmla v25.8h, v0.8h, v12.8h\n"
+    "ld1 { v12.8h }, [x9]\n"
+    "fmla v29.8h, v8.8h, v11.8h\n"
+    "ldr q0, [x17, #0x10]\n"
+    "fmla v26.8h, v5.8h, v11.8h\n"
+    "ldr q5, [x17, #0x60]\n"
+    "fmla v23.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x15, x26]\n"
+    "fmla v25.8h, v8.8h, v13.8h\n"
+    "ldr q2, [x17, #0x30]\n"
+    "fmla v24.8h, v7.8h, v13.8h\n"
+    "ldr q7, [x17, #0x80]\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "ldr q8, [x17, #0x90]\n"
+    "fmla v23.8h, v6.8h, v13.8h\n"
+    "ldr q13, [x12, x28]\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "ldr q6, [x17, #0x70]\n"
+    "add x17, x17, #0xa0\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "st1 { v31.8h }, [x13]\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "str q30, [x13, x14]\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "str q29, [x13, x22]\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "add x13, x13, #0x10\n"
+    "fmax v27.8h, v27.8h, v18.8h\n"
+    "st1 { v28.8h }, [x25]\n"
+    "fmax v26.8h, v26.8h, v18.8h\n"
+    "fmax v25.8h, v25.8h, v18.8h\n"
+    "fmin v27.8h, v27.8h, v17.8h\n"
+    "str q27, [x25, x14]\n"
+    "fmin v26.8h, v26.8h, v17.8h\n"
+    "fmin v25.8h, v25.8h, v17.8h\n"
+    "str q26, [x25, x22]\n"
+    "fmax v24.8h, v24.8h, v18.8h\n"
+    "add x25, x25, #0x10\n"
+    "fmax v23.8h, v23.8h, v18.8h\n"
+    "st1 { v25.8h }, [x24]\n"
+    "fmin v24.8h, v24.8h, v17.8h\n"
+    "str q24, [x24, x14]\n"
+    "fmin v23.8h, v23.8h, v17.8h\n"
+    "str q23, [x24, x22]\n"
+    "add x24, x24, #0x10\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+    "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+    "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+    "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+    "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+    "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x11, x27]\n"
+    "fmla v29.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x11, x16]\n"
+    "fmla v25.8h, v6.8h, v12.8h\n"
+    "ldr q12, [x9, x26]\n"
+    "fmla v30.8h, v4.8h, v13.8h\n"
+    "fmla v31.8h, v5.8h, v13.8h\n"
+    "fmla v29.8h, v3.8h, v13.8h\n"
+    "fmla v28.8h, v2.8h, v13.8h\n"
+    "fmla v27.8h, v1.8h, v13.8h\n"
+    "fmla v26.8h, v0.8h, v13.8h\n"
+    "ldr q13, [x15, x16]\n"
+    "fmla v23.8h, v8.8h, v12.8h\n"
+    "ldr q12, [x15, x27]\n"
+    "fmla v31.8h, v7.8h, v11.8h\n"
+    "fmla v30.8h, v6.8h, v11.8h\n"
+    "fmla v28.8h, v4.8h, v11.8h\n"
+    "fmla v27.8h, v3.8h, v11.8h\n"
+    "fmla v25.8h, v1.8h, v11.8h\n"
+    "fmla v24.8h, v0.8h, v11.8h\n"
+    "ld1 { v11.8h }, [x12]\n"
+    "fmla v31.8h, v1.8h, v13.8h\n"
+    "fmla v30.8h, v0.8h, v13.8h\n"
+    "ldr q13, [x12, x26]\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "fmla v27.8h, v5.8h, v10.8h\n"
+    "fmla v26.8h, v4.8h, v10.8h\n"
+    "fmla v30.8h, v2.8h, v12.8h\n"
+    "ld1 { v12.8h }, [x10]\n"
+    "fmla v29.8h, v7.8h, v10.8h\n"
+    "fmla v24.8h, v2.8h, v10.8h\n"
+    "fmla v23.8h, v1.8h, v10.8h\n"
+    "fmla v30.8h, v8.8h, v10.8h\n"
+    "ldr q10, [x10, x28]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "fmla v28.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x10, x26]\n"
+    "fmla v29.8h, v5.8h, v13.8h\n"
+    "fmla v26.8h, v2.8h, v13.8h\n"
+    "ldr q13, [x9, x16]\n"
+    "fmla v25.8h, v3.8h, v12.8h\n"
+    "fmla v28.8h, v6.8h, v12.8h\n"
+    "ldr q12, [x12, x16]\n"
+    "fmla v27.8h, v7.8h, v10.8h\n"
+    "fmla v26.8h, v6.8h, v10.8h\n"
+    "fmla v25.8h, v5.8h, v10.8h\n"
+    "fmla v28.8h, v8.8h, v10.8h\n"
+    "fmla v24.8h, v4.8h, v10.8h\n"
+    "fmla v23.8h, v3.8h, v10.8h\n"
+    "fmla v26.8h, v8.8h, v11.8h\n"
+    "fmla v25.8h, v7.8h, v13.8h\n"
+    "fmla v24.8h, v6.8h, v13.8h\n"
+    "ldr q13, [x9, x27]\n"
+    "fmla v23.8h, v5.8h, v11.8h\n"
+    "ldr q11, [x12, x27]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "fmla v27.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x10, x16]\n"
+    "fmla v29.8h, v4.8h, v11.8h\n"
+    "fmla v30.8h, v5.8h, v11.8h\n"
+    "fmla v26.8h, v1.8h, v11.8h\n"
+    "fmla v27.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x15, x28]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v24.8h, v8.8h, v13.8h\n"
+    "fmla v23.8h, v7.8h, v13.8h\n"
+    "ldr q13, [x10, x27]\n"
+    "add x10, x10, #0x10\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmla v27.8h, v6.8h, v12.8h\n"
+    "fmla v25.8h, v4.8h, v12.8h\n"
+    "fmla v24.8h, v3.8h, v12.8h\n"
+    "ld1 { v12.8h }, [x11]\n"
+    "fmla v31.8h, v2.8h, v11.8h\n"
+    "fmla v30.8h, v1.8h, v11.8h\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x11, x26]\n"
+    "add x11, x11, #0x10\n"
+    "fmla v27.8h, v8.8h, v13.8h\n"
+    "fmla v26.8h, v7.8h, v13.8h\n"
+    "fmla v24.8h, v5.8h, v13.8h\n"
+    "fmla v23.8h, v4.8h, v13.8h\n"
+    "ldr q13, [x9, x28]\n"
+    "add x9, x9, #0x10\n"
+    "fmla v31.8h, v6.8h, v12.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "fmla v25.8h, v0.8h, v12.8h\n"
+    "fmla v29.8h, v8.8h, v11.8h\n"
+    "fmla v26.8h, v5.8h, v11.8h\n"
+    "fmla v23.8h, v2.8h, v11.8h\n"
+    "fmla v25.8h, v8.8h, v13.8h\n"
+    "fmla v24.8h, v7.8h, v13.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmla v23.8h, v6.8h, v13.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "st1 { v31.8h }, [x13]\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "str q30, [x13, x14]\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "str q29, [x13, x22]\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "add x13, x13, #0x10\n"
+    "fmax v27.8h, v27.8h, v18.8h\n"
+    "st1 { v28.8h }, [x25]\n"
+    "fmax v26.8h, v26.8h, v18.8h\n"
+    "fmax v25.8h, v25.8h, v18.8h\n"
+    "fmin v27.8h, v27.8h, v17.8h\n"
+    "str q27, [x25, x14]\n"
+    "fmin v26.8h, v26.8h, v17.8h\n"
+    "fmin v25.8h, v25.8h, v17.8h\n"
+    "str q26, [x25, x22]\n"
+    "fmax v24.8h, v24.8h, v18.8h\n"
+    "add x25, x25, #0x10\n"
+    "fmax v23.8h, v23.8h, v18.8h\n"
+    "st1 { v25.8h }, [x24]\n"
+    "fmin v24.8h, v24.8h, v17.8h\n"
+    "str q24, [x24, x14]\n"
+    "fmin v23.8h, v23.8h, v17.8h\n"
+    "str q23, [x24, x22]\n"
+    "add x24, x24, #0x10\n"
+    "4:"  // Tile loop: Oddments
+    "tst %x[n_channels], #0x1\n"
+    "beq 49f\n"
+    "ldr q16, [x17, #0x0]\n"
+    "ldr q0, [x17, #0x10]\n"
+    "add x23, x11, x28\n"
+    "ldr q1, [x17, #0x20]\n"
+    "add x22, x15, XZR\n"
+    "ldr q2, [x17, #0x30]\n"
+    "add x21, x15, x26\n"
+    "ldr q3, [x17, #0x40]\n"
+    "add x20, x9, XZR\n"
+    "ldr q4, [x17, #0x50]\n"
+    "add x19, x12, x28\n"
+    "ldr q5, [x17, #0x60]\n"
+    "ldr q6, [x17, #0x70]\n"
+    "ldr q7, [x17, #0x80]\n"
+    "ldr q8, [x17, #0x90]\n"
+    "tbz %x[n_channels], #1, 5f\n"
+    "ldr s9, [x23], #0x4\n"
+    "ldr s10, [x22], #0x4\n"
+    "ldr s11, [x21], #0x4\n"
+    "ldr s12, [x20], #0x4\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v9.h }[2], [x23]\n"
+    "ld1 { v10.h }[2], [x22]\n"
+    "ld1 { v11.h }[2], [x21]\n"
+    "ld1 { v12.h }[2], [x20]\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 6f\n"
+    "5:"  // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+    "ldr h9, [x23, #0x0]\n"
+    "ldr h10, [x22, #0x0]\n"
+    "ldr h11, [x21, #0x0]\n"
+    "ldr h12, [x20, #0x0]\n"
+    "ldr h13, [x19, #0x0]\n"
+    "6:"  // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+    "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "add x19, x9, x26\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+    "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+    "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+    "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+    "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+    "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "fmla v29.8h, v2.8h, v11.8h\n"
+    "fmla v25.8h, v6.8h, v12.8h\n"
+    "fmla v30.8h, v4.8h, v13.8h\n"
+    "fmla v31.8h, v5.8h, v13.8h\n"
+    "fmla v29.8h, v3.8h, v13.8h\n"
+    "fmla v28.8h, v2.8h, v13.8h\n"
+    "fmla v27.8h, v1.8h, v13.8h\n"
+    "fmla v26.8h, v0.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 8f\n"
+    "7:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "8:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+    "fmla v23.8h, v8.8h, v12.8h\n"
+    "add x19, x11, x16\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 10f\n"
+    "9:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "10:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+    "fmla v31.8h, v7.8h, v11.8h\n"
+    "add x19, x15, x16\n"
+    "fmla v30.8h, v6.8h, v11.8h\n"
+    "fmla v28.8h, v4.8h, v11.8h\n"
+    "fmla v27.8h, v3.8h, v11.8h\n"
+    "fmla v25.8h, v1.8h, v11.8h\n"
+    "fmla v24.8h, v0.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 12f\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 12f\n"
+    "11:"  // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+    "ldr h13, [x19, #0x0]\n"
+    "12:"  // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+    "fmla v31.8h, v1.8h, v13.8h\n"
+    "add x19, x15, x27\n"
+    "fmla v30.8h, v0.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 14f\n"
+    "13:"  // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "14:"  // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+    "fmla v30.8h, v2.8h, v12.8h\n"
+    "add x19, x11, x27\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 16f\n"
+    "15:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "16:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+    "fmla v30.8h, v8.8h, v10.8h\n"
+    "add x19, x12, XZR\n"
+    "fmla v29.8h, v7.8h, v10.8h\n"
+    "fmla v27.8h, v5.8h, v10.8h\n"
+    "fmla v26.8h, v4.8h, v10.8h\n"
+    "fmla v24.8h, v2.8h, v10.8h\n"
+    "fmla v23.8h, v1.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 17f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 18f\n"
+    "17:"  // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "18:"  // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "add x19, x12, x26\n"
+    "fmla v28.8h, v0.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 19f\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 20f\n"
+    "19:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+    "ldr h13, [x19, #0x0]\n"
+    "20:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+    "fmla v29.8h, v5.8h, v13.8h\n"
+    "add x19, x10, XZR\n"
+    "fmla v26.8h, v2.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 21f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 22f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 22f\n"
+    "21:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "22:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+    "fmla v28.8h, v6.8h, v12.8h\n"
+    "add x19, x10, x28\n"
+    "fmla v25.8h, v3.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 23f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 24f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 24f\n"
+    "23:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "24:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+    "fmla v28.8h, v8.8h, v10.8h\n"
+    "add x19, x10, x26\n"
+    "fmla v27.8h, v7.8h, v10.8h\n"
+    "fmla v26.8h, v6.8h, v10.8h\n"
+    "fmla v25.8h, v5.8h, v10.8h\n"
+    "fmla v24.8h, v4.8h, v10.8h\n"
+    "fmla v23.8h, v3.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 25f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 26f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 26f\n"
+    "25:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "26:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+    "fmla v26.8h, v8.8h, v11.8h\n"
+    "add x19, x9, x16\n"
+    "fmla v23.8h, v5.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 27f\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 28f\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 28f\n"
+    "27:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+    "ldr h13, [x19, #0x0]\n"
+    "28:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+    "fmla v25.8h, v7.8h, v13.8h\n"
+    "add x19, x12, x16\n"
+    "fmla v24.8h, v6.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 29f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 30f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 30f\n"
+    "29:"  // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "30:"  // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "add x19, x12, x27\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "fmla v27.8h, v0.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 31f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 32f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 32f\n"
+    "31:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "32:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+    "fmla v30.8h, v5.8h, v11.8h\n"
+    "add x19, x9, x27\n"
+    "fmla v29.8h, v4.8h, v11.8h\n"
+    "fmla v27.8h, v2.8h, v11.8h\n"
+    "fmla v26.8h, v1.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 33f\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 34f\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 34f\n"
+    "33:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+    "ldr h13, [x19, #0x0]\n"
+    "34:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+    "fmla v24.8h, v8.8h, v13.8h\n"
+    "add x19, x10, x16\n"
+    "fmla v23.8h, v7.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 35f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 36f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 36f\n"
+    "35:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "36:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "add x19, x15, x28\n"
+    "fmla v27.8h, v6.8h, v12.8h\n"
+    "fmla v25.8h, v4.8h, v12.8h\n"
+    "fmla v24.8h, v3.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 37f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 38f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 38f\n"
+    "37:"  // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "38:"  // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+    "fmla v31.8h, v2.8h, v11.8h\n"
+    "add x19, x10, x27\n"
+    "fmla v30.8h, v1.8h, v11.8h\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 39f\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 40f\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 40f\n"
+    "39:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+    "ldr h13, [x19, #0x0]\n"
+    "40:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+    "fmla v27.8h, v8.8h, v13.8h\n"
+    "add x19, x11, XZR\n"
+    "fmla v26.8h, v7.8h, v13.8h\n"
+    "fmla v24.8h, v5.8h, v13.8h\n"
+    "fmla v23.8h, v4.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 41f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 42f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 42f\n"
+    "41:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "42:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+    "fmla v31.8h, v6.8h, v12.8h\n"
+    "add x19, x11, x26\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "fmla v25.8h, v0.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 43f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 44f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 44f\n"
+    "43:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "44:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+    "fmla v29.8h, v8.8h, v11.8h\n"
+    "add x19, x9, x28\n"
+    "fmla v26.8h, v5.8h, v11.8h\n"
+    "fmla v23.8h, v2.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 45f\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 46f\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 46f\n"
+    "45:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+    "ldr h13, [x19, #0x0]\n"
+    "46:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+    "fmla v25.8h, v8.8h, v13.8h\n"
+    "fmla v24.8h, v7.8h, v13.8h\n"
+    "fmla v23.8h, v6.8h, v13.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "fmax v27.8h, v27.8h, v18.8h\n"
+    "fmax v26.8h, v26.8h, v18.8h\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "fmin v27.8h, v27.8h, v17.8h\n"
+    "fmin v26.8h, v26.8h, v17.8h\n"
+    "fmax v25.8h, v25.8h, v18.8h\n"
+    "fmax v24.8h, v24.8h, v18.8h\n"
+    "fmax v23.8h, v23.8h, v18.8h\n"
+    "fmin v25.8h, v25.8h, v17.8h\n"
+    "fmin v24.8h, v24.8h, v17.8h\n"
+    "fmin v23.8h, v23.8h, v17.8h\n"
+    "tbz %x[n_channels], #1, 47f\n"
+    "mov x19, x13\n"
+    "st1 { v31.s }[0], [x19], x14\n"
+    "add x13, x13, #0x4\n"
+    "st1 { v30.s }[0], [x19], x14\n"
+    "mov x20, x25\n"
+    "st1 { v29.s }[0], [x19]\n"
+    "st1 { v28.s }[0], [x20], x14\n"
+    "add x25, x25, #0x4\n"
+    "st1 { v27.s }[0], [x20], x14\n"
+    "mov x19, x24\n"
+    "st1 { v26.s }[0], [x20]\n"
+    "add x24, x24, #0x4\n"
+    "st1 { v25.s }[0], [x19], x14\n"
+    "st1 { v24.s }[0], [x19], x14\n"
+    "st1 { v23.s }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 48f\n"
+    "mov x21, x13\n"
+    "st1 { v31.h }[2], [x21], x14\n"
+    "mov x20, x25\n"
+    "st1 { v30.h }[2], [x21], x14\n"
+    "st1 { v28.h }[2], [x20], x14\n"
+    "mov x19, x24\n"
+    "st1 { v29.h }[2], [x21]\n"
+    "st1 { v27.h }[2], [x20], x14\n"
+    "st1 { v26.h }[2], [x20]\n"
+    "st1 { v25.h }[2], [x19], x14\n"
+    "st1 { v24.h }[2], [x19], x14\n"
+    "st1 { v23.h }[2], [x19]\n"
+    "b 48f\n"
+    "47:"  // Tile loop: Oddments: Store: Bit 1: Unset
+    "mov x21, x13\n"
+    "st1 { v31.h }[0], [x21], x14\n"
+    "mov x20, x25\n"
+    "mov x19, x24\n"
+    "st1 { v30.h }[0], [x21], x14\n"
+    "st1 { v28.h }[0], [x20], x14\n"
+    "st1 { v29.h }[0], [x21]\n"
+    "st1 { v27.h }[0], [x20], x14\n"
+    "st1 { v26.h }[0], [x20]\n"
+    "st1 { v25.h }[0], [x19], x14\n"
+    "st1 { v24.h }[0], [x19], x14\n"
+    "st1 { v23.h }[0], [x19]\n"
+    "48:"  // Tile loop: Oddments: Store: Bit 1: End
+
+    "49:"  // Tile loop: End
+    "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x7, #0x1\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x8, x8, #0x1\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x8, x19\n"
+    "csel x8, x8, XZR, LT\n"
+    "csel x7, x7, x21, LT\n"
+    "cmp x7, x20\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..ed47c30
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,907 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[25];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
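+      // Reorder the 25 input pointers (a 5x5 patch feeding a 3x3 kernel
+      // that yields a 3x3 output at stride 1) into the order in which the
+      // assembly consumes them: the centre element (2, 2) comes first,
+      // matching the "Load inputs: (2, 2), (0, 0), ..." markers below.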
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[4];
+      inptrs[3] = input_ptrs[20];
+      inptrs[4] = input_ptrs[7];
+      inptrs[5] = input_ptrs[24];
+      inptrs[6] = input_ptrs[11];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[3];
+      inptrs[9] = input_ptrs[13];
+      inptrs[10] = input_ptrs[5];
+      inptrs[11] = input_ptrs[9];
+      inptrs[12] = input_ptrs[15];
+      inptrs[13] = input_ptrs[17];
+      inptrs[14] = input_ptrs[19];
+      inptrs[15] = input_ptrs[21];
+      inptrs[16] = input_ptrs[6];
+      inptrs[17] = input_ptrs[8];
+      inptrs[18] = input_ptrs[23];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[18];
+      inptrs[22] = input_ptrs[10];
+      inptrs[23] = input_ptrs[14];
+      inptrs[24] = input_ptrs[22];
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
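+  // Control flow of the assembly below: label 1 is the main channel loop,
+  // consuming eight fp16 lanes (one q register) per iteration; label 2 is
+  // the channel tail; the "Oddments" paths after label 3 handle the
+  // sub-vector remainder of channels.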
+  __asm__ __volatile__(
+    "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "add x19, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v18.8h }, [x20]\n"
+    "ld1r { v17.8h }, [x19]\n"
+    "mov x14, #0x0\n"
+    "mov x13, #0x10\n" // cntb _, ALL, #1
+    "sub x12, XZR, x13\n"
+    "lsr x11, %x[n_channels], #0x3\n"
+    "cbz x11, 3f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "cmp x13, x11, LSL #4\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "ldp x10, x9, [x16, #0x0]\n"
+    "ldp x28, x27, [x16, #0x10]\n"
+    "ldr x26, [x16, #0x20]\n"
+    "ldr q9, [x10, x14]\n"
+    "ldr q10, [x9, x14]\n"
+    "ldr q11, [x28, x14]\n"
+    "ldr q12, [x27, x14]\n"
+    "ldr q13, [x26, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "ldr x25, [x16, #0x28]\n"
+    "add x12, x12, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "ldr x24, [x16, #0x30]\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "ldr x23, [x16, #0x38]\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+    "ldr x10, [x16, #0x40]\n"
+    "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+    "ldr x9, [x16, #0x48]\n"
+    "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+    "ldr x28, [x16, #0x50]\n"
+    "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+    "ldr x27, [x16, #0x58]\n"
+    "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+    "ldr x26, [x16, #0x60]\n"
+    "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+    "ldr x22, [x17, #0x0]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x9, x14]\n"
+    "fmla v29.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v25.8h, v6.8h, v12.8h\n"
+    "ldr q12, [x25, x14]\n"
+    "fmla v30.8h, v4.8h, v13.8h\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla v31.8h, v5.8h, v13.8h\n"
+    "ldr x24, [x16, #0x70]\n"
+    "fmla v29.8h, v3.8h, v13.8h\n"
+    "ldr x9, [x16, #0x88]\n"
+    "fmla v28.8h, v2.8h, v13.8h\n"
+    "ldr x21, [x17, #0x8]\n"
+    "fmla v27.8h, v1.8h, v13.8h\n"
+    "ldr x20, [x17, #0x10]\n"
+    "fmla v26.8h, v0.8h, v13.8h\n"
+    "ldr q13, [x23, x14]\n"
+    "fmla v23.8h, v8.8h, v12.8h\n"
+    "ldr q12, [x10, x14]\n"
+    "fmla v31.8h, v7.8h, v11.8h\n"
+    "ldr x23, [x16, #0x78]\n"
+    "fmla v30.8h, v6.8h, v11.8h\n"
+    "ldr x10, [x16, #0x80]\n"
+    "fmla v28.8h, v4.8h, v11.8h\n"
+    "ldr x19, [x17, #0x18]\n"
+    "fmla v27.8h, v3.8h, v11.8h\n"
+    "ldr q16, [x15, #0x0]\n"
+    "fmla v25.8h, v1.8h, v11.8h\n"
+    "fmla v24.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x28, x14]\n"
+    "fmla v31.8h, v1.8h, v13.8h\n"
+    "ldr x28, [x16, #0x90]\n"
+    "fmla v30.8h, v0.8h, v13.8h\n"
+    "ldr q13, [x27, x14]\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "ldr x27, [x16, #0x98]\n"
+    "fmla v27.8h, v5.8h, v10.8h\n"
+    "fmla v26.8h, v4.8h, v10.8h\n"
+    "fmla v30.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x26, x14]\n"
+    "fmla v29.8h, v7.8h, v10.8h\n"
+    "ldr x26, [x16, #0xa0]\n"
+    "fmla v24.8h, v2.8h, v10.8h\n"
+    "fmla v23.8h, v1.8h, v10.8h\n"
+    "fmla v30.8h, v8.8h, v10.8h\n"
+    "ldr q10, [x25, x14]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla v28.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v29.8h, v5.8h, v13.8h\n"
+    "ldr x24, [x16, #0xb0]\n"
+    "fmla v26.8h, v2.8h, v13.8h\n"
+    "ldr q13, [x23, x14]\n"
+    "fmla v25.8h, v3.8h, v12.8h\n"
+    "ldr x23, [x16, #0xb8]\n"
+    "fmla v28.8h, v6.8h, v12.8h\n"
+    "ldr q12, [x10, x14]\n"
+    "fmla v27.8h, v7.8h, v10.8h\n"
+    "ldr x10, [x16, #0xc0]\n"
+    "fmla v26.8h, v6.8h, v10.8h\n"
+    "fmla v25.8h, v5.8h, v10.8h\n"
+    "fmla v28.8h, v8.8h, v10.8h\n"
+    "fmla v24.8h, v4.8h, v10.8h\n"
+    "fmla v23.8h, v3.8h, v10.8h\n"
+    "fmla v26.8h, v8.8h, v11.8h\n"
+    "fmla v25.8h, v7.8h, v13.8h\n"
+    "fmla v24.8h, v6.8h, v13.8h\n"
+    "ldr q13, [x28, x14]\n"
+    "fmla v23.8h, v5.8h, v11.8h\n"
+    "ldr q11, [x9, x14]\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "fmla v27.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x27, x14]\n"
+    "fmla v29.8h, v4.8h, v11.8h\n"
+    "fmla v30.8h, v5.8h, v11.8h\n"
+    "fmla v26.8h, v1.8h, v11.8h\n"
+    "fmla v27.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x26, x14]\n"
+    "fmla v24.8h, v8.8h, v13.8h\n"
+    "ldr x26, [x16, #0x20]\n"
+    "fmla v23.8h, v7.8h, v13.8h\n"
+    "ldr q13, [x25, x14]\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmla v27.8h, v6.8h, v12.8h\n"
+    "fmla v25.8h, v4.8h, v12.8h\n"
+    "fmla v24.8h, v3.8h, v12.8h\n"
+    "ldr q12, [x24, x14]\n"
+    "fmla v31.8h, v2.8h, v11.8h\n"
+    "fmla v30.8h, v1.8h, v11.8h\n"
+    "ldr q1, [x15, #0x20]\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x23, x14]\n"
+    "fmla v27.8h, v8.8h, v13.8h\n"
+    "fmla v26.8h, v7.8h, v13.8h\n"
+    "fmla v24.8h, v5.8h, v13.8h\n"
+    "fmla v23.8h, v4.8h, v13.8h\n"
+    "ldr q13, [x10, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.8h, v6.8h, v12.8h\n"
+    "ldp x10, x9, [x16, #0x0]\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "ldp x28, x27, [x16, #0x10]\n"
+    "fmla v25.8h, v0.8h, v12.8h\n"
+    "ldr q0, [x15, #0x10]\n"
+    "fmla v29.8h, v8.8h, v11.8h\n"
+    "ldr q9, [x10, x13]\n"
+    "fmla v26.8h, v5.8h, v11.8h\n"
+    "ldr q10, [x9, x13]\n"
+    "fmla v23.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x28, x13]\n"
+    "fmla v25.8h, v8.8h, v13.8h\n"
+    "ldr q12, [x27, x13]\n"
+    "fmla v24.8h, v7.8h, v13.8h\n"
+    "ldr q2, [x15, #0x30]\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "ldr q3, [x15, #0x40]\n"
+    "fmla v23.8h, v6.8h, v13.8h\n"
+    "ldr q13, [x26, x13]\n"
+    "add x13, x13, #0x10\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "ldr q4, [x15, #0x50]\n"
+    "cmp x13, x11, LSL #4\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "ldr q5, [x15, #0x60]\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "ldr q6, [x15, #0x70]\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "str q31, [x22, x12]\n"
+    "fmax v27.8h, v27.8h, v18.8h\n"
+    "ldr x22, [x17, #0x20]\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "ldr q7, [x15, #0x80]\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "str q30, [x21, x12]\n"
+    "fmin v27.8h, v27.8h, v17.8h\n"
+    "str q29, [x20, x12]\n"
+    "fmax v26.8h, v26.8h, v18.8h\n"
+    "ldr x21, [x17, #0x28]\n"
+    "fmax v25.8h, v25.8h, v18.8h\n"
+    "str q28, [x19, x12]\n"
+    "fmax v24.8h, v24.8h, v18.8h\n"
+    "str q27, [x22, x12]\n"
+    "fmin v26.8h, v26.8h, v17.8h\n"
+    "ldr x20, [x17, #0x30]\n"
+    "fmin v25.8h, v25.8h, v17.8h\n"
+    "ldr x19, [x17, #0x38]\n"
+    "fmin v24.8h, v24.8h, v17.8h\n"
+    "str q26, [x21, x12]\n"
+    "fmax v23.8h, v23.8h, v18.8h\n"
+    "str q25, [x20, x12]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "fmin v23.8h, v23.8h, v17.8h\n"
+    "str q24, [x19, x12]\n"
+    "str q23, [x22, x12]\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "ldr x25, [x16, #0x28]\n"
+    "add x12, x12, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "ldr x24, [x16, #0x30]\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "ldr x23, [x16, #0x38]\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+    "ldr x10, [x16, #0x40]\n"
+    "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+    "ldr x9, [x16, #0x48]\n"
+    "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+    "ldr x28, [x16, #0x50]\n"
+    "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+    "ldr x27, [x16, #0x58]\n"
+    "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+    "ldr x26, [x16, #0x60]\n"
+    "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+    "ldr x22, [x17, #0x0]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x9, x14]\n"
+    "fmla v29.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v25.8h, v6.8h, v12.8h\n"
+    "ldr q12, [x25, x14]\n"
+    "fmla v30.8h, v4.8h, v13.8h\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla v31.8h, v5.8h, v13.8h\n"
+    "ldr x24, [x16, #0x70]\n"
+    "fmla v29.8h, v3.8h, v13.8h\n"
+    "ldr x9, [x16, #0x88]\n"
+    "fmla v28.8h, v2.8h, v13.8h\n"
+    "ldr x21, [x17, #0x8]\n"
+    "fmla v27.8h, v1.8h, v13.8h\n"
+    "ldr x20, [x17, #0x10]\n"
+    "fmla v26.8h, v0.8h, v13.8h\n"
+    "ldr q13, [x23, x14]\n"
+    "fmla v23.8h, v8.8h, v12.8h\n"
+    "ldr q12, [x10, x14]\n"
+    "fmla v31.8h, v7.8h, v11.8h\n"
+    "ldr x23, [x16, #0x78]\n"
+    "fmla v30.8h, v6.8h, v11.8h\n"
+    "ldr x10, [x16, #0x80]\n"
+    "fmla v28.8h, v4.8h, v11.8h\n"
+    "ldr x19, [x17, #0x18]\n"
+    "fmla v27.8h, v3.8h, v11.8h\n"
+    "fmla v25.8h, v1.8h, v11.8h\n"
+    "fmla v24.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x28, x14]\n"
+    "fmla v31.8h, v1.8h, v13.8h\n"
+    "ldr x28, [x16, #0x90]\n"
+    "fmla v30.8h, v0.8h, v13.8h\n"
+    "ldr q13, [x27, x14]\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "ldr x27, [x16, #0x98]\n"
+    "fmla v27.8h, v5.8h, v10.8h\n"
+    "fmla v26.8h, v4.8h, v10.8h\n"
+    "fmla v30.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x26, x14]\n"
+    "fmla v29.8h, v7.8h, v10.8h\n"
+    "ldr x26, [x16, #0xa0]\n"
+    "fmla v24.8h, v2.8h, v10.8h\n"
+    "fmla v23.8h, v1.8h, v10.8h\n"
+    "fmla v30.8h, v8.8h, v10.8h\n"
+    "ldr q10, [x25, x14]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla v28.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v29.8h, v5.8h, v13.8h\n"
+    "ldr x24, [x16, #0xb0]\n"
+    "fmla v26.8h, v2.8h, v13.8h\n"
+    "ldr q13, [x23, x14]\n"
+    "fmla v25.8h, v3.8h, v12.8h\n"
+    "ldr x23, [x16, #0xb8]\n"
+    "fmla v28.8h, v6.8h, v12.8h\n"
+    "ldr q12, [x10, x14]\n"
+    "fmla v27.8h, v7.8h, v10.8h\n"
+    "ldr x10, [x16, #0xc0]\n"
+    "fmla v26.8h, v6.8h, v10.8h\n"
+    "fmla v25.8h, v5.8h, v10.8h\n"
+    "fmla v28.8h, v8.8h, v10.8h\n"
+    "fmla v24.8h, v4.8h, v10.8h\n"
+    "fmla v23.8h, v3.8h, v10.8h\n"
+    "fmla v26.8h, v8.8h, v11.8h\n"
+    "fmla v25.8h, v7.8h, v13.8h\n"
+    "fmla v24.8h, v6.8h, v13.8h\n"
+    "ldr q13, [x28, x14]\n"
+    "fmla v23.8h, v5.8h, v11.8h\n"
+    "ldr q11, [x9, x14]\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "fmla v27.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x27, x14]\n"
+    "fmla v29.8h, v4.8h, v11.8h\n"
+    "fmla v30.8h, v5.8h, v11.8h\n"
+    "fmla v26.8h, v1.8h, v11.8h\n"
+    "fmla v27.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x26, x14]\n"
+    "fmla v24.8h, v8.8h, v13.8h\n"
+    "fmla v23.8h, v7.8h, v13.8h\n"
+    "ldr q13, [x25, x14]\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmla v27.8h, v6.8h, v12.8h\n"
+    "fmla v25.8h, v4.8h, v12.8h\n"
+    "fmla v24.8h, v3.8h, v12.8h\n"
+    "ldr q12, [x24, x14]\n"
+    "fmla v31.8h, v2.8h, v11.8h\n"
+    "fmla v30.8h, v1.8h, v11.8h\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x23, x14]\n"
+    "fmla v27.8h, v8.8h, v13.8h\n"
+    "fmla v26.8h, v7.8h, v13.8h\n"
+    "fmla v24.8h, v5.8h, v13.8h\n"
+    "fmla v23.8h, v4.8h, v13.8h\n"
+    "ldr q13, [x10, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.8h, v6.8h, v12.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "fmla v25.8h, v0.8h, v12.8h\n"
+    "fmla v29.8h, v8.8h, v11.8h\n"
+    "fmla v26.8h, v5.8h, v11.8h\n"
+    "fmla v23.8h, v2.8h, v11.8h\n"
+    "fmla v25.8h, v8.8h, v13.8h\n"
+    "fmla v24.8h, v7.8h, v13.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmla v23.8h, v6.8h, v13.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "str q31, [x22, x12]\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "ldr x22, [x17, #0x20]\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "str q30, [x21, x12]\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "fmax v27.8h, v27.8h, v18.8h\n"
+    "ldr x21, [x17, #0x28]\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "str q29, [x20, x12]\n"
+    "fmin v27.8h, v27.8h, v17.8h\n"
+    "fmax v26.8h, v26.8h, v18.8h\n"
+    "str q28, [x19, x12]\n"
+    "fmax v25.8h, v25.8h, v18.8h\n"
+    "ldr x20, [x17, #0x30]\n"
+    "fmax v24.8h, v24.8h, v18.8h\n"
+    "str q27, [x22, x12]\n"
+    "fmin v26.8h, v26.8h, v17.8h\n"
+    "ldr x19, [x17, #0x38]\n"
+    "fmin v25.8h, v25.8h, v17.8h\n"
+    "ldr x22, [x17, #0x40]\n"
+    "fmin v24.8h, v24.8h, v17.8h\n"
+    "str q26, [x21, x12]\n"
+    "fmax v23.8h, v23.8h, v18.8h\n"
+    "str q25, [x20, x12]\n"
+    "str q24, [x19, x12]\n"
+    "fmin v23.8h, v23.8h, v17.8h\n"
+    "str q23, [x22, x12]\n"
+    "3:"  // Oddments
+    "tst %x[n_channels], #0x1\n"
+    "beq 48f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "mov x12, x14\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "ldr x10, [x16, #0x0]\n"
+    "add x10, x10, x14\n"
+    "ldr x9, [x16, #0x8]\n"
+    "ldr x28, [x16, #0x10]\n"
+    "add x9, x9, x14\n"
+    "ldr x27, [x16, #0x18]\n"
+    "ldr x26, [x16, #0x20]\n"
+    "add x28, x28, x14\n"
+    "add x27, x27, x14\n"
+    "add x26, x26, x14\n"
+    "tbz %x[n_channels], #1, 4f\n"
+    "ld1 { v9.s }[0], [x10], #0x4\n"
+    "ld1 { v10.s }[0], [x9], #0x4\n"
+    "ld1 { v11.s }[0], [x28], #0x4\n"
+    "ld1 { v12.s }[0], [x27], #0x4\n"
+    "ld1 { v13.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 5f\n"
+    "ld1 { v9.h }[2], [x10], #0x2\n"
+    "ld1 { v10.h }[2], [x9], #0x2\n"
+    "ld1 { v11.h }[2], [x28], #0x2\n"
+    "ld1 { v12.h }[2], [x27], #0x2\n"
+    "ld1 { v13.h }[2], [x26], #0x2\n"
+    "b 5f\n"
+    "4:"  // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+    "ld1 { v9.h }[0], [x10], #0x2\n"
+    "ld1 { v10.h }[0], [x9], #0x2\n"
+    "ld1 { v11.h }[0], [x28], #0x2\n"
+    "ld1 { v12.h }[0], [x27], #0x2\n"
+    "ld1 { v13.h }[0], [x26], #0x2\n"
+    "5:"  // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+    "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "ldr x25, [x16, #0x28]\n"
+    "add x25, x25, x14\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+    "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+    "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+    "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+    "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+    "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "fmla v29.8h, v2.8h, v11.8h\n"
+    "fmla v25.8h, v6.8h, v12.8h\n"
+    "fmla v30.8h, v4.8h, v13.8h\n"
+    "fmla v31.8h, v5.8h, v13.8h\n"
+    "fmla v29.8h, v3.8h, v13.8h\n"
+    "fmla v28.8h, v2.8h, v13.8h\n"
+    "fmla v27.8h, v1.8h, v13.8h\n"
+    "fmla v26.8h, v0.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v12.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v12.h }[2], [x25], #0x2\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load input (4, 4): Bit 1: Unset
+    "ld1 { v12.h }[0], [x25], #0x2\n"
+    "7:"  // Oddments: Load input (4, 4): Bit 1: End
+    "fmla v23.8h, v8.8h, v12.8h\n"
+    "ldr x24, [x16, #0x30]\n"
+    "add x24, x24, x14\n"
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v11.h }[2], [x24], #0x2\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load input (2, 1): Bit 1: Unset
+    "ld1 { v11.h }[0], [x24], #0x2\n"
+    "9:"  // Oddments: Load input (2, 1): Bit 1: End
+    "fmla v31.8h, v7.8h, v11.8h\n"
+    "ldr x23, [x16, #0x38]\n"
+    "fmla v30.8h, v6.8h, v11.8h\n"
+    "add x23, x23, x14\n"
+    "fmla v28.8h, v4.8h, v11.8h\n"
+    "fmla v27.8h, v3.8h, v11.8h\n"
+    "fmla v25.8h, v1.8h, v11.8h\n"
+    "fmla v24.8h, v0.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v13.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v13.h }[2], [x23], #0x2\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load input (0, 1): Bit 1: Unset
+    "ld1 { v13.h }[0], [x23], #0x2\n"
+    "11:"  // Oddments: Load input (0, 1): Bit 1: End
+    "fmla v31.8h, v1.8h, v13.8h\n"
+    "ldr x10, [x16, #0x40]\n"
+    "fmla v30.8h, v0.8h, v13.8h\n"
+    "add x10, x10, x14\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ld1 { v12.s }[0], [x10], #0x4\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v12.h }[2], [x10], #0x2\n"
+    "b 13f\n"
+    "12:"  // Oddments: Load input (0, 3): Bit 1: Unset
+    "ld1 { v12.h }[0], [x10], #0x2\n"
+    "13:"  // Oddments: Load input (0, 3): Bit 1: End
+    "fmla v30.8h, v2.8h, v12.8h\n"
+    "ldr x9, [x16, #0x48]\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "add x9, x9, x14\n"
+    "tbz %x[n_channels], #1, 14f\n"
+    "ld1 { v10.s }[0], [x9], #0x4\n"
+    "tbz %x[n_channels], #0, 15f\n"
+    "ld1 { v10.h }[2], [x9], #0x2\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load input (2, 3): Bit 1: Unset
+    "ld1 { v10.h }[0], [x9], #0x2\n"
+    "15:"  // Oddments: Load input (2, 3): Bit 1: End
+    "fmla v30.8h, v8.8h, v10.8h\n"
+    "ldr x28, [x16, #0x50]\n"
+    "fmla v29.8h, v7.8h, v10.8h\n"
+    "add x28, x28, x14\n"
+    "fmla v27.8h, v5.8h, v10.8h\n"
+    "fmla v26.8h, v4.8h, v10.8h\n"
+    "fmla v24.8h, v2.8h, v10.8h\n"
+    "fmla v23.8h, v1.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v11.s }[0], [x28], #0x4\n"
+    "tbz %x[n_channels], #0, 17f\n"
+    "ld1 { v11.h }[2], [x28], #0x2\n"
+    "b 17f\n"
+    "16:"  // Oddments: Load input (1, 0): Bit 1: Unset
+    "ld1 { v11.h }[0], [x28], #0x2\n"
+    "17:"  // Oddments: Load input (1, 0): Bit 1: End
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr x27, [x16, #0x58]\n"
+    "fmla v28.8h, v0.8h, v11.8h\n"
+    "add x27, x27, x14\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v13.s }[0], [x27], #0x4\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v13.h }[2], [x27], #0x2\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load input (1, 4): Bit 1: Unset
+    "ld1 { v13.h }[0], [x27], #0x2\n"
+    "19:"  // Oddments: Load input (1, 4): Bit 1: End
+    "fmla v29.8h, v5.8h, v13.8h\n"
+    "ldr x26, [x16, #0x60]\n"
+    "fmla v26.8h, v2.8h, v13.8h\n"
+    "add x26, x26, x14\n"
+    "tbz %x[n_channels], #1, 20f\n"
+    "ld1 { v12.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 21f\n"
+    "ld1 { v12.h }[2], [x26], #0x2\n"
+    "b 21f\n"
+    "20:"  // Oddments: Load input (3, 0): Bit 1: Unset
+    "ld1 { v12.h }[0], [x26], #0x2\n"
+    "21:"  // Oddments: Load input (3, 0): Bit 1: End
+    "fmla v28.8h, v6.8h, v12.8h\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla v25.8h, v3.8h, v12.8h\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "ld1 { v10.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "ld1 { v10.h }[2], [x25], #0x2\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load input (3, 2): Bit 1: Unset
+    "ld1 { v10.h }[0], [x25], #0x2\n"
+    "23:"  // Oddments: Load input (3, 2): Bit 1: End
+    "fmla v28.8h, v8.8h, v10.8h\n"
+    "ldr x24, [x16, #0x70]\n"
+    "fmla v27.8h, v7.8h, v10.8h\n"
+    "add x24, x24, x14\n"
+    "fmla v26.8h, v6.8h, v10.8h\n"
+    "fmla v25.8h, v5.8h, v10.8h\n"
+    "fmla v24.8h, v4.8h, v10.8h\n"
+    "fmla v23.8h, v3.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 24f\n"
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 25f\n"
+    "ld1 { v11.h }[2], [x24], #0x2\n"
+    "b 25f\n"
+    "24:"  // Oddments: Load input (3, 4): Bit 1: Unset
+    "ld1 { v11.h }[0], [x24], #0x2\n"
+    "25:"  // Oddments: Load input (3, 4): Bit 1: End
+    "fmla v26.8h, v8.8h, v11.8h\n"
+    "ldr x23, [x16, #0x78]\n"
+    "fmla v23.8h, v5.8h, v11.8h\n"
+    "add x23, x23, x14\n"
+    "tbz %x[n_channels], #1, 26f\n"
+    "ld1 { v13.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 27f\n"
+    "ld1 { v13.h }[2], [x23], #0x2\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load input (4, 1): Bit 1: Unset
+    "ld1 { v13.h }[0], [x23], #0x2\n"
+    "27:"  // Oddments: Load input (4, 1): Bit 1: End
+    "fmla v25.8h, v7.8h, v13.8h\n"
+    "ldr x10, [x16, #0x80]\n"
+    "fmla v24.8h, v6.8h, v13.8h\n"
+    "add x10, x10, x14\n"
+    "tbz %x[n_channels], #1, 28f\n"
+    "ld1 { v12.s }[0], [x10], #0x4\n"
+    "tbz %x[n_channels], #0, 29f\n"
+    "ld1 { v12.h }[2], [x10], #0x2\n"
+    "b 29f\n"
+    "28:"  // Oddments: Load input (1, 1): Bit 1: Unset
+    "ld1 { v12.h }[0], [x10], #0x2\n"
+    "29:"  // Oddments: Load input (1, 1): Bit 1: End
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "ldr x9, [x16, #0x88]\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "add x9, x9, x14\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "fmla v27.8h, v0.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 30f\n"
+    "ld1 { v11.s }[0], [x9], #0x4\n"
+    "tbz %x[n_channels], #0, 31f\n"
+    "ld1 { v11.h }[2], [x9], #0x2\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load input (1, 3): Bit 1: Unset
+    "ld1 { v11.h }[0], [x9], #0x2\n"
+    "31:"  // Oddments: Load input (1, 3): Bit 1: End
+    "fmla v30.8h, v5.8h, v11.8h\n"
+    "ldr x28, [x16, #0x90]\n"
+    "fmla v29.8h, v4.8h, v11.8h\n"
+    "add x28, x28, x14\n"
+    "fmla v27.8h, v2.8h, v11.8h\n"
+    "fmla v26.8h, v1.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 32f\n"
+    "ld1 { v13.s }[0], [x28], #0x4\n"
+    "tbz %x[n_channels], #0, 33f\n"
+    "ld1 { v13.h }[2], [x28], #0x2\n"
+    "b 33f\n"
+    "32:"  // Oddments: Load input (4, 3): Bit 1: Unset
+    "ld1 { v13.h }[0], [x28], #0x2\n"
+    "33:"  // Oddments: Load input (4, 3): Bit 1: End
+    "fmla v24.8h, v8.8h, v13.8h\n"
+    "ldr x27, [x16, #0x98]\n"
+    "fmla v23.8h, v7.8h, v13.8h\n"
+    "add x27, x27, x14\n"
+    "tbz %x[n_channels], #1, 34f\n"
+    "ld1 { v12.s }[0], [x27], #0x4\n"
+    "tbz %x[n_channels], #0, 35f\n"
+    "ld1 { v12.h }[2], [x27], #0x2\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load input (3, 1): Bit 1: Unset
+    "ld1 { v12.h }[0], [x27], #0x2\n"
+    "35:"  // Oddments: Load input (3, 1): Bit 1: End
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "ldr x26, [x16, #0xa0]\n"
+    "fmla v27.8h, v6.8h, v12.8h\n"
+    "add x26, x26, x14\n"
+    "fmla v25.8h, v4.8h, v12.8h\n"
+    "fmla v24.8h, v3.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 36f\n"
+    "ld1 { v11.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 37f\n"
+    "ld1 { v11.h }[2], [x26], #0x2\n"
+    "b 37f\n"
+    "36:"  // Oddments: Load input (0, 2): Bit 1: Unset
+    "ld1 { v11.h }[0], [x26], #0x2\n"
+    "37:"  // Oddments: Load input (0, 2): Bit 1: End
+    "fmla v31.8h, v2.8h, v11.8h\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla v30.8h, v1.8h, v11.8h\n"
+    "add x25, x25, x14\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 38f\n"
+    "ld1 { v13.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 39f\n"
+    "ld1 { v13.h }[2], [x25], #0x2\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load input (3, 3): Bit 1: Unset
+    "ld1 { v13.h }[0], [x25], #0x2\n"
+    "39:"  // Oddments: Load input (3, 3): Bit 1: End
+    "fmla v27.8h, v8.8h, v13.8h\n"
+    "ldr x24, [x16, #0xb0]\n"
+    "fmla v26.8h, v7.8h, v13.8h\n"
+    "add x24, x24, x14\n"
+    "fmla v24.8h, v5.8h, v13.8h\n"
+    "fmla v23.8h, v4.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 40f\n"
+    "ld1 { v12.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 41f\n"
+    "ld1 { v12.h }[2], [x24], #0x2\n"
+    "b 41f\n"
+    "40:"  // Oddments: Load input (2, 0): Bit 1: Unset
+    "ld1 { v12.h }[0], [x24], #0x2\n"
+    "41:"  // Oddments: Load input (2, 0): Bit 1: End
+    "fmla v31.8h, v6.8h, v12.8h\n"
+    "ldr x23, [x16, #0xb8]\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "add x23, x23, x14\n"
+    "fmla v25.8h, v0.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 42f\n"
+    "ld1 { v11.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 43f\n"
+    "ld1 { v11.h }[2], [x23], #0x2\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load input (2, 4): Bit 1: Unset
+    "ld1 { v11.h }[0], [x23], #0x2\n"
+    "43:"  // Oddments: Load input (2, 4): Bit 1: End
+    "fmla v29.8h, v8.8h, v11.8h\n"
+    "ldr x10, [x16, #0xc0]\n"
+    "fmla v26.8h, v5.8h, v11.8h\n"
+    "add x10, x10, x14\n"
+    "fmla v23.8h, v2.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 44f\n"
+    "ld1 { v13.s }[0], [x10], #0x4\n"
+    "tbz %x[n_channels], #0, 45f\n"
+    "ld1 { v13.h }[2], [x10], #0x2\n"
+    "b 45f\n"
+    "44:"  // Oddments: Load input (4, 2): Bit 1: Unset
+    "ld1 { v13.h }[0], [x10], #0x2\n"
+    "45:"  // Oddments: Load input (4, 2): Bit 1: End
+    "fmla v25.8h, v8.8h, v13.8h\n"
+    "fmla v24.8h, v7.8h, v13.8h\n"
+    "fmla v23.8h, v6.8h, v13.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "fmax v27.8h, v27.8h, v18.8h\n"
+    "fmax v26.8h, v26.8h, v18.8h\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "fmin v27.8h, v27.8h, v17.8h\n"
+    "fmin v26.8h, v26.8h, v17.8h\n"
+    "fmax v25.8h, v25.8h, v18.8h\n"
+    "fmax v24.8h, v24.8h, v18.8h\n"
+    "fmax v23.8h, v23.8h, v18.8h\n"
+    "fmin v25.8h, v25.8h, v17.8h\n"
+    "fmin v24.8h, v24.8h, v17.8h\n"
+    "fmin v23.8h, v23.8h, v17.8h\n"
+    "tbz %x[n_channels], #1, 46f\n"
+    "ldr x22, [x17, #0x0]\n"
+    "ldr x21, [x17, #0x8]\n"
+    "add x22, x22, x12\n"
+    "ldr x20, [x17, #0x10]\n"
+    "ldr x19, [x17, #0x18]\n"
+    "add x21, x21, x12\n"
+    "st1 { v31.s }[0], [x22]\n"
+    "add x20, x20, x12\n"
+    "st1 { v30.s }[0], [x21]\n"
+    "ldr x22, [x17, #0x20]\n"
+    "add x19, x19, x12\n"
+    "st1 { v29.s }[0], [x20]\n"
+    "add x22, x22, x12\n"
+    "st1 { v28.s }[0], [x19]\n"
+    "ldr x21, [x17, #0x28]\n"
+    "add x21, x21, x12\n"
+    "st1 { v27.s }[0], [x22]\n"
+    "ldr x20, [x17, #0x30]\n"
+    "add x20, x20, x12\n"
+    "st1 { v26.s }[0], [x21]\n"
+    "ldr x19, [x17, #0x38]\n"
+    "add x19, x19, x12\n"
+    "st1 { v25.s }[0], [x20]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "add x22, x22, x12\n"
+    "st1 { v24.s }[0], [x19]\n"
+    "add x12, x12, #0x4\n"
+    "st1 { v23.s }[0], [x22]\n"
+    "tbz %x[n_channels], #0, 47f\n"
+    "ldr x22, [x17, #0x0]\n"
+    "ldr x21, [x17, #0x8]\n"
+    "add x22, x22, x12\n"
+    "ldr x20, [x17, #0x10]\n"
+    "ldr x19, [x17, #0x18]\n"
+    "add x21, x21, x12\n"
+    "st1 { v31.h }[2], [x22]\n"
+    "add x20, x20, x12\n"
+    "st1 { v30.h }[2], [x21]\n"
+    "ldr x22, [x17, #0x20]\n"
+    "add x19, x19, x12\n"
+    "st1 { v29.h }[2], [x20]\n"
+    "add x22, x22, x12\n"
+    "st1 { v28.h }[2], [x19]\n"
+    "ldr x21, [x17, #0x28]\n"
+    "add x21, x21, x12\n"
+    "st1 { v27.h }[2], [x22]\n"
+    "ldr x20, [x17, #0x30]\n"
+    "add x20, x20, x12\n"
+    "st1 { v26.h }[2], [x21]\n"
+    "ldr x19, [x17, #0x38]\n"
+    "add x19, x19, x12\n"
+    "st1 { v25.h }[2], [x20]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "add x22, x22, x12\n"
+    "st1 { v24.h }[2], [x19]\n"
+    "st1 { v23.h }[2], [x22]\n"
+    "b 47f\n"
+    "46:"  // Oddments: Store: Bit 1: Unset
+    "ldr x22, [x17, #0x0]\n"
+    "add x22, x22, x12\n"
+    "ldr x21, [x17, #0x8]\n"
+    "ldr x20, [x17, #0x10]\n"
+    "add x21, x21, x12\n"
+    "st1 { v31.h }[0], [x22]\n"
+    "ldr x19, [x17, #0x18]\n"
+    "add x20, x20, x12\n"
+    "st1 { v30.h }[0], [x21]\n"
+    "add x19, x19, x12\n"
+    "st1 { v29.h }[0], [x20]\n"
+    "ldr x22, [x17, #0x20]\n"
+    "add x22, x22, x12\n"
+    "st1 { v28.h }[0], [x19]\n"
+    "ldr x21, [x17, #0x28]\n"
+    "add x21, x21, x12\n"
+    "st1 { v27.h }[0], [x22]\n"
+    "ldr x20, [x17, #0x30]\n"
+    "add x20, x20, x12\n"
+    "st1 { v26.h }[0], [x21]\n"
+    "ldr x19, [x17, #0x38]\n"
+    "add x19, x19, x12\n"
+    "st1 { v25.h }[0], [x20]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "add x22, x22, x12\n"
+    "st1 { v24.h }[0], [x19]\n"
+    "st1 { v23.h }[0], [x22]\n"
+    "47:"  // Oddments: Store: Bit 1: End
+
+    "48:"  // End
+
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000..df53287
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
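+// The indirect variant gathers its input through an array of per-element
+// pointers, while the direct variant walks a dense tile using the row and
+// column strides passed alongside the base pointer.
+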
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
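+  // At stride 1 each output tile needs output + kernel - 1 input elements
+  // per dimension: 4 + 3 - 1 = 6.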
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..bf18469
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1233 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
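+    // The assembly tile loop stores and reloads tile_i/tile_j through the
+    // argument block to persist its position across iterations.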
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
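+  // The outer tile loop below computes per-tile input and output base
+  // pointers from the row and column strides (see the "offset = tile_i *
+  // ld_input_row" comments), then runs the vectorised channel loop and an
+  // oddments tail for leftover channels.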
+  __asm__ __volatile__(
+    "mov x4, #0x0\n"
+    "mov x26, #0x0\n"
+    "1:"  // Tile loop
+    "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x25, #0x4\n"
+    "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x24, #0x4\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x23, %x[params_struct], %[offsetof_args_min]\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "add x21, %x[params_struct], %[offsetof_args_max]\n"
+    "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mov x7, #0x0\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x4, x22\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x26, x6, x19\n" // offset += tile_j * ld_input_col
+    "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
+    "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x8, x8, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ld1r { v15.8h }, [x23]\n"
+    "add x15, x8, x22, LSL #1\n"
+    "ld1r { v14.8h }, [x21]\n"
+    "add x14, x15, x22, LSL #1\n"
+    "lsl x6, x6, #0x1\n"
+    "add x13, x14, x22, LSL #1\n"
+    "add x12, x13, x22, LSL #1\n"
+    "add x11, x12, x22, LSL #1\n"
+    "add x10, x6, x6\n"
+    "add x9, x10, x6\n"
+    "add x28, x9, x6\n"
+    "add x27, x28, x6\n"
+    "mul x19, x4, x20\n" // offset = tile_i * ld_output_row
+    "madd x19, x26, x17, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x24\n" // offset *= output_tile_size
+    "add x16, x16, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "add x26, x16, x20, LSL #1\n"
+    "add x25, x26, x20, LSL #1\n"
+    "add x24, x25, x20, LSL #1\n"
+    "lsl x17, x17, #0x1\n"
+    "add x23, x17, x17\n"
+    "add x22, x23, x17\n"
+    "mov x21, #0x10\n" // cntb _, ALL, #1
+    "sub x20, XZR, x21\n"
+    "lsr x19, %x[n_channels], #0x3\n"
+    "cbz x19, 4f\n"
+    "ldr q13, [x5, #0x0]\n"
+    "ldr q0, [x5, #0x10]\n"
+    "cmp x21, x19, LSL #4\n"
+    "ldr q1, [x5, #0x20]\n"
+    "ldr q2, [x5, #0x30]\n"
+    "ldr q3, [x5, #0x40]\n"
+    "ldr q4, [x5, #0x50]\n"
+    "ldr q5, [x5, #0x60]\n"
+    "ldr q6, [x5, #0x70]\n"
+    "ldr q7, [x5, #0x80]\n"
+    "ldr q8, [x5, #0x90]\n"
+    "add x5, x5, #0xa0\n"
+    "ldr q9, [x14, x10]\n"
+    "ld1 { v10.8h }, [x8]\n"
+    "ldr q11, [x8, x27]\n"
+    "ldr q12, [x14, x9]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "add x20, x20, #0x10\n"
+    "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "add x7, x7, #0x10\n"
+    "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "add x21, x21, #0x10\n"
+    "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+    "cmp x21, x19, LSL #4\n"
+    "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+    "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+    "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+    "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+    "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+    "ldr q9, [x13, x10]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ld1 { v10.8h }, [x11]\n"
+    "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x11, x27]\n"
+    "fmla v30.8h, v8.8h, v12.8h\n"
+    "fmla v29.8h, v7.8h, v12.8h\n"
+    "fmla v26.8h, v5.8h, v12.8h\n"
+    "fmla v28.8h, v6.8h, v12.8h\n"
+    "fmla v25.8h, v4.8h, v12.8h\n"
+    "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+    "fmla v22.8h, v2.8h, v12.8h\n"
+    "fmla v21.8h, v1.8h, v12.8h\n"
+    "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x8, x6]\n"
+    "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+    "ldr q10, [x13, x9]\n"
+    "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+    "ldr q11, [x8, x28]\n"
+    "fmla v27.8h, v8.8h, v9.8h\n"
+    "fmla v26.8h, v7.8h, v9.8h\n"
+    "fmla v25.8h, v6.8h, v9.8h\n"
+    "fmla v23.8h, v5.8h, v9.8h\n"
+    "fmla v22.8h, v4.8h, v9.8h\n"
+    "fmla v21.8h, v3.8h, v9.8h\n"
+    "fmla v19.8h, v2.8h, v9.8h\n"
+    "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+    "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+    "ld1 { v9.8h }, [x15]\n"
+    "fmla v31.8h, v1.8h, v12.8h\n"
+    "ldr q13, [x5, #0x0]\n"
+    "fmla v30.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x15, x27]\n"
+    "fmla v29.8h, v2.8h, v11.8h\n"
+    "fmla v28.8h, v1.8h, v11.8h\n"
+    "ld1 { v11.8h }, [x12]\n"
+    "fmla v26.8h, v8.8h, v10.8h\n"
+    "fmla v25.8h, v7.8h, v10.8h\n"
+    "fmla v24.8h, v6.8h, v10.8h\n"
+    "fmla v22.8h, v5.8h, v10.8h\n"
+    "fmla v21.8h, v4.8h, v10.8h\n"
+    "fmla v20.8h, v3.8h, v10.8h\n"
+    "fmla v18.8h, v2.8h, v10.8h\n"
+    "fmla v17.8h, v1.8h, v10.8h\n"
+    "fmla v16.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x15, x10]\n"
+    "fmla v31.8h, v3.8h, v9.8h\n"
+    "fmla v27.8h, v0.8h, v9.8h\n"
+    "fmla v28.8h, v5.8h, v12.8h\n"
+    "fmla v24.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x15, x9]\n"
+    "fmla v23.8h, v6.8h, v11.8h\n"
+    "fmla v19.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x12, x27]\n"
+    "fmla v31.8h, v5.8h, v10.8h\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v3.8h, v10.8h\n"
+    "fmla v27.8h, v2.8h, v10.8h\n"
+    "fmla v26.8h, v1.8h, v10.8h\n"
+    "fmla v25.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x14, x6]\n"
+    "fmla v20.8h, v8.8h, v11.8h\n"
+    "fmla v16.8h, v5.8h, v11.8h\n"
+    "ldr q11, [x11, x6]\n"
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "fmla v26.8h, v2.8h, v12.8h\n"
+    "fmla v25.8h, v1.8h, v12.8h\n"
+    "fmla v24.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x14, x28]\n"
+    "fmla v19.8h, v7.8h, v11.8h\n"
+    "fmla v18.8h, v6.8h, v11.8h\n"
+    "ldr q11, [x11, x28]\n"
+    "fmla v31.8h, v7.8h, v10.8h\n"
+    "fmla v30.8h, v6.8h, v10.8h\n"
+    "fmla v27.8h, v4.8h, v10.8h\n"
+    "fmla v26.8h, v3.8h, v10.8h\n"
+    "fmla v23.8h, v1.8h, v10.8h\n"
+    "fmla v22.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x8, x10]\n"
+    "fmla v17.8h, v8.8h, v11.8h\n"
+    "fmla v16.8h, v7.8h, v11.8h\n"
+    "ldr q11, [x13, x6]\n"
+    "fmla v29.8h, v8.8h, v12.8h\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmla v25.8h, v5.8h, v12.8h\n"
+    "fmla v24.8h, v4.8h, v12.8h\n"
+    "fmla v21.8h, v2.8h, v12.8h\n"
+    "fmla v20.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x8, x9]\n"
+    "add x8, x8, #0x10\n"
+    "fmla v31.8h, v2.8h, v10.8h\n"
+    "fmla v30.8h, v1.8h, v10.8h\n"
+    "fmla v29.8h, v0.8h, v10.8h\n"
+    "ld1 { v10.8h }, [x14]\n"
+    "fmla v27.8h, v7.8h, v11.8h\n"
+    "fmla v26.8h, v6.8h, v11.8h\n"
+    "fmla v23.8h, v4.8h, v11.8h\n"
+    "fmla v22.8h, v3.8h, v11.8h\n"
+    "fmla v19.8h, v1.8h, v11.8h\n"
+    "fmla v18.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x13, x28]\n"
+    "fmla v30.8h, v2.8h, v12.8h\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "fmla v28.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x14, x27]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.8h, v6.8h, v10.8h\n"
+    "ldr q9, [x14, x10]\n"
+    "fmla v27.8h, v3.8h, v10.8h\n"
+    "fmla v23.8h, v0.8h, v10.8h\n"
+    "ld1 { v10.8h }, [x13]\n"
+    "fmla v25.8h, v8.8h, v11.8h\n"
+    "fmla v24.8h, v7.8h, v11.8h\n"
+    "fmla v21.8h, v5.8h, v11.8h\n"
+    "fmla v20.8h, v4.8h, v11.8h\n"
+    "fmla v17.8h, v2.8h, v11.8h\n"
+    "fmla v16.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x12, x10]\n"
+    "fmla v28.8h, v8.8h, v12.8h\n"
+    "fmla v24.8h, v5.8h, v12.8h\n"
+    "fmla v20.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x13, x27]\n"
+    "add x13, x13, #0x10\n"
+    "fmla v27.8h, v6.8h, v10.8h\n"
+    "fmla v23.8h, v3.8h, v10.8h\n"
+    "fmla v19.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x11, x10]\n"
+    "fmla v22.8h, v7.8h, v11.8h\n"
+    "fmla v21.8h, v6.8h, v11.8h\n"
+    "fmla v23.8h, v8.8h, v11.8h\n"
+    "fmla v19.8h, v5.8h, v11.8h\n"
+    "fmla v18.8h, v4.8h, v11.8h\n"
+    "fmla v17.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x12, x9]\n"
+    "fmla v24.8h, v8.8h, v12.8h\n"
+    "fmla v20.8h, v5.8h, v12.8h\n"
+    "fmla v16.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x11, x9]\n"
+    "add x11, x11, #0x10\n"
+    "fmla v19.8h, v8.8h, v10.8h\n"
+    "fmla v18.8h, v7.8h, v10.8h\n"
+    "fmla v17.8h, v6.8h, v10.8h\n"
+    "ldr q10, [x15, x6]\n"
+    "fmla v22.8h, v8.8h, v11.8h\n"
+    "fmla v21.8h, v7.8h, v11.8h\n"
+    "fmla v20.8h, v6.8h, v11.8h\n"
+    "fmla v18.8h, v5.8h, v11.8h\n"
+    "fmla v17.8h, v4.8h, v11.8h\n"
+    "fmla v16.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x15, x28]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v18.8h, v8.8h, v12.8h\n"
+    "fmla v31.8h, v4.8h, v10.8h\n"
+    "fmla v17.8h, v7.8h, v12.8h\n"
+    "fmla v16.8h, v6.8h, v12.8h\n"
+    "ldr q12, [x12, x6]\n"
+    "fmla v30.8h, v3.8h, v10.8h\n"
+    "fmla v27.8h, v1.8h, v10.8h\n"
+    "fmla v26.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x12, x28]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v29.8h, v5.8h, v11.8h\n"
+    "ldr q0, [x5, #0x10]\n"
+    "fmla v28.8h, v4.8h, v11.8h\n"
+    "fmla v25.8h, v2.8h, v11.8h\n"
+    "ldr q2, [x5, #0x30]\n"
+    "fmla v24.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x8, x27]\n"
+    "fmla v23.8h, v7.8h, v12.8h\n"
+    "ldr q1, [x5, #0x20]\n"
+    "fmla v22.8h, v6.8h, v12.8h\n"
+    "ldr q6, [x5, #0x70]\n"
+    "fmla v19.8h, v4.8h, v12.8h\n"
+    "fmla v18.8h, v3.8h, v12.8h\n"
+    "ldr q12, [x14, x9]\n"
+    "fmla v21.8h, v8.8h, v10.8h\n"
+    "ldr q3, [x5, #0x40]\n"
+    "fmla v20.8h, v7.8h, v10.8h\n"
+    "ldr q7, [x5, #0x80]\n"
+    "fmla v17.8h, v5.8h, v10.8h\n"
+    "ldr q5, [x5, #0x60]\n"
+    "fmla v16.8h, v4.8h, v10.8h\n"
+    "ld1 { v10.8h }, [x8]\n"
+    "fmax v31.8h, v31.8h, v15.8h\n"
+    "ldr q4, [x5, #0x50]\n"
+    "fmax v30.8h, v30.8h, v15.8h\n"
+    "ldr q8, [x5, #0x90]\n"
+    "add x5, x5, #0xa0\n"
+    "fmin v31.8h, v31.8h, v14.8h\n"
+    "st1 { v31.8h }, [x16]\n"
+    "fmin v30.8h, v30.8h, v14.8h\n"
+    "fmax v29.8h, v29.8h, v15.8h\n"
+    "str q30, [x16, x17]\n"
+    "fmin v29.8h, v29.8h, v14.8h\n"
+    "fmax v28.8h, v28.8h, v15.8h\n"
+    "str q29, [x16, x23]\n"
+    "fmin v28.8h, v28.8h, v14.8h\n"
+    "fmax v27.8h, v27.8h, v15.8h\n"
+    "str q28, [x16, x22]\n"
+    "fmin v27.8h, v27.8h, v14.8h\n"
+    "add x16, x16, #0x10\n"
+    "fmax v26.8h, v26.8h, v15.8h\n"
+    "st1 { v27.8h }, [x26]\n"
+    "fmax v25.8h, v25.8h, v15.8h\n"
+    "fmax v24.8h, v24.8h, v15.8h\n"
+    "fmin v26.8h, v26.8h, v14.8h\n"
+    "str q26, [x26, x17]\n"
+    "fmin v25.8h, v25.8h, v14.8h\n"
+    "fmin v24.8h, v24.8h, v14.8h\n"
+    "str q25, [x26, x23]\n"
+    "fmax v23.8h, v23.8h, v15.8h\n"
+    "fmax v22.8h, v22.8h, v15.8h\n"
+    "str q24, [x26, x22]\n"
+    "add x26, x26, #0x10\n"
+    "fmax v21.8h, v21.8h, v15.8h\n"
+    "fmax v20.8h, v20.8h, v15.8h\n"
+    "fmin v23.8h, v23.8h, v14.8h\n"
+    "st1 { v23.8h }, [x25]\n"
+    "fmin v22.8h, v22.8h, v14.8h\n"
+    "fmin v21.8h, v21.8h, v14.8h\n"
+    "str q22, [x25, x17]\n"
+    "fmin v20.8h, v20.8h, v14.8h\n"
+    "fmax v19.8h, v19.8h, v15.8h\n"
+    "str q21, [x25, x23]\n"
+    "fmax v18.8h, v18.8h, v15.8h\n"
+    "str q20, [x25, x22]\n"
+    "fmin v19.8h, v19.8h, v14.8h\n"
+    "add x25, x25, #0x10\n"
+    "fmin v18.8h, v18.8h, v14.8h\n"
+    "st1 { v19.8h }, [x24]\n"
+    "fmax v17.8h, v17.8h, v15.8h\n"
+    "fmax v16.8h, v16.8h, v15.8h\n"
+    "str q18, [x24, x17]\n"
+    "fmin v17.8h, v17.8h, v14.8h\n"
+    "str q17, [x24, x23]\n"
+    "fmin v16.8h, v16.8h, v14.8h\n"
+    "str q16, [x24, x22]\n"
+    "add x24, x24, #0x10\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+    "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+    "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+    "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+    "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+    "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+    "ldr q9, [x13, x10]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ld1 { v10.8h }, [x11]\n"
+    "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x11, x27]\n"
+    "fmla v30.8h, v8.8h, v12.8h\n"
+    "fmla v29.8h, v7.8h, v12.8h\n"
+    "fmla v26.8h, v5.8h, v12.8h\n"
+    "fmla v28.8h, v6.8h, v12.8h\n"
+    "fmla v25.8h, v4.8h, v12.8h\n"
+    "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+    "fmla v22.8h, v2.8h, v12.8h\n"
+    "fmla v21.8h, v1.8h, v12.8h\n"
+    "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x8, x6]\n"
+    "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+    "ldr q10, [x13, x9]\n"
+    "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+    "ldr q11, [x8, x28]\n"
+    "fmla v27.8h, v8.8h, v9.8h\n"
+    "fmla v26.8h, v7.8h, v9.8h\n"
+    "fmla v25.8h, v6.8h, v9.8h\n"
+    "fmla v23.8h, v5.8h, v9.8h\n"
+    "fmla v22.8h, v4.8h, v9.8h\n"
+    "fmla v21.8h, v3.8h, v9.8h\n"
+    "fmla v19.8h, v2.8h, v9.8h\n"
+    "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+    "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+    "ld1 { v9.8h }, [x15]\n"
+    "fmla v31.8h, v1.8h, v12.8h\n"
+    "fmla v30.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x15, x27]\n"
+    "fmla v29.8h, v2.8h, v11.8h\n"
+    "fmla v28.8h, v1.8h, v11.8h\n"
+    "ld1 { v11.8h }, [x12]\n"
+    "fmla v26.8h, v8.8h, v10.8h\n"
+    "fmla v25.8h, v7.8h, v10.8h\n"
+    "fmla v24.8h, v6.8h, v10.8h\n"
+    "fmla v22.8h, v5.8h, v10.8h\n"
+    "fmla v21.8h, v4.8h, v10.8h\n"
+    "fmla v20.8h, v3.8h, v10.8h\n"
+    "fmla v18.8h, v2.8h, v10.8h\n"
+    "fmla v17.8h, v1.8h, v10.8h\n"
+    "fmla v16.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x15, x10]\n"
+    "fmla v31.8h, v3.8h, v9.8h\n"
+    "fmla v27.8h, v0.8h, v9.8h\n"
+    "fmla v28.8h, v5.8h, v12.8h\n"
+    "fmla v24.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x15, x9]\n"
+    "fmla v23.8h, v6.8h, v11.8h\n"
+    "fmla v19.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x12, x27]\n"
+    "fmla v31.8h, v5.8h, v10.8h\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v3.8h, v10.8h\n"
+    "fmla v27.8h, v2.8h, v10.8h\n"
+    "fmla v26.8h, v1.8h, v10.8h\n"
+    "fmla v25.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x14, x6]\n"
+    "fmla v20.8h, v8.8h, v11.8h\n"
+    "fmla v16.8h, v5.8h, v11.8h\n"
+    "ldr q11, [x11, x6]\n"
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "fmla v26.8h, v2.8h, v12.8h\n"
+    "fmla v25.8h, v1.8h, v12.8h\n"
+    "fmla v24.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x14, x28]\n"
+    "fmla v19.8h, v7.8h, v11.8h\n"
+    "fmla v18.8h, v6.8h, v11.8h\n"
+    "ldr q11, [x11, x28]\n"
+    "fmla v31.8h, v7.8h, v10.8h\n"
+    "fmla v30.8h, v6.8h, v10.8h\n"
+    "fmla v27.8h, v4.8h, v10.8h\n"
+    "fmla v26.8h, v3.8h, v10.8h\n"
+    "fmla v23.8h, v1.8h, v10.8h\n"
+    "fmla v22.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x8, x10]\n"
+    "fmla v17.8h, v8.8h, v11.8h\n"
+    "fmla v16.8h, v7.8h, v11.8h\n"
+    "ldr q11, [x13, x6]\n"
+    "fmla v29.8h, v8.8h, v12.8h\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmla v25.8h, v5.8h, v12.8h\n"
+    "fmla v24.8h, v4.8h, v12.8h\n"
+    "fmla v21.8h, v2.8h, v12.8h\n"
+    "fmla v20.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x8, x9]\n"
+    "add x8, x8, #0x10\n"
+    "fmla v31.8h, v2.8h, v10.8h\n"
+    "fmla v30.8h, v1.8h, v10.8h\n"
+    "fmla v29.8h, v0.8h, v10.8h\n"
+    "ld1 { v10.8h }, [x14]\n"
+    "fmla v27.8h, v7.8h, v11.8h\n"
+    "fmla v26.8h, v6.8h, v11.8h\n"
+    "fmla v23.8h, v4.8h, v11.8h\n"
+    "fmla v22.8h, v3.8h, v11.8h\n"
+    "fmla v19.8h, v1.8h, v11.8h\n"
+    "fmla v18.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x13, x28]\n"
+    "fmla v30.8h, v2.8h, v12.8h\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "fmla v28.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x14, x27]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.8h, v6.8h, v10.8h\n"
+    "fmla v27.8h, v3.8h, v10.8h\n"
+    "fmla v23.8h, v0.8h, v10.8h\n"
+    "ld1 { v10.8h }, [x13]\n"
+    "fmla v25.8h, v8.8h, v11.8h\n"
+    "fmla v24.8h, v7.8h, v11.8h\n"
+    "fmla v21.8h, v5.8h, v11.8h\n"
+    "fmla v20.8h, v4.8h, v11.8h\n"
+    "fmla v17.8h, v2.8h, v11.8h\n"
+    "fmla v16.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x12, x10]\n"
+    "fmla v28.8h, v8.8h, v12.8h\n"
+    "fmla v24.8h, v5.8h, v12.8h\n"
+    "fmla v20.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x13, x27]\n"
+    "add x13, x13, #0x10\n"
+    "fmla v27.8h, v6.8h, v10.8h\n"
+    "fmla v23.8h, v3.8h, v10.8h\n"
+    "fmla v19.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x11, x10]\n"
+    "fmla v22.8h, v7.8h, v11.8h\n"
+    "fmla v21.8h, v6.8h, v11.8h\n"
+    "fmla v23.8h, v8.8h, v11.8h\n"
+    "fmla v19.8h, v5.8h, v11.8h\n"
+    "fmla v18.8h, v4.8h, v11.8h\n"
+    "fmla v17.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x12, x9]\n"
+    "fmla v24.8h, v8.8h, v12.8h\n"
+    "fmla v20.8h, v5.8h, v12.8h\n"
+    "fmla v16.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x11, x9]\n"
+    "add x11, x11, #0x10\n"
+    "fmla v19.8h, v8.8h, v10.8h\n"
+    "fmla v18.8h, v7.8h, v10.8h\n"
+    "fmla v17.8h, v6.8h, v10.8h\n"
+    "ldr q10, [x15, x6]\n"
+    "fmla v22.8h, v8.8h, v11.8h\n"
+    "fmla v21.8h, v7.8h, v11.8h\n"
+    "fmla v20.8h, v6.8h, v11.8h\n"
+    "fmla v18.8h, v5.8h, v11.8h\n"
+    "fmla v17.8h, v4.8h, v11.8h\n"
+    "fmla v16.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x15, x28]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v18.8h, v8.8h, v12.8h\n"
+    "fmla v31.8h, v4.8h, v10.8h\n"
+    "fmla v17.8h, v7.8h, v12.8h\n"
+    "fmla v16.8h, v6.8h, v12.8h\n"
+    "ldr q12, [x12, x6]\n"
+    "fmla v30.8h, v3.8h, v10.8h\n"
+    "fmla v27.8h, v1.8h, v10.8h\n"
+    "fmla v26.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x12, x28]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v29.8h, v5.8h, v11.8h\n"
+    "fmla v28.8h, v4.8h, v11.8h\n"
+    "fmla v25.8h, v2.8h, v11.8h\n"
+    "fmla v24.8h, v1.8h, v11.8h\n"
+    "fmla v23.8h, v7.8h, v12.8h\n"
+    "fmla v22.8h, v6.8h, v12.8h\n"
+    "fmla v19.8h, v4.8h, v12.8h\n"
+    "fmla v18.8h, v3.8h, v12.8h\n"
+    "fmla v21.8h, v8.8h, v10.8h\n"
+    "fmla v20.8h, v7.8h, v10.8h\n"
+    "fmla v17.8h, v5.8h, v10.8h\n"
+    "fmla v16.8h, v4.8h, v10.8h\n"
+    "fmax v31.8h, v31.8h, v15.8h\n"
+    "fmax v30.8h, v30.8h, v15.8h\n"
+    "fmax v29.8h, v29.8h, v15.8h\n"
+    "fmin v31.8h, v31.8h, v14.8h\n"
+    "st1 { v31.8h }, [x16]\n"
+    "fmin v30.8h, v30.8h, v14.8h\n"
+    "fmin v29.8h, v29.8h, v14.8h\n"
+    "str q30, [x16, x17]\n"
+    "fmax v28.8h, v28.8h, v15.8h\n"
+    "fmax v27.8h, v27.8h, v15.8h\n"
+    "str q29, [x16, x23]\n"
+    "fmax v26.8h, v26.8h, v15.8h\n"
+    "fmax v25.8h, v25.8h, v15.8h\n"
+    "fmin v28.8h, v28.8h, v14.8h\n"
+    "str q28, [x16, x22]\n"
+    "fmin v27.8h, v27.8h, v14.8h\n"
+    "add x16, x16, #0x10\n"
+    "fmin v26.8h, v26.8h, v14.8h\n"
+    "st1 { v27.8h }, [x26]\n"
+    "fmin v25.8h, v25.8h, v14.8h\n"
+    "fmax v24.8h, v24.8h, v15.8h\n"
+    "str q26, [x26, x17]\n"
+    "fmax v23.8h, v23.8h, v15.8h\n"
+    "str q25, [x26, x23]\n"
+    "fmin v24.8h, v24.8h, v14.8h\n"
+    "fmax v22.8h, v22.8h, v15.8h\n"
+    "str q24, [x26, x22]\n"
+    "fmin v23.8h, v23.8h, v14.8h\n"
+    "add x26, x26, #0x10\n"
+    "fmin v22.8h, v22.8h, v14.8h\n"
+    "st1 { v23.8h }, [x25]\n"
+    "fmax v21.8h, v21.8h, v15.8h\n"
+    "fmax v20.8h, v20.8h, v15.8h\n"
+    "str q22, [x25, x17]\n"
+    "fmax v19.8h, v19.8h, v15.8h\n"
+    "fmax v18.8h, v18.8h, v15.8h\n"
+    "fmin v21.8h, v21.8h, v14.8h\n"
+    "str q21, [x25, x23]\n"
+    "fmin v20.8h, v20.8h, v14.8h\n"
+    "fmin v19.8h, v19.8h, v14.8h\n"
+    "str q20, [x25, x22]\n"
+    "fmin v18.8h, v18.8h, v14.8h\n"
+    "add x25, x25, #0x10\n"
+    "fmax v17.8h, v17.8h, v15.8h\n"
+    "st1 { v19.8h }, [x24]\n"
+    "fmax v16.8h, v16.8h, v15.8h\n"
+    "str q18, [x24, x17]\n"
+    "fmin v17.8h, v17.8h, v14.8h\n"
+    "str q17, [x24, x23]\n"
+    "fmin v16.8h, v16.8h, v14.8h\n"
+    "str q16, [x24, x22]\n"
+    "add x24, x24, #0x10\n"
+    "4:"  // Tile loop: Oddments
+    "tst %x[n_channels], #0x1\n"
+    "beq 73f\n"
+    "ldr q13, [x5, #0x0]\n"
+    "ldr q0, [x5, #0x10]\n"
+    "add x22, x14, x10\n"
+    "ldr q1, [x5, #0x20]\n"
+    "add x21, x8, XZR\n"
+    "ldr q2, [x5, #0x30]\n"
+    "add x20, x8, x27\n"
+    "ldr q3, [x5, #0x40]\n"
+    "add x19, x14, x9\n"
+    "ldr q4, [x5, #0x50]\n"
+    "ldr q5, [x5, #0x60]\n"
+    "ldr q6, [x5, #0x70]\n"
+    "ldr q7, [x5, #0x80]\n"
+    "ldr q8, [x5, #0x90]\n"
+    "tbz %x[n_channels], #1, 5f\n"
+    "ldr s9, [x22], #0x4\n"
+    "ldr s10, [x21], #0x4\n"
+    "ldr s11, [x20], #0x4\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v9.h }[2], [x22]\n"
+    "ld1 { v10.h }[2], [x21]\n"
+    "ld1 { v11.h }[2], [x20]\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 6f\n"
+    "5:"  // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+    "ldr h9, [x22, #0x0]\n"
+    "ldr h10, [x21, #0x0]\n"
+    "ldr h11, [x20, #0x0]\n"
+    "ldr h12, [x19, #0x0]\n"
+    "6:"  // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+    "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "add x19, x11, XZR\n"
+    "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+    "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+    "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+    "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+    "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+    "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+    "fmla v30.8h, v8.8h, v12.8h\n"
+    "fmla v29.8h, v7.8h, v12.8h\n"
+    "fmla v26.8h, v5.8h, v12.8h\n"
+    "fmla v28.8h, v6.8h, v12.8h\n"
+    "fmla v25.8h, v4.8h, v12.8h\n"
+    "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+    "fmla v22.8h, v2.8h, v12.8h\n"
+    "fmla v21.8h, v1.8h, v12.8h\n"
+    "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 8f\n"
+    "7:"  // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "8:"  // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+    "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+    "add x19, x11, x27\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 10f\n"
+    "9:"  // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "10:"  // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+    "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+    "add x19, x13, x10\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ldr s9, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 12f\n"
+    "ld1 { v9.h }[2], [x19]\n"
+    "b 12f\n"
+    "11:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+    "ldr h9, [x19, #0x0]\n"
+    "12:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+    "fmla v27.8h, v8.8h, v9.8h\n"
+    "add x19, x8, x6\n"
+    "fmla v26.8h, v7.8h, v9.8h\n"
+    "fmla v25.8h, v6.8h, v9.8h\n"
+    "fmla v23.8h, v5.8h, v9.8h\n"
+    "fmla v22.8h, v4.8h, v9.8h\n"
+    "fmla v21.8h, v3.8h, v9.8h\n"
+    "fmla v19.8h, v2.8h, v9.8h\n"
+    "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+    "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 14f\n"
+    "13:"  // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "14:"  // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+    "fmla v31.8h, v1.8h, v12.8h\n"
+    "add x19, x8, x28\n"
+    "fmla v30.8h, v0.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 16f\n"
+    "15:"  // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "16:"  // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: End
+    "fmla v29.8h, v2.8h, v11.8h\n"
+    "add x19, x13, x9\n"
+    "fmla v28.8h, v1.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 17f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 18f\n"
+    "17:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "18:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+    "fmla v26.8h, v8.8h, v10.8h\n"
+    "add x19, x15, XZR\n"
+    "fmla v25.8h, v7.8h, v10.8h\n"
+    "fmla v24.8h, v6.8h, v10.8h\n"
+    "fmla v22.8h, v5.8h, v10.8h\n"
+    "fmla v21.8h, v4.8h, v10.8h\n"
+    "fmla v20.8h, v3.8h, v10.8h\n"
+    "fmla v18.8h, v2.8h, v10.8h\n"
+    "fmla v17.8h, v1.8h, v10.8h\n"
+    "fmla v16.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 19f\n"
+    "ldr s9, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v9.h }[2], [x19]\n"
+    "b 20f\n"
+    "19:"  // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+    "ldr h9, [x19, #0x0]\n"
+    "20:"  // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+    "fmla v31.8h, v3.8h, v9.8h\n"
+    "add x19, x15, x27\n"
+    "fmla v27.8h, v0.8h, v9.8h\n"
+    "tbz %x[n_channels], #1, 21f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 22f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 22f\n"
+    "21:"  // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "22:"  // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: End
+    "fmla v28.8h, v5.8h, v12.8h\n"
+    "add x19, x12, XZR\n"
+    "fmla v24.8h, v2.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 23f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 24f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 24f\n"
+    "23:"  // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "24:"  // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+    "fmla v23.8h, v6.8h, v11.8h\n"
+    "add x19, x15, x10\n"
+    "fmla v19.8h, v3.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 25f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 26f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 26f\n"
+    "25:"  // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "26:"  // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+    "fmla v31.8h, v5.8h, v10.8h\n"
+    "add x19, x12, x27\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v3.8h, v10.8h\n"
+    "fmla v27.8h, v2.8h, v10.8h\n"
+    "fmla v26.8h, v1.8h, v10.8h\n"
+    "fmla v25.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 27f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 28f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 28f\n"
+    "27:"  // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "28:"  // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+    "fmla v20.8h, v8.8h, v11.8h\n"
+    "add x19, x15, x9\n"
+    "fmla v16.8h, v5.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 29f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 30f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 30f\n"
+    "29:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "30:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "add x19, x11, x6\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "fmla v26.8h, v2.8h, v12.8h\n"
+    "fmla v25.8h, v1.8h, v12.8h\n"
+    "fmla v24.8h, v0.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 31f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 32f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 32f\n"
+    "31:"  // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "32:"  // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+    "fmla v19.8h, v7.8h, v11.8h\n"
+    "add x19, x14, x6\n"
+    "fmla v18.8h, v6.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 33f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 34f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 34f\n"
+    "33:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "34:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+    "fmla v31.8h, v7.8h, v10.8h\n"
+    "add x19, x11, x28\n"
+    "fmla v30.8h, v6.8h, v10.8h\n"
+    "fmla v27.8h, v4.8h, v10.8h\n"
+    "fmla v26.8h, v3.8h, v10.8h\n"
+    "fmla v23.8h, v1.8h, v10.8h\n"
+    "fmla v22.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 35f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 36f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 36f\n"
+    "35:"  // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "36:"  // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+    "fmla v17.8h, v8.8h, v11.8h\n"
+    "add x19, x14, x28\n"
+    "fmla v16.8h, v7.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 37f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 38f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 38f\n"
+    "37:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "38:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+    "fmla v29.8h, v8.8h, v12.8h\n"
+    "add x19, x8, x10\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmla v25.8h, v5.8h, v12.8h\n"
+    "fmla v24.8h, v4.8h, v12.8h\n"
+    "fmla v21.8h, v2.8h, v12.8h\n"
+    "fmla v20.8h, v1.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 39f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 40f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 40f\n"
+    "39:"  // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "40:"  // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+    "fmla v31.8h, v2.8h, v10.8h\n"
+    "add x19, x13, x6\n"
+    "fmla v30.8h, v1.8h, v10.8h\n"
+    "fmla v29.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 41f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 42f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 42f\n"
+    "41:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "42:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+    "fmla v27.8h, v7.8h, v11.8h\n"
+    "add x19, x8, x9\n"
+    "fmla v26.8h, v6.8h, v11.8h\n"
+    "fmla v23.8h, v4.8h, v11.8h\n"
+    "fmla v22.8h, v3.8h, v11.8h\n"
+    "fmla v19.8h, v1.8h, v11.8h\n"
+    "fmla v18.8h, v0.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 43f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 44f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 44f\n"
+    "43:"  // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "44:"  // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+    "fmla v30.8h, v2.8h, v12.8h\n"
+    "add x19, x14, XZR\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "fmla v28.8h, v0.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 45f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 46f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 46f\n"
+    "45:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "46:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+    "fmla v31.8h, v6.8h, v10.8h\n"
+    "add x19, x13, x28\n"
+    "fmla v27.8h, v3.8h, v10.8h\n"
+    "fmla v23.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 47f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 48f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 48f\n"
+    "47:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "48:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+    "fmla v25.8h, v8.8h, v11.8h\n"
+    "add x19, x14, x27\n"
+    "fmla v24.8h, v7.8h, v11.8h\n"
+    "fmla v21.8h, v5.8h, v11.8h\n"
+    "fmla v20.8h, v4.8h, v11.8h\n"
+    "fmla v17.8h, v2.8h, v11.8h\n"
+    "fmla v16.8h, v1.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 49f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 50f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 50f\n"
+    "49:"  // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "50:"  // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+    "fmla v28.8h, v8.8h, v12.8h\n"
+    "add x19, x13, XZR\n"
+    "fmla v24.8h, v5.8h, v12.8h\n"
+    "fmla v20.8h, v2.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 51f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 52f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 52f\n"
+    "51:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "52:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+    "fmla v27.8h, v6.8h, v10.8h\n"
+    "add x19, x12, x10\n"
+    "fmla v23.8h, v3.8h, v10.8h\n"
+    "fmla v19.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 53f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 54f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 54f\n"
+    "53:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "54:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+    "fmla v23.8h, v8.8h, v11.8h\n"
+    "add x19, x13, x27\n"
+    "fmla v22.8h, v7.8h, v11.8h\n"
+    "fmla v21.8h, v6.8h, v11.8h\n"
+    "fmla v19.8h, v5.8h, v11.8h\n"
+    "fmla v18.8h, v4.8h, v11.8h\n"
+    "fmla v17.8h, v3.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 55f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 56f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 56f\n"
+    "55:"  // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "56:"  // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+    "fmla v24.8h, v8.8h, v12.8h\n"
+    "add x19, x11, x10\n"
+    "fmla v20.8h, v5.8h, v12.8h\n"
+    "fmla v16.8h, v2.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 57f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 58f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 58f\n"
+    "57:"  // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "58:"  // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+    "fmla v19.8h, v8.8h, v10.8h\n"
+    "add x19, x12, x9\n"
+    "fmla v18.8h, v7.8h, v10.8h\n"
+    "fmla v17.8h, v6.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 59f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 60f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 60f\n"
+    "59:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "60:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+    "fmla v22.8h, v8.8h, v11.8h\n"
+    "add x19, x11, x9\n"
+    "fmla v21.8h, v7.8h, v11.8h\n"
+    "fmla v20.8h, v6.8h, v11.8h\n"
+    "fmla v18.8h, v5.8h, v11.8h\n"
+    "fmla v17.8h, v4.8h, v11.8h\n"
+    "fmla v16.8h, v3.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 61f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 62f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 62f\n"
+    "61:"  // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "62:"  // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+    "fmla v18.8h, v8.8h, v12.8h\n"
+    "add x19, x15, x6\n"
+    "fmla v17.8h, v7.8h, v12.8h\n"
+    "fmla v16.8h, v6.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 63f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 64f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 64f\n"
+    "63:"  // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "64:"  // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+    "fmla v31.8h, v4.8h, v10.8h\n"
+    "add x19, x15, x28\n"
+    "fmla v30.8h, v3.8h, v10.8h\n"
+    "fmla v27.8h, v1.8h, v10.8h\n"
+    "fmla v26.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 65f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 66f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 66f\n"
+    "65:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "66:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+    "fmla v29.8h, v5.8h, v11.8h\n"
+    "add x19, x12, x6\n"
+    "fmla v28.8h, v4.8h, v11.8h\n"
+    "fmla v25.8h, v2.8h, v11.8h\n"
+    "fmla v24.8h, v1.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 67f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 68f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 68f\n"
+    "67:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "68:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+    "fmla v23.8h, v7.8h, v12.8h\n"
+    "add x19, x12, x28\n"
+    "fmla v22.8h, v6.8h, v12.8h\n"
+    "fmla v19.8h, v4.8h, v12.8h\n"
+    "fmla v18.8h, v3.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 69f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 70f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 70f\n"
+    "69:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "70:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+    "fmla v21.8h, v8.8h, v10.8h\n"
+    "fmla v20.8h, v7.8h, v10.8h\n"
+    "fmla v17.8h, v5.8h, v10.8h\n"
+    "fmla v16.8h, v4.8h, v10.8h\n"
+    "fmax v31.8h, v31.8h, v15.8h\n"
+    "fmax v30.8h, v30.8h, v15.8h\n"
+    "fmax v29.8h, v29.8h, v15.8h\n"
+    "fmin v31.8h, v31.8h, v14.8h\n"
+    "fmin v30.8h, v30.8h, v14.8h\n"
+    "fmin v29.8h, v29.8h, v14.8h\n"
+    "fmax v28.8h, v28.8h, v15.8h\n"
+    "fmax v27.8h, v27.8h, v15.8h\n"
+    "fmax v26.8h, v26.8h, v15.8h\n"
+    "fmin v28.8h, v28.8h, v14.8h\n"
+    "fmin v27.8h, v27.8h, v14.8h\n"
+    "fmin v26.8h, v26.8h, v14.8h\n"
+    "fmax v25.8h, v25.8h, v15.8h\n"
+    "fmax v24.8h, v24.8h, v15.8h\n"
+    "fmax v23.8h, v23.8h, v15.8h\n"
+    "fmin v25.8h, v25.8h, v14.8h\n"
+    "fmin v24.8h, v24.8h, v14.8h\n"
+    "fmin v23.8h, v23.8h, v14.8h\n"
+    "fmax v22.8h, v22.8h, v15.8h\n"
+    "fmax v21.8h, v21.8h, v15.8h\n"
+    "fmax v20.8h, v20.8h, v15.8h\n"
+    "fmin v22.8h, v22.8h, v14.8h\n"
+    "fmin v21.8h, v21.8h, v14.8h\n"
+    "fmin v20.8h, v20.8h, v14.8h\n"
+    "fmax v19.8h, v19.8h, v15.8h\n"
+    "fmax v18.8h, v18.8h, v15.8h\n"
+    "fmax v17.8h, v17.8h, v15.8h\n"
+    "fmin v19.8h, v19.8h, v14.8h\n"
+    "fmin v18.8h, v18.8h, v14.8h\n"
+    "fmin v17.8h, v17.8h, v14.8h\n"
+    "fmax v16.8h, v16.8h, v15.8h\n"
+    "fmin v16.8h, v16.8h, v14.8h\n"
+    "tbz %x[n_channels], #1, 71f\n"
+    "mov x19, x16\n"
+    "st1 { v31.s }[0], [x19], x17\n"
+    "add x16, x16, #0x4\n"
+    "st1 { v30.s }[0], [x19], x17\n"
+    "mov x21, x26\n"
+    "st1 { v29.s }[0], [x19], x17\n"
+    "st1 { v27.s }[0], [x21], x17\n"
+    "add x26, x26, #0x4\n"
+    "st1 { v28.s }[0], [x19]\n"
+    "mov x20, x25\n"
+    "st1 { v26.s }[0], [x21], x17\n"
+    "add x25, x25, #0x4\n"
+    "st1 { v25.s }[0], [x21], x17\n"
+    "mov x19, x24\n"
+    "st1 { v24.s }[0], [x21]\n"
+    "add x24, x24, #0x4\n"
+    "st1 { v23.s }[0], [x20], x17\n"
+    "st1 { v22.s }[0], [x20], x17\n"
+    "st1 { v21.s }[0], [x20], x17\n"
+    "st1 { v20.s }[0], [x20]\n"
+    "st1 { v19.s }[0], [x19], x17\n"
+    "st1 { v18.s }[0], [x19], x17\n"
+    "st1 { v17.s }[0], [x19], x17\n"
+    "st1 { v16.s }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 72f\n"
+    "mov x22, x16\n"
+    "st1 { v31.h }[2], [x22], x17\n"
+    "mov x21, x26\n"
+    "st1 { v30.h }[2], [x22], x17\n"
+    "st1 { v27.h }[2], [x21], x17\n"
+    "mov x20, x25\n"
+    "st1 { v29.h }[2], [x22], x17\n"
+    "mov x19, x24\n"
+    "st1 { v28.h }[2], [x22]\n"
+    "st1 { v26.h }[2], [x21], x17\n"
+    "st1 { v25.h }[2], [x21], x17\n"
+    "st1 { v24.h }[2], [x21]\n"
+    "st1 { v23.h }[2], [x20], x17\n"
+    "st1 { v22.h }[2], [x20], x17\n"
+    "st1 { v21.h }[2], [x20], x17\n"
+    "st1 { v20.h }[2], [x20]\n"
+    "st1 { v19.h }[2], [x19], x17\n"
+    "st1 { v18.h }[2], [x19], x17\n"
+    "st1 { v17.h }[2], [x19], x17\n"
+    "st1 { v16.h }[2], [x19]\n"
+    "b 72f\n"
+    "71:"  // Tile loop: Oddments: Store: Bit 1: Unset
+    "mov x22, x16\n"
+    "st1 { v31.h }[0], [x22], x17\n"
+    "mov x21, x26\n"
+    "mov x20, x25\n"
+    "st1 { v30.h }[0], [x22], x17\n"
+    "st1 { v27.h }[0], [x21], x17\n"
+    "mov x19, x24\n"
+    "st1 { v29.h }[0], [x22], x17\n"
+    "st1 { v28.h }[0], [x22]\n"
+    "st1 { v26.h }[0], [x21], x17\n"
+    "st1 { v25.h }[0], [x21], x17\n"
+    "st1 { v24.h }[0], [x21]\n"
+    "st1 { v23.h }[0], [x20], x17\n"
+    "st1 { v22.h }[0], [x20], x17\n"
+    "st1 { v21.h }[0], [x20], x17\n"
+    "st1 { v20.h }[0], [x20]\n"
+    "st1 { v19.h }[0], [x19], x17\n"
+    "st1 { v18.h }[0], [x19], x17\n"
+    "st1 { v17.h }[0], [x19], x17\n"
+    "st1 { v16.h }[0], [x19]\n"
+    "72:"  // Tile loop: Oddments: Store: Bit 1: End
+
+    "73:"  // Tile loop: End
+    "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x4, #0x1\n"
+    "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x26, x26, #0x1\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x26, x19\n"
+    "csel x26, x26, XZR, LT\n"
+    "csel x4, x4, x21, LT\n"
+    "cmp x4, x20\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
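+
+// A minimal sketch (plain C++ in a comment, not generated code) of the tile
+// addressing performed above, mirroring the inline assembly comments; the
+// LSL #1 steps are the `* sizeof(__fp16)` scaling:
+//
+//   size_t off = tile_i * ld_input_row      // offset  = tile_i * ld_input_row
+//              + tile_j * ld_input_col;     // offset += tile_j * ld_input_col
+//   off *= stride_scale;                    // offset *= kernel_stride * output_size
+//   const __fp16 *in = inptr + off;         // inptr[0] += offset * sizeof(__fp16)
+//
+// The output pointer is advanced the same way from ld_output_row and
+// ld_output_col. The "Oddments" path (labels 4 onwards) covers the
+// sub-vector channel tail: `tbz` on the low bits of n_channels selects a
+// 32-bit (two-__fp16) load/store and/or a single __fp16 lane, so the kernel
+// never reads or writes a full vector width past the end of the channels.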
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..40c019a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1399 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[36];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[14];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[5];
+      inptrs[3] = input_ptrs[15];
+      inptrs[4] = input_ptrs[30];
+      inptrs[5] = input_ptrs[35];
+      inptrs[6] = input_ptrs[20];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[4];
+      inptrs[9] = input_ptrs[21];
+      inptrs[10] = input_ptrs[6];
+      inptrs[11] = input_ptrs[11];
+      inptrs[12] = input_ptrs[24];
+      inptrs[13] = input_ptrs[8];
+      inptrs[14] = input_ptrs[29];
+      inptrs[15] = input_ptrs[9];
+      inptrs[16] = input_ptrs[31];
+      inptrs[17] = input_ptrs[13];
+      inptrs[18] = input_ptrs[34];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[19];
+      inptrs[22] = input_ptrs[3];
+      inptrs[23] = input_ptrs[12];
+      inptrs[24] = input_ptrs[22];
+      inptrs[25] = input_ptrs[17];
+      inptrs[26] = input_ptrs[18];
+      inptrs[27] = input_ptrs[26];
+      inptrs[28] = input_ptrs[23];
+      inptrs[29] = input_ptrs[32];
+      inptrs[30] = input_ptrs[27];
+      inptrs[31] = input_ptrs[33];
+      inptrs[32] = input_ptrs[7];
+      inptrs[33] = input_ptrs[10];
+      inptrs[34] = input_ptrs[25];
+      inptrs[35] = input_ptrs[28];
+
+    }
+  };
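+
+  // The table above re-orders the caller's row-major 6x6 patch of input
+  // pointers (index = row * 6 + col; 6x6 being the 4x4 output tile plus the
+  // 3x3 kernel halo) into the sequence the kernel consumes them in:
+  // inptrs[0] = input_ptrs[14] is point (2, 2), inptrs[1] is (0, 0),
+  // inptrs[2] is (0, 5) and inptrs[3] is (2, 3), matching the
+  // (2, 2), (0, 0), (0, 5), (2, 3) load order of the tiled variant above.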
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
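+  // As in the tiled variant, the activation bounds are broadcast once with
+  // ld1r (min into v15, max into v14) and each accumulator is clamped with an
+  // fmax/fmin pair before being stored; the scalar equivalent is, roughly,
+  // out = std::min(std::max(acc, activation_min), activation_max).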
+  __asm__ __volatile__(
+    "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "add x19, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v15.8h }, [x20]\n"
+    "ld1r { v14.8h }, [x19]\n"
+    "mov x14, #0x0\n"
+    "mov x13, #0x10\n" // cntb _, ALL, #1
+    "sub x12, XZR, x13\n"
+    "lsr x11, %x[n_channels], #0x3\n"
+    "cbz x11, 3f\n"
+    "ldr q13, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "cmp x13, x11, LSL #4\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "ldp x10, x9, [x16, #0x0]\n"
+    "ldp x28, x27, [x16, #0x10]\n"
+    "ldr q9, [x10, x14]\n"
+    "ldr q10, [x9, x14]\n"
+    "ldr q11, [x28, x14]\n"
+    "ldr q12, [x27, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "ldr x26, [x16, #0x20]\n"
+    "add x12, x12, #0x10\n"
+    "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "ldr x25, [x16, #0x28]\n"
+    "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "ldr x24, [x16, #0x30]\n"
+    "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+    "ldr x23, [x16, #0x38]\n"
+    "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+    "ldr x10, [x16, #0x40]\n"
+    "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+    "ldr x9, [x16, #0x48]\n"
+    "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+    "ldr x28, [x16, #0x50]\n"
+    "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+    "ldr x27, [x16, #0x58]\n"
+    "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+    "ldr q9, [x24, x14]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x26, x14]\n"
+    "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x25, x14]\n"
+    "fmla v30.8h, v8.8h, v12.8h\n"
+    "ldr x26, [x16, #0x60]\n"
+    "fmla v29.8h, v7.8h, v12.8h\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla v26.8h, v5.8h, v12.8h\n"
+    "ldr x24, [x16, #0x70]\n"
+    "fmla v28.8h, v6.8h, v12.8h\n"
+    "ldr x22, [x17, #0x0]\n"
+    "fmla v25.8h, v4.8h, v12.8h\n"
+    "ldr x21, [x17, #0x8]\n"
+    "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+    "ldr x20, [x17, #0x10]\n"
+    "fmla v22.8h, v2.8h, v12.8h\n"
+    "ldr x19, [x17, #0x18]\n"
+    "fmla v21.8h, v1.8h, v12.8h\n"
+    "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x23, x14]\n"
+    "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+    "ldr q10, [x9, x14]\n"
+    "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+    "ldr q11, [x10, x14]\n"
+    "fmla v27.8h, v8.8h, v9.8h\n"
+    "ldr x23, [x16, #0x78]\n"
+    "fmla v26.8h, v7.8h, v9.8h\n"
+    "ldr x10, [x16, #0x80]\n"
+    "fmla v25.8h, v6.8h, v9.8h\n"
+    "ldr x9, [x16, #0x88]\n"
+    "fmla v23.8h, v5.8h, v9.8h\n"
+    "fmla v22.8h, v4.8h, v9.8h\n"
+    "fmla v21.8h, v3.8h, v9.8h\n"
+    "fmla v19.8h, v2.8h, v9.8h\n"
+    "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+    "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+    "ldr q9, [x28, x14]\n"
+    "fmla v31.8h, v1.8h, v12.8h\n"
+    "ldr x28, [x16, #0x90]\n"
+    "fmla v30.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x27, x14]\n"
+    "fmla v29.8h, v2.8h, v11.8h\n"
+    "ldr x27, [x16, #0x98]\n"
+    "fmla v28.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x26, x14]\n"
+    "fmla v26.8h, v8.8h, v10.8h\n"
+    "ldr x26, [x16, #0xa0]\n"
+    "fmla v25.8h, v7.8h, v10.8h\n"
+    "ldr q13, [x15, #0x0]\n"
+    "fmla v24.8h, v6.8h, v10.8h\n"
+    "fmla v22.8h, v5.8h, v10.8h\n"
+    "fmla v21.8h, v4.8h, v10.8h\n"
+    "fmla v20.8h, v3.8h, v10.8h\n"
+    "fmla v18.8h, v2.8h, v10.8h\n"
+    "fmla v17.8h, v1.8h, v10.8h\n"
+    "fmla v16.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x25, x14]\n"
+    "fmla v31.8h, v3.8h, v9.8h\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla v27.8h, v0.8h, v9.8h\n"
+    "fmla v28.8h, v5.8h, v12.8h\n"
+    "fmla v24.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x23, x14]\n"
+    "fmla v23.8h, v6.8h, v11.8h\n"
+    "ldr x23, [x16, #0xb8]\n"
+    "fmla v19.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v31.8h, v5.8h, v10.8h\n"
+    "ldr x24, [x16, #0xb0]\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v3.8h, v10.8h\n"
+    "fmla v27.8h, v2.8h, v10.8h\n"
+    "fmla v26.8h, v1.8h, v10.8h\n"
+    "fmla v25.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x9, x14]\n"
+    "fmla v20.8h, v8.8h, v11.8h\n"
+    "ldr x9, [x16, #0xc8]\n"
+    "fmla v16.8h, v5.8h, v11.8h\n"
+    "ldr q11, [x10, x14]\n"
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "ldr x10, [x16, #0xc0]\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "fmla v26.8h, v2.8h, v12.8h\n"
+    "fmla v25.8h, v1.8h, v12.8h\n"
+    "fmla v24.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x27, x14]\n"
+    "fmla v19.8h, v7.8h, v11.8h\n"
+    "ldr x27, [x16, #0xd8]\n"
+    "fmla v18.8h, v6.8h, v11.8h\n"
+    "ldr q11, [x28, x14]\n"
+    "fmla v31.8h, v7.8h, v10.8h\n"
+    "ldr x28, [x16, #0xd0]\n"
+    "fmla v30.8h, v6.8h, v10.8h\n"
+    "fmla v27.8h, v4.8h, v10.8h\n"
+    "fmla v26.8h, v3.8h, v10.8h\n"
+    "fmla v23.8h, v1.8h, v10.8h\n"
+    "fmla v22.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x26, x14]\n"
+    "fmla v17.8h, v8.8h, v11.8h\n"
+    "ldr x26, [x16, #0xe0]\n"
+    "fmla v16.8h, v7.8h, v11.8h\n"
+    "ldr q11, [x25, x14]\n"
+    "fmla v29.8h, v8.8h, v12.8h\n"
+    "ldr x25, [x16, #0xe8]\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmla v25.8h, v5.8h, v12.8h\n"
+    "fmla v24.8h, v4.8h, v12.8h\n"
+    "fmla v21.8h, v2.8h, v12.8h\n"
+    "fmla v20.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x24, x14]\n"
+    "fmla v31.8h, v2.8h, v10.8h\n"
+    "ldr x24, [x16, #0xf0]\n"
+    "fmla v30.8h, v1.8h, v10.8h\n"
+    "fmla v29.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x23, x14]\n"
+    "fmla v27.8h, v7.8h, v11.8h\n"
+    "ldr x23, [x16, #0xf8]\n"
+    "fmla v26.8h, v6.8h, v11.8h\n"
+    "fmla v23.8h, v4.8h, v11.8h\n"
+    "fmla v22.8h, v3.8h, v11.8h\n"
+    "fmla v19.8h, v1.8h, v11.8h\n"
+    "fmla v18.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x10, x14]\n"
+    "fmla v30.8h, v2.8h, v12.8h\n"
+    "ldr x10, [x16, #0x100]\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "fmla v28.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x9, x14]\n"
+    "fmla v31.8h, v6.8h, v10.8h\n"
+    "ldr x9, [x16, #0x108]\n"
+    "fmla v27.8h, v3.8h, v10.8h\n"
+    "fmla v23.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x28, x14]\n"
+    "fmla v25.8h, v8.8h, v11.8h\n"
+    "ldr x28, [x16, #0x110]\n"
+    "fmla v24.8h, v7.8h, v11.8h\n"
+    "fmla v21.8h, v5.8h, v11.8h\n"
+    "fmla v20.8h, v4.8h, v11.8h\n"
+    "fmla v17.8h, v2.8h, v11.8h\n"
+    "fmla v16.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x27, x14]\n"
+    "fmla v28.8h, v8.8h, v12.8h\n"
+    "ldr x27, [x16, #0x118]\n"
+    "fmla v24.8h, v5.8h, v12.8h\n"
+    "fmla v20.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x26, x14]\n"
+    "fmla v27.8h, v6.8h, v10.8h\n"
+    "fmla v23.8h, v3.8h, v10.8h\n"
+    "fmla v19.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x25, x14]\n"
+    "fmla v22.8h, v7.8h, v11.8h\n"
+    "fmla v21.8h, v6.8h, v11.8h\n"
+    "fmla v23.8h, v8.8h, v11.8h\n"
+    "fmla v19.8h, v5.8h, v11.8h\n"
+    "fmla v18.8h, v4.8h, v11.8h\n"
+    "fmla v17.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v24.8h, v8.8h, v12.8h\n"
+    "fmla v20.8h, v5.8h, v12.8h\n"
+    "fmla v16.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x23, x14]\n"
+    "fmla v19.8h, v8.8h, v10.8h\n"
+    "fmla v18.8h, v7.8h, v10.8h\n"
+    "fmla v17.8h, v6.8h, v10.8h\n"
+    "ldr q10, [x10, x14]\n"
+    "fmla v22.8h, v8.8h, v11.8h\n"
+    "fmla v21.8h, v7.8h, v11.8h\n"
+    "fmla v20.8h, v6.8h, v11.8h\n"
+    "fmla v18.8h, v5.8h, v11.8h\n"
+    "fmla v17.8h, v4.8h, v11.8h\n"
+    "fmla v16.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x9, x14]\n"
+    "fmla v31.8h, v4.8h, v10.8h\n"
+    "ldp x10, x9, [x16, #0x0]\n"
+    "fmla v18.8h, v8.8h, v12.8h\n"
+    "ldr q9, [x10, x13]\n"
+    "fmla v17.8h, v7.8h, v12.8h\n"
+    "fmla v16.8h, v6.8h, v12.8h\n"
+    "ldr q12, [x28, x14]\n"
+    "fmla v30.8h, v3.8h, v10.8h\n"
+    "fmla v27.8h, v1.8h, v10.8h\n"
+    "fmla v26.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x27, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v29.8h, v5.8h, v11.8h\n"
+    "ldp x28, x27, [x16, #0x10]\n"
+    "fmla v28.8h, v4.8h, v11.8h\n"
+    "ldr q0, [x15, #0x10]\n"
+    "fmla v25.8h, v2.8h, v11.8h\n"
+    "ldr q2, [x15, #0x30]\n"
+    "fmla v24.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x28, x13]\n"
+    "fmla v23.8h, v7.8h, v12.8h\n"
+    "ldr q1, [x15, #0x20]\n"
+    "fmla v22.8h, v6.8h, v12.8h\n"
+    "ldr q6, [x15, #0x70]\n"
+    "fmla v19.8h, v4.8h, v12.8h\n"
+    "fmla v18.8h, v3.8h, v12.8h\n"
+    "ldr q12, [x27, x13]\n"
+    "fmla v21.8h, v8.8h, v10.8h\n"
+    "ldr q3, [x15, #0x40]\n"
+    "fmla v20.8h, v7.8h, v10.8h\n"
+    "ldr q7, [x15, #0x80]\n"
+    "fmla v17.8h, v5.8h, v10.8h\n"
+    "ldr q5, [x15, #0x60]\n"
+    "fmla v16.8h, v4.8h, v10.8h\n"
+    "ldr q10, [x9, x13]\n"
+    "add x13, x13, #0x10\n"
+    "fmax v31.8h, v31.8h, v15.8h\n"
+    "ldr q4, [x15, #0x50]\n"
+    "cmp x13, x11, LSL #4\n"
+    "fmax v30.8h, v30.8h, v15.8h\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "fmax v29.8h, v29.8h, v15.8h\n"
+    "fmax v28.8h, v28.8h, v15.8h\n"
+    "fmin v31.8h, v31.8h, v14.8h\n"
+    "str q31, [x22, x12]\n"
+    "fmin v30.8h, v30.8h, v14.8h\n"
+    "fmin v29.8h, v29.8h, v14.8h\n"
+    "ldr x22, [x17, #0x20]\n"
+    "fmin v28.8h, v28.8h, v14.8h\n"
+    "str q30, [x21, x12]\n"
+    "fmax v27.8h, v27.8h, v15.8h\n"
+    "fmax v26.8h, v26.8h, v15.8h\n"
+    "str q29, [x20, x12]\n"
+    "fmax v25.8h, v25.8h, v15.8h\n"
+    "str q28, [x19, x12]\n"
+    "fmax v24.8h, v24.8h, v15.8h\n"
+    "ldr x21, [x17, #0x28]\n"
+    "fmin v27.8h, v27.8h, v14.8h\n"
+    "ldr x20, [x17, #0x30]\n"
+    "fmin v26.8h, v26.8h, v14.8h\n"
+    "ldr x19, [x17, #0x38]\n"
+    "fmin v25.8h, v25.8h, v14.8h\n"
+    "str q27, [x22, x12]\n"
+    "fmin v24.8h, v24.8h, v14.8h\n"
+    "str q26, [x21, x12]\n"
+    "fmax v23.8h, v23.8h, v15.8h\n"
+    "str q25, [x20, x12]\n"
+    "fmax v22.8h, v22.8h, v15.8h\n"
+    "str q24, [x19, x12]\n"
+    "fmax v21.8h, v21.8h, v15.8h\n"
+    "ldr x22, [x17, #0x40]\n"
+    "fmin v23.8h, v23.8h, v14.8h\n"
+    "ldr x21, [x17, #0x48]\n"
+    "fmin v22.8h, v22.8h, v14.8h\n"
+    "ldr x20, [x17, #0x50]\n"
+    "fmin v21.8h, v21.8h, v14.8h\n"
+    "str q23, [x22, x12]\n"
+    "fmax v20.8h, v20.8h, v15.8h\n"
+    "str q22, [x21, x12]\n"
+    "fmax v19.8h, v19.8h, v15.8h\n"
+    "str q21, [x20, x12]\n"
+    "fmax v18.8h, v18.8h, v15.8h\n"
+    "ldr x19, [x17, #0x58]\n"
+    "fmin v20.8h, v20.8h, v14.8h\n"
+    "ldr x22, [x17, #0x60]\n"
+    "fmin v19.8h, v19.8h, v14.8h\n"
+    "ldr x21, [x17, #0x68]\n"
+    "fmin v18.8h, v18.8h, v14.8h\n"
+    "str q20, [x19, x12]\n"
+    "fmax v17.8h, v17.8h, v15.8h\n"
+    "str q19, [x22, x12]\n"
+    "fmax v16.8h, v16.8h, v15.8h\n"
+    "str q18, [x21, x12]\n"
+    "ldr x20, [x17, #0x70]\n"
+    "fmin v17.8h, v17.8h, v14.8h\n"
+    "ldr x19, [x17, #0x78]\n"
+    "fmin v16.8h, v16.8h, v14.8h\n"
+    "str q17, [x20, x12]\n"
+    "str q16, [x19, x12]\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "ldr x26, [x16, #0x20]\n"
+    "add x12, x12, #0x10\n"
+    "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "ldr x25, [x16, #0x28]\n"
+    "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "ldr x24, [x16, #0x30]\n"
+    "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+    "ldr x23, [x16, #0x38]\n"
+    "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+    "ldr x10, [x16, #0x40]\n"
+    "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+    "ldr x9, [x16, #0x48]\n"
+    "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+    "ldr x28, [x16, #0x50]\n"
+    "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+    "ldr x27, [x16, #0x58]\n"
+    "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+    "ldr q9, [x24, x14]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x26, x14]\n"
+    "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr q11, [x25, x14]\n"
+    "fmla v30.8h, v8.8h, v12.8h\n"
+    "ldr x26, [x16, #0x60]\n"
+    "fmla v29.8h, v7.8h, v12.8h\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla v26.8h, v5.8h, v12.8h\n"
+    "ldr x24, [x16, #0x70]\n"
+    "fmla v28.8h, v6.8h, v12.8h\n"
+    "ldr x22, [x17, #0x0]\n"
+    "fmla v25.8h, v4.8h, v12.8h\n"
+    "ldr x21, [x17, #0x8]\n"
+    "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+    "ldr x20, [x17, #0x10]\n"
+    "fmla v22.8h, v2.8h, v12.8h\n"
+    "ldr x19, [x17, #0x18]\n"
+    "fmla v21.8h, v1.8h, v12.8h\n"
+    "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x23, x14]\n"
+    "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+    "ldr q10, [x9, x14]\n"
+    "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+    "ldr q11, [x10, x14]\n"
+    "fmla v27.8h, v8.8h, v9.8h\n"
+    "ldr x23, [x16, #0x78]\n"
+    "fmla v26.8h, v7.8h, v9.8h\n"
+    "ldr x10, [x16, #0x80]\n"
+    "fmla v25.8h, v6.8h, v9.8h\n"
+    "ldr x9, [x16, #0x88]\n"
+    "fmla v23.8h, v5.8h, v9.8h\n"
+    "fmla v22.8h, v4.8h, v9.8h\n"
+    "fmla v21.8h, v3.8h, v9.8h\n"
+    "fmla v19.8h, v2.8h, v9.8h\n"
+    "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+    "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+    "ldr q9, [x28, x14]\n"
+    "fmla v31.8h, v1.8h, v12.8h\n"
+    "ldr x28, [x16, #0x90]\n"
+    "fmla v30.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x27, x14]\n"
+    "fmla v29.8h, v2.8h, v11.8h\n"
+    "ldr x27, [x16, #0x98]\n"
+    "fmla v28.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x26, x14]\n"
+    "fmla v26.8h, v8.8h, v10.8h\n"
+    "ldr x26, [x16, #0xa0]\n"
+    "fmla v25.8h, v7.8h, v10.8h\n"
+    "fmla v24.8h, v6.8h, v10.8h\n"
+    "fmla v22.8h, v5.8h, v10.8h\n"
+    "fmla v21.8h, v4.8h, v10.8h\n"
+    "fmla v20.8h, v3.8h, v10.8h\n"
+    "fmla v18.8h, v2.8h, v10.8h\n"
+    "fmla v17.8h, v1.8h, v10.8h\n"
+    "fmla v16.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x25, x14]\n"
+    "fmla v31.8h, v3.8h, v9.8h\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla v27.8h, v0.8h, v9.8h\n"
+    "fmla v28.8h, v5.8h, v12.8h\n"
+    "fmla v24.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x23, x14]\n"
+    "fmla v23.8h, v6.8h, v11.8h\n"
+    "ldr x23, [x16, #0xb8]\n"
+    "fmla v19.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v31.8h, v5.8h, v10.8h\n"
+    "ldr x24, [x16, #0xb0]\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v3.8h, v10.8h\n"
+    "fmla v27.8h, v2.8h, v10.8h\n"
+    "fmla v26.8h, v1.8h, v10.8h\n"
+    "fmla v25.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x9, x14]\n"
+    "fmla v20.8h, v8.8h, v11.8h\n"
+    "ldr x9, [x16, #0xc8]\n"
+    "fmla v16.8h, v5.8h, v11.8h\n"
+    "ldr q11, [x10, x14]\n"
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "ldr x10, [x16, #0xc0]\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "fmla v26.8h, v2.8h, v12.8h\n"
+    "fmla v25.8h, v1.8h, v12.8h\n"
+    "fmla v24.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x27, x14]\n"
+    "fmla v19.8h, v7.8h, v11.8h\n"
+    "ldr x27, [x16, #0xd8]\n"
+    "fmla v18.8h, v6.8h, v11.8h\n"
+    "ldr q11, [x28, x14]\n"
+    "fmla v31.8h, v7.8h, v10.8h\n"
+    "ldr x28, [x16, #0xd0]\n"
+    "fmla v30.8h, v6.8h, v10.8h\n"
+    "fmla v27.8h, v4.8h, v10.8h\n"
+    "fmla v26.8h, v3.8h, v10.8h\n"
+    "fmla v23.8h, v1.8h, v10.8h\n"
+    "fmla v22.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x26, x14]\n"
+    "fmla v17.8h, v8.8h, v11.8h\n"
+    "ldr x26, [x16, #0xe0]\n"
+    "fmla v16.8h, v7.8h, v11.8h\n"
+    "ldr q11, [x25, x14]\n"
+    "fmla v29.8h, v8.8h, v12.8h\n"
+    "ldr x25, [x16, #0xe8]\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "fmla v25.8h, v5.8h, v12.8h\n"
+    "fmla v24.8h, v4.8h, v12.8h\n"
+    "fmla v21.8h, v2.8h, v12.8h\n"
+    "fmla v20.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x24, x14]\n"
+    "fmla v31.8h, v2.8h, v10.8h\n"
+    "ldr x24, [x16, #0xf0]\n"
+    "fmla v30.8h, v1.8h, v10.8h\n"
+    "fmla v29.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x23, x14]\n"
+    "fmla v27.8h, v7.8h, v11.8h\n"
+    "ldr x23, [x16, #0xf8]\n"
+    "fmla v26.8h, v6.8h, v11.8h\n"
+    "fmla v23.8h, v4.8h, v11.8h\n"
+    "fmla v22.8h, v3.8h, v11.8h\n"
+    "fmla v19.8h, v1.8h, v11.8h\n"
+    "fmla v18.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x10, x14]\n"
+    "fmla v30.8h, v2.8h, v12.8h\n"
+    "ldr x10, [x16, #0x100]\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "fmla v28.8h, v0.8h, v12.8h\n"
+    "ldr q12, [x9, x14]\n"
+    "fmla v31.8h, v6.8h, v10.8h\n"
+    "ldr x9, [x16, #0x108]\n"
+    "fmla v27.8h, v3.8h, v10.8h\n"
+    "fmla v23.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x28, x14]\n"
+    "fmla v25.8h, v8.8h, v11.8h\n"
+    "ldr x28, [x16, #0x110]\n"
+    "fmla v24.8h, v7.8h, v11.8h\n"
+    "fmla v21.8h, v5.8h, v11.8h\n"
+    "fmla v20.8h, v4.8h, v11.8h\n"
+    "fmla v17.8h, v2.8h, v11.8h\n"
+    "fmla v16.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x27, x14]\n"
+    "fmla v28.8h, v8.8h, v12.8h\n"
+    "ldr x27, [x16, #0x118]\n"
+    "fmla v24.8h, v5.8h, v12.8h\n"
+    "fmla v20.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x26, x14]\n"
+    "fmla v27.8h, v6.8h, v10.8h\n"
+    "fmla v23.8h, v3.8h, v10.8h\n"
+    "fmla v19.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x25, x14]\n"
+    "fmla v22.8h, v7.8h, v11.8h\n"
+    "fmla v21.8h, v6.8h, v11.8h\n"
+    "fmla v23.8h, v8.8h, v11.8h\n"
+    "fmla v19.8h, v5.8h, v11.8h\n"
+    "fmla v18.8h, v4.8h, v11.8h\n"
+    "fmla v17.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v24.8h, v8.8h, v12.8h\n"
+    "fmla v20.8h, v5.8h, v12.8h\n"
+    "fmla v16.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x23, x14]\n"
+    "fmla v19.8h, v8.8h, v10.8h\n"
+    "fmla v18.8h, v7.8h, v10.8h\n"
+    "fmla v17.8h, v6.8h, v10.8h\n"
+    "ldr q10, [x10, x14]\n"
+    "fmla v22.8h, v8.8h, v11.8h\n"
+    "fmla v21.8h, v7.8h, v11.8h\n"
+    "fmla v20.8h, v6.8h, v11.8h\n"
+    "fmla v18.8h, v5.8h, v11.8h\n"
+    "fmla v17.8h, v4.8h, v11.8h\n"
+    "fmla v16.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x9, x14]\n"
+    "fmla v31.8h, v4.8h, v10.8h\n"
+    "fmla v18.8h, v8.8h, v12.8h\n"
+    "fmla v17.8h, v7.8h, v12.8h\n"
+    "fmla v16.8h, v6.8h, v12.8h\n"
+    "ldr q12, [x28, x14]\n"
+    "fmla v30.8h, v3.8h, v10.8h\n"
+    "fmla v27.8h, v1.8h, v10.8h\n"
+    "fmla v26.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x27, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v29.8h, v5.8h, v11.8h\n"
+    "fmla v28.8h, v4.8h, v11.8h\n"
+    "fmla v25.8h, v2.8h, v11.8h\n"
+    "fmla v24.8h, v1.8h, v11.8h\n"
+    "fmla v23.8h, v7.8h, v12.8h\n"
+    "fmla v22.8h, v6.8h, v12.8h\n"
+    "fmla v19.8h, v4.8h, v12.8h\n"
+    "fmla v18.8h, v3.8h, v12.8h\n"
+    "fmla v21.8h, v8.8h, v10.8h\n"
+    "fmla v20.8h, v7.8h, v10.8h\n"
+    "fmla v17.8h, v5.8h, v10.8h\n"
+    "fmla v16.8h, v4.8h, v10.8h\n"
+    "fmax v31.8h, v31.8h, v15.8h\n"
+    "fmax v30.8h, v30.8h, v15.8h\n"
+    "fmax v29.8h, v29.8h, v15.8h\n"
+    "fmin v31.8h, v31.8h, v14.8h\n"
+    "str q31, [x22, x12]\n"
+    "fmin v30.8h, v30.8h, v14.8h\n"
+    "fmin v29.8h, v29.8h, v14.8h\n"
+    "ldr x22, [x17, #0x20]\n"
+    "fmax v28.8h, v28.8h, v15.8h\n"
+    "str q30, [x21, x12]\n"
+    "fmax v27.8h, v27.8h, v15.8h\n"
+    "fmax v26.8h, v26.8h, v15.8h\n"
+    "str q29, [x20, x12]\n"
+    "fmin v28.8h, v28.8h, v14.8h\n"
+    "ldr x21, [x17, #0x28]\n"
+    "fmax v25.8h, v25.8h, v15.8h\n"
+    "ldr x20, [x17, #0x30]\n"
+    "fmin v27.8h, v27.8h, v14.8h\n"
+    "str q28, [x19, x12]\n"
+    "fmin v26.8h, v26.8h, v14.8h\n"
+    "ldr x19, [x17, #0x38]\n"
+    "fmin v25.8h, v25.8h, v14.8h\n"
+    "str q27, [x22, x12]\n"
+    "fmax v24.8h, v24.8h, v15.8h\n"
+    "str q26, [x21, x12]\n"
+    "fmax v23.8h, v23.8h, v15.8h\n"
+    "str q25, [x20, x12]\n"
+    "fmax v22.8h, v22.8h, v15.8h\n"
+    "ldr x22, [x17, #0x40]\n"
+    "fmin v24.8h, v24.8h, v14.8h\n"
+    "ldr x21, [x17, #0x48]\n"
+    "fmin v23.8h, v23.8h, v14.8h\n"
+    "ldr x20, [x17, #0x50]\n"
+    "fmin v22.8h, v22.8h, v14.8h\n"
+    "str q24, [x19, x12]\n"
+    "fmax v21.8h, v21.8h, v15.8h\n"
+    "str q23, [x22, x12]\n"
+    "fmax v20.8h, v20.8h, v15.8h\n"
+    "str q22, [x21, x12]\n"
+    "fmax v19.8h, v19.8h, v15.8h\n"
+    "ldr x19, [x17, #0x58]\n"
+    "fmin v21.8h, v21.8h, v14.8h\n"
+    "ldr x22, [x17, #0x60]\n"
+    "fmin v20.8h, v20.8h, v14.8h\n"
+    "ldr x21, [x17, #0x68]\n"
+    "fmin v19.8h, v19.8h, v14.8h\n"
+    "str q21, [x20, x12]\n"
+    "fmax v18.8h, v18.8h, v15.8h\n"
+    "str q20, [x19, x12]\n"
+    "fmax v17.8h, v17.8h, v15.8h\n"
+    "str q19, [x22, x12]\n"
+    "fmax v16.8h, v16.8h, v15.8h\n"
+    "ldr x20, [x17, #0x70]\n"
+    "fmin v18.8h, v18.8h, v14.8h\n"
+    "ldr x19, [x17, #0x78]\n"
+    "fmin v17.8h, v17.8h, v14.8h\n"
+    "str q18, [x21, x12]\n"
+    "fmin v16.8h, v16.8h, v14.8h\n"
+    "str q17, [x20, x12]\n"
+    "str q16, [x19, x12]\n"
+    "3:"  // Oddments
+    "tst %x[n_channels], #0x1\n"
+    "beq 72f\n"
+    "ldr q13, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "mov x12, x14\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "ldr x10, [x16, #0x0]\n"
+    "add x10, x10, x14\n"
+    "ldr x9, [x16, #0x8]\n"
+    "ldr x28, [x16, #0x10]\n"
+    "add x9, x9, x14\n"
+    "ldr x27, [x16, #0x18]\n"
+    "add x28, x28, x14\n"
+    "add x27, x27, x14\n"
+    "tbz %x[n_channels], #1, 4f\n"
+    "ld1 { v9.s }[0], [x10], #0x4\n"
+    "ld1 { v10.s }[0], [x9], #0x4\n"
+    "ld1 { v11.s }[0], [x28], #0x4\n"
+    "ld1 { v12.s }[0], [x27], #0x4\n"
+    "tbz %x[n_channels], #0, 5f\n"
+    "ld1 { v9.h }[2], [x10], #0x2\n"
+    "ld1 { v10.h }[2], [x9], #0x2\n"
+    "ld1 { v11.h }[2], [x28], #0x2\n"
+    "ld1 { v12.h }[2], [x27], #0x2\n"
+    "b 5f\n"
+    "4:"  // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+    "ld1 { v9.h }[0], [x10], #0x2\n"
+    "ld1 { v10.h }[0], [x9], #0x2\n"
+    "ld1 { v11.h }[0], [x28], #0x2\n"
+    "ld1 { v12.h }[0], [x27], #0x2\n"
+    "5:"  // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+    "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "ldr x26, [x16, #0x20]\n"
+    "add x26, x26, x14\n"
+    "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+    "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+    "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+    "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+    "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+    "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+    "fmla v30.8h, v8.8h, v12.8h\n"
+    "fmla v29.8h, v7.8h, v12.8h\n"
+    "fmla v26.8h, v5.8h, v12.8h\n"
+    "fmla v28.8h, v6.8h, v12.8h\n"
+    "fmla v25.8h, v4.8h, v12.8h\n"
+    "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+    "fmla v22.8h, v2.8h, v12.8h\n"
+    "fmla v21.8h, v1.8h, v12.8h\n"
+    "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v10.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v10.h }[2], [x26], #0x2\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load input (5, 0): Bit 1: Unset
+    "ld1 { v10.h }[0], [x26], #0x2\n"
+    "7:"  // Oddments: Load input (5, 0): Bit 1: End
+    "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+    "ldr x25, [x16, #0x28]\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v11.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v11.h }[2], [x25], #0x2\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load input (5, 5): Bit 1: Unset
+    "ld1 { v11.h }[0], [x25], #0x2\n"
+    "9:"  // Oddments: Load input (5, 5): Bit 1: End
+    "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+    "ldr x24, [x16, #0x30]\n"
+    "add x24, x24, x14\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v9.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v9.h }[2], [x24], #0x2\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load input (3, 2): Bit 1: Unset
+    "ld1 { v9.h }[0], [x24], #0x2\n"
+    "11:"  // Oddments: Load input (3, 2): Bit 1: End
+    "fmla v27.8h, v8.8h, v9.8h\n"
+    "ldr x23, [x16, #0x38]\n"
+    "fmla v26.8h, v7.8h, v9.8h\n"
+    "add x23, x23, x14\n"
+    "fmla v25.8h, v6.8h, v9.8h\n"
+    "fmla v23.8h, v5.8h, v9.8h\n"
+    "fmla v22.8h, v4.8h, v9.8h\n"
+    "fmla v21.8h, v3.8h, v9.8h\n"
+    "fmla v19.8h, v2.8h, v9.8h\n"
+    "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+    "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ld1 { v12.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v12.h }[2], [x23], #0x2\n"
+    "b 13f\n"
+    "12:"  // Oddments: Load input (0, 1): Bit 1: Unset
+    "ld1 { v12.h }[0], [x23], #0x2\n"
+    "13:"  // Oddments: Load input (0, 1): Bit 1: End
+    "fmla v31.8h, v1.8h, v12.8h\n"
+    "ldr x10, [x16, #0x40]\n"
+    "fmla v30.8h, v0.8h, v12.8h\n"
+    "add x10, x10, x14\n"
+    "tbz %x[n_channels], #1, 14f\n"
+    "ld1 { v11.s }[0], [x10], #0x4\n"
+    "tbz %x[n_channels], #0, 15f\n"
+    "ld1 { v11.h }[2], [x10], #0x2\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load input (0, 4): Bit 1: Unset
+    "ld1 { v11.h }[0], [x10], #0x2\n"
+    "15:"  // Oddments: Load input (0, 4): Bit 1: End
+    "fmla v29.8h, v2.8h, v11.8h\n"
+    "ldr x9, [x16, #0x48]\n"
+    "fmla v28.8h, v1.8h, v11.8h\n"
+    "add x9, x9, x14\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v10.s }[0], [x9], #0x4\n"
+    "tbz %x[n_channels], #0, 17f\n"
+    "ld1 { v10.h }[2], [x9], #0x2\n"
+    "b 17f\n"
+    "16:"  // Oddments: Load input (3, 3): Bit 1: Unset
+    "ld1 { v10.h }[0], [x9], #0x2\n"
+    "17:"  // Oddments: Load input (3, 3): Bit 1: End
+    "fmla v26.8h, v8.8h, v10.8h\n"
+    "ldr x28, [x16, #0x50]\n"
+    "fmla v25.8h, v7.8h, v10.8h\n"
+    "add x28, x28, x14\n"
+    "fmla v24.8h, v6.8h, v10.8h\n"
+    "fmla v22.8h, v5.8h, v10.8h\n"
+    "fmla v21.8h, v4.8h, v10.8h\n"
+    "fmla v20.8h, v3.8h, v10.8h\n"
+    "fmla v18.8h, v2.8h, v10.8h\n"
+    "fmla v17.8h, v1.8h, v10.8h\n"
+    "fmla v16.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v9.s }[0], [x28], #0x4\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v9.h }[2], [x28], #0x2\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load input (1, 0): Bit 1: Unset
+    "ld1 { v9.h }[0], [x28], #0x2\n"
+    "19:"  // Oddments: Load input (1, 0): Bit 1: End
+    "fmla v31.8h, v3.8h, v9.8h\n"
+    "ldr x27, [x16, #0x58]\n"
+    "fmla v27.8h, v0.8h, v9.8h\n"
+    "add x27, x27, x14\n"
+    "tbz %x[n_channels], #1, 20f\n"
+    "ld1 { v12.s }[0], [x27], #0x4\n"
+    "tbz %x[n_channels], #0, 21f\n"
+    "ld1 { v12.h }[2], [x27], #0x2\n"
+    "b 21f\n"
+    "20:"  // Oddments: Load input (1, 5): Bit 1: Unset
+    "ld1 { v12.h }[0], [x27], #0x2\n"
+    "21:"  // Oddments: Load input (1, 5): Bit 1: End
+    "fmla v28.8h, v5.8h, v12.8h\n"
+    "ldr x26, [x16, #0x60]\n"
+    "fmla v24.8h, v2.8h, v12.8h\n"
+    "add x26, x26, x14\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "ld1 { v11.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "ld1 { v11.h }[2], [x26], #0x2\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load input (4, 0): Bit 1: Unset
+    "ld1 { v11.h }[0], [x26], #0x2\n"
+    "23:"  // Oddments: Load input (4, 0): Bit 1: End
+    "fmla v23.8h, v6.8h, v11.8h\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla v19.8h, v3.8h, v11.8h\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 24f\n"
+    "ld1 { v10.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 25f\n"
+    "ld1 { v10.h }[2], [x25], #0x2\n"
+    "b 25f\n"
+    "24:"  // Oddments: Load input (1, 2): Bit 1: Unset
+    "ld1 { v10.h }[0], [x25], #0x2\n"
+    "25:"  // Oddments: Load input (1, 2): Bit 1: End
+    "fmla v31.8h, v5.8h, v10.8h\n"
+    "ldr x24, [x16, #0x70]\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "add x24, x24, x14\n"
+    "fmla v29.8h, v3.8h, v10.8h\n"
+    "fmla v27.8h, v2.8h, v10.8h\n"
+    "fmla v26.8h, v1.8h, v10.8h\n"
+    "fmla v25.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 26f\n"
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 27f\n"
+    "ld1 { v11.h }[2], [x24], #0x2\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load input (4, 5): Bit 1: Unset
+    "ld1 { v11.h }[0], [x24], #0x2\n"
+    "27:"  // Oddments: Load input (4, 5): Bit 1: End
+    "fmla v20.8h, v8.8h, v11.8h\n"
+    "ldr x23, [x16, #0x78]\n"
+    "fmla v16.8h, v5.8h, v11.8h\n"
+    "add x23, x23, x14\n"
+    "tbz %x[n_channels], #1, 28f\n"
+    "ld1 { v12.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 29f\n"
+    "ld1 { v12.h }[2], [x23], #0x2\n"
+    "b 29f\n"
+    "28:"  // Oddments: Load input (1, 3): Bit 1: Unset
+    "ld1 { v12.h }[0], [x23], #0x2\n"
+    "29:"  // Oddments: Load input (1, 3): Bit 1: End
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "ldr x10, [x16, #0x80]\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "add x10, x10, x14\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "fmla v26.8h, v2.8h, v12.8h\n"
+    "fmla v25.8h, v1.8h, v12.8h\n"
+    "fmla v24.8h, v0.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 30f\n"
+    "ld1 { v11.s }[0], [x10], #0x4\n"
+    "tbz %x[n_channels], #0, 31f\n"
+    "ld1 { v11.h }[2], [x10], #0x2\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load input (5, 1): Bit 1: Unset
+    "ld1 { v11.h }[0], [x10], #0x2\n"
+    "31:"  // Oddments: Load input (5, 1): Bit 1: End
+    "fmla v19.8h, v7.8h, v11.8h\n"
+    "ldr x9, [x16, #0x88]\n"
+    "fmla v18.8h, v6.8h, v11.8h\n"
+    "add x9, x9, x14\n"
+    "tbz %x[n_channels], #1, 32f\n"
+    "ld1 { v10.s }[0], [x9], #0x4\n"
+    "tbz %x[n_channels], #0, 33f\n"
+    "ld1 { v10.h }[2], [x9], #0x2\n"
+    "b 33f\n"
+    "32:"  // Oddments: Load input (2, 1): Bit 1: Unset
+    "ld1 { v10.h }[0], [x9], #0x2\n"
+    "33:"  // Oddments: Load input (2, 1): Bit 1: End
+    "fmla v31.8h, v7.8h, v10.8h\n"
+    "ldr x28, [x16, #0x90]\n"
+    "fmla v30.8h, v6.8h, v10.8h\n"
+    "add x28, x28, x14\n"
+    "fmla v27.8h, v4.8h, v10.8h\n"
+    "fmla v26.8h, v3.8h, v10.8h\n"
+    "fmla v23.8h, v1.8h, v10.8h\n"
+    "fmla v22.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 34f\n"
+    "ld1 { v11.s }[0], [x28], #0x4\n"
+    "tbz %x[n_channels], #0, 35f\n"
+    "ld1 { v11.h }[2], [x28], #0x2\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load input (5, 4): Bit 1: Unset
+    "ld1 { v11.h }[0], [x28], #0x2\n"
+    "35:"  // Oddments: Load input (5, 4): Bit 1: End
+    "fmla v17.8h, v8.8h, v11.8h\n"
+    "ldr x27, [x16, #0x98]\n"
+    "fmla v16.8h, v7.8h, v11.8h\n"
+    "add x27, x27, x14\n"
+    "tbz %x[n_channels], #1, 36f\n"
+    "ld1 { v12.s }[0], [x27], #0x4\n"
+    "tbz %x[n_channels], #0, 37f\n"
+    "ld1 { v12.h }[2], [x27], #0x2\n"
+    "b 37f\n"
+    "36:"  // Oddments: Load input (2, 4): Bit 1: Unset
+    "ld1 { v12.h }[0], [x27], #0x2\n"
+    "37:"  // Oddments: Load input (2, 4): Bit 1: End
+    "fmla v29.8h, v8.8h, v12.8h\n"
+    "ldr x26, [x16, #0xa0]\n"
+    "fmla v28.8h, v7.8h, v12.8h\n"
+    "add x26, x26, x14\n"
+    "fmla v25.8h, v5.8h, v12.8h\n"
+    "fmla v24.8h, v4.8h, v12.8h\n"
+    "fmla v21.8h, v2.8h, v12.8h\n"
+    "fmla v20.8h, v1.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 38f\n"
+    "ld1 { v10.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 39f\n"
+    "ld1 { v10.h }[2], [x26], #0x2\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load input (0, 2): Bit 1: Unset
+    "ld1 { v10.h }[0], [x26], #0x2\n"
+    "39:"  // Oddments: Load input (0, 2): Bit 1: End
+    "fmla v31.8h, v2.8h, v10.8h\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla v30.8h, v1.8h, v10.8h\n"
+    "add x25, x25, x14\n"
+    "fmla v29.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 40f\n"
+    "ld1 { v11.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 41f\n"
+    "ld1 { v11.h }[2], [x25], #0x2\n"
+    "b 41f\n"
+    "40:"  // Oddments: Load input (3, 1): Bit 1: Unset
+    "ld1 { v11.h }[0], [x25], #0x2\n"
+    "41:"  // Oddments: Load input (3, 1): Bit 1: End
+    "fmla v27.8h, v7.8h, v11.8h\n"
+    "ldr x24, [x16, #0xb0]\n"
+    "fmla v26.8h, v6.8h, v11.8h\n"
+    "add x24, x24, x14\n"
+    "fmla v23.8h, v4.8h, v11.8h\n"
+    "fmla v22.8h, v3.8h, v11.8h\n"
+    "fmla v19.8h, v1.8h, v11.8h\n"
+    "fmla v18.8h, v0.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 42f\n"
+    "ld1 { v12.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 43f\n"
+    "ld1 { v12.h }[2], [x24], #0x2\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load input (0, 3): Bit 1: Unset
+    "ld1 { v12.h }[0], [x24], #0x2\n"
+    "43:"  // Oddments: Load input (0, 3): Bit 1: End
+    "fmla v30.8h, v2.8h, v12.8h\n"
+    "ldr x23, [x16, #0xb8]\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "add x23, x23, x14\n"
+    "fmla v28.8h, v0.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 44f\n"
+    "ld1 { v10.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 45f\n"
+    "ld1 { v10.h }[2], [x23], #0x2\n"
+    "b 45f\n"
+    "44:"  // Oddments: Load input (2, 0): Bit 1: Unset
+    "ld1 { v10.h }[0], [x23], #0x2\n"
+    "45:"  // Oddments: Load input (2, 0): Bit 1: End
+    "fmla v31.8h, v6.8h, v10.8h\n"
+    "ldr x10, [x16, #0xc0]\n"
+    "fmla v27.8h, v3.8h, v10.8h\n"
+    "add x10, x10, x14\n"
+    "fmla v23.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 46f\n"
+    "ld1 { v11.s }[0], [x10], #0x4\n"
+    "tbz %x[n_channels], #0, 47f\n"
+    "ld1 { v11.h }[2], [x10], #0x2\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load input (3, 4): Bit 1: Unset
+    "ld1 { v11.h }[0], [x10], #0x2\n"
+    "47:"  // Oddments: Load input (3, 4): Bit 1: End
+    "fmla v25.8h, v8.8h, v11.8h\n"
+    "ldr x9, [x16, #0xc8]\n"
+    "fmla v24.8h, v7.8h, v11.8h\n"
+    "add x9, x9, x14\n"
+    "fmla v21.8h, v5.8h, v11.8h\n"
+    "fmla v20.8h, v4.8h, v11.8h\n"
+    "fmla v17.8h, v2.8h, v11.8h\n"
+    "fmla v16.8h, v1.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 48f\n"
+    "ld1 { v12.s }[0], [x9], #0x4\n"
+    "tbz %x[n_channels], #0, 49f\n"
+    "ld1 { v12.h }[2], [x9], #0x2\n"
+    "b 49f\n"
+    "48:"  // Oddments: Load input (2, 5): Bit 1: Unset
+    "ld1 { v12.h }[0], [x9], #0x2\n"
+    "49:"  // Oddments: Load input (2, 5): Bit 1: End
+    "fmla v28.8h, v8.8h, v12.8h\n"
+    "ldr x28, [x16, #0xd0]\n"
+    "fmla v24.8h, v5.8h, v12.8h\n"
+    "add x28, x28, x14\n"
+    "fmla v20.8h, v2.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 50f\n"
+    "ld1 { v10.s }[0], [x28], #0x4\n"
+    "tbz %x[n_channels], #0, 51f\n"
+    "ld1 { v10.h }[2], [x28], #0x2\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load input (3, 0): Bit 1: Unset
+    "ld1 { v10.h }[0], [x28], #0x2\n"
+    "51:"  // Oddments: Load input (3, 0): Bit 1: End
+    "fmla v27.8h, v6.8h, v10.8h\n"
+    "ldr x27, [x16, #0xd8]\n"
+    "fmla v23.8h, v3.8h, v10.8h\n"
+    "add x27, x27, x14\n"
+    "fmla v19.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 52f\n"
+    "ld1 { v11.s }[0], [x27], #0x4\n"
+    "tbz %x[n_channels], #0, 53f\n"
+    "ld1 { v11.h }[2], [x27], #0x2\n"
+    "b 53f\n"
+    "52:"  // Oddments: Load input (4, 2): Bit 1: Unset
+    "ld1 { v11.h }[0], [x27], #0x2\n"
+    "53:"  // Oddments: Load input (4, 2): Bit 1: End
+    "fmla v23.8h, v8.8h, v11.8h\n"
+    "ldr x26, [x16, #0xe0]\n"
+    "fmla v22.8h, v7.8h, v11.8h\n"
+    "add x26, x26, x14\n"
+    "fmla v21.8h, v6.8h, v11.8h\n"
+    "fmla v19.8h, v5.8h, v11.8h\n"
+    "fmla v18.8h, v4.8h, v11.8h\n"
+    "fmla v17.8h, v3.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 54f\n"
+    "ld1 { v12.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 55f\n"
+    "ld1 { v12.h }[2], [x26], #0x2\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load input (3, 5): Bit 1: Unset
+    "ld1 { v12.h }[0], [x26], #0x2\n"
+    "55:"  // Oddments: Load input (3, 5): Bit 1: End
+    "fmla v24.8h, v8.8h, v12.8h\n"
+    "ldr x25, [x16, #0xe8]\n"
+    "fmla v20.8h, v5.8h, v12.8h\n"
+    "add x25, x25, x14\n"
+    "fmla v16.8h, v2.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 56f\n"
+    "ld1 { v10.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 57f\n"
+    "ld1 { v10.h }[2], [x25], #0x2\n"
+    "b 57f\n"
+    "56:"  // Oddments: Load input (5, 2): Bit 1: Unset
+    "ld1 { v10.h }[0], [x25], #0x2\n"
+    "57:"  // Oddments: Load input (5, 2): Bit 1: End
+    "fmla v19.8h, v8.8h, v10.8h\n"
+    "ldr x24, [x16, #0xf0]\n"
+    "fmla v18.8h, v7.8h, v10.8h\n"
+    "add x24, x24, x14\n"
+    "fmla v17.8h, v6.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 58f\n"
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 59f\n"
+    "ld1 { v11.h }[2], [x24], #0x2\n"
+    "b 59f\n"
+    "58:"  // Oddments: Load input (4, 3): Bit 1: Unset
+    "ld1 { v11.h }[0], [x24], #0x2\n"
+    "59:"  // Oddments: Load input (4, 3): Bit 1: End
+    "fmla v22.8h, v8.8h, v11.8h\n"
+    "ldr x23, [x16, #0xf8]\n"
+    "fmla v21.8h, v7.8h, v11.8h\n"
+    "add x23, x23, x14\n"
+    "fmla v20.8h, v6.8h, v11.8h\n"
+    "fmla v18.8h, v5.8h, v11.8h\n"
+    "fmla v17.8h, v4.8h, v11.8h\n"
+    "fmla v16.8h, v3.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 60f\n"
+    "ld1 { v12.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 61f\n"
+    "ld1 { v12.h }[2], [x23], #0x2\n"
+    "b 61f\n"
+    "60:"  // Oddments: Load input (5, 3): Bit 1: Unset
+    "ld1 { v12.h }[0], [x23], #0x2\n"
+    "61:"  // Oddments: Load input (5, 3): Bit 1: End
+    "fmla v18.8h, v8.8h, v12.8h\n"
+    "ldr x10, [x16, #0x100]\n"
+    "fmla v17.8h, v7.8h, v12.8h\n"
+    "add x10, x10, x14\n"
+    "fmla v16.8h, v6.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 62f\n"
+    "ld1 { v10.s }[0], [x10], #0x4\n"
+    "tbz %x[n_channels], #0, 63f\n"
+    "ld1 { v10.h }[2], [x10], #0x2\n"
+    "b 63f\n"
+    "62:"  // Oddments: Load input (1, 1): Bit 1: Unset
+    "ld1 { v10.h }[0], [x10], #0x2\n"
+    "63:"  // Oddments: Load input (1, 1): Bit 1: End
+    "fmla v31.8h, v4.8h, v10.8h\n"
+    "ldr x9, [x16, #0x108]\n"
+    "fmla v30.8h, v3.8h, v10.8h\n"
+    "add x9, x9, x14\n"
+    "fmla v27.8h, v1.8h, v10.8h\n"
+    "fmla v26.8h, v0.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 64f\n"
+    "ld1 { v11.s }[0], [x9], #0x4\n"
+    "tbz %x[n_channels], #0, 65f\n"
+    "ld1 { v11.h }[2], [x9], #0x2\n"
+    "b 65f\n"
+    "64:"  // Oddments: Load input (1, 4): Bit 1: Unset
+    "ld1 { v11.h }[0], [x9], #0x2\n"
+    "65:"  // Oddments: Load input (1, 4): Bit 1: End
+    "fmla v29.8h, v5.8h, v11.8h\n"
+    "ldr x28, [x16, #0x110]\n"
+    "fmla v28.8h, v4.8h, v11.8h\n"
+    "add x28, x28, x14\n"
+    "fmla v25.8h, v2.8h, v11.8h\n"
+    "fmla v24.8h, v1.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 66f\n"
+    "ld1 { v12.s }[0], [x28], #0x4\n"
+    "tbz %x[n_channels], #0, 67f\n"
+    "ld1 { v12.h }[2], [x28], #0x2\n"
+    "b 67f\n"
+    "66:"  // Oddments: Load input (4, 1): Bit 1: Unset
+    "ld1 { v12.h }[0], [x28], #0x2\n"
+    "67:"  // Oddments: Load input (4, 1): Bit 1: End
+    "fmla v23.8h, v7.8h, v12.8h\n"
+    "ldr x27, [x16, #0x118]\n"
+    "fmla v22.8h, v6.8h, v12.8h\n"
+    "add x27, x27, x14\n"
+    "fmla v19.8h, v4.8h, v12.8h\n"
+    "fmla v18.8h, v3.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 68f\n"
+    "ld1 { v10.s }[0], [x27], #0x4\n"
+    "tbz %x[n_channels], #0, 69f\n"
+    "ld1 { v10.h }[2], [x27], #0x2\n"
+    "b 69f\n"
+    "68:"  // Oddments: Load input (4, 4): Bit 1: Unset
+    "ld1 { v10.h }[0], [x27], #0x2\n"
+    "69:"  // Oddments: Load input (4, 4): Bit 1: End
+    "fmla v21.8h, v8.8h, v10.8h\n"
+    "fmla v20.8h, v7.8h, v10.8h\n"
+    "fmla v17.8h, v5.8h, v10.8h\n"
+    "fmla v16.8h, v4.8h, v10.8h\n"
+    "fmax v31.8h, v31.8h, v15.8h\n"
+    "fmax v30.8h, v30.8h, v15.8h\n"
+    "fmax v29.8h, v29.8h, v15.8h\n"
+    "fmin v31.8h, v31.8h, v14.8h\n"
+    "fmin v30.8h, v30.8h, v14.8h\n"
+    "fmin v29.8h, v29.8h, v14.8h\n"
+    "fmax v28.8h, v28.8h, v15.8h\n"
+    "fmax v27.8h, v27.8h, v15.8h\n"
+    "fmax v26.8h, v26.8h, v15.8h\n"
+    "fmin v28.8h, v28.8h, v14.8h\n"
+    "fmin v27.8h, v27.8h, v14.8h\n"
+    "fmin v26.8h, v26.8h, v14.8h\n"
+    "fmax v25.8h, v25.8h, v15.8h\n"
+    "fmax v24.8h, v24.8h, v15.8h\n"
+    "fmax v23.8h, v23.8h, v15.8h\n"
+    "fmin v25.8h, v25.8h, v14.8h\n"
+    "fmin v24.8h, v24.8h, v14.8h\n"
+    "fmin v23.8h, v23.8h, v14.8h\n"
+    "fmax v22.8h, v22.8h, v15.8h\n"
+    "fmax v21.8h, v21.8h, v15.8h\n"
+    "fmax v20.8h, v20.8h, v15.8h\n"
+    "fmin v22.8h, v22.8h, v14.8h\n"
+    "fmin v21.8h, v21.8h, v14.8h\n"
+    "fmin v20.8h, v20.8h, v14.8h\n"
+    "fmax v19.8h, v19.8h, v15.8h\n"
+    "fmax v18.8h, v18.8h, v15.8h\n"
+    "fmax v17.8h, v17.8h, v15.8h\n"
+    "fmin v19.8h, v19.8h, v14.8h\n"
+    "fmin v18.8h, v18.8h, v14.8h\n"
+    "fmin v17.8h, v17.8h, v14.8h\n"
+    "fmax v16.8h, v16.8h, v15.8h\n"
+    "fmin v16.8h, v16.8h, v14.8h\n"
+    "tbz %x[n_channels], #1, 70f\n"
+    "ldr x22, [x17, #0x0]\n"
+    "ldr x21, [x17, #0x8]\n"
+    "add x22, x22, x12\n"
+    "ldr x20, [x17, #0x10]\n"
+    "ldr x19, [x17, #0x18]\n"
+    "add x21, x21, x12\n"
+    "st1 { v31.s }[0], [x22]\n"
+    "add x20, x20, x12\n"
+    "st1 { v30.s }[0], [x21]\n"
+    "ldr x22, [x17, #0x20]\n"
+    "add x19, x19, x12\n"
+    "st1 { v29.s }[0], [x20]\n"
+    "add x22, x22, x12\n"
+    "st1 { v28.s }[0], [x19]\n"
+    "ldr x21, [x17, #0x28]\n"
+    "add x21, x21, x12\n"
+    "st1 { v27.s }[0], [x22]\n"
+    "ldr x20, [x17, #0x30]\n"
+    "add x20, x20, x12\n"
+    "st1 { v26.s }[0], [x21]\n"
+    "ldr x19, [x17, #0x38]\n"
+    "add x19, x19, x12\n"
+    "st1 { v25.s }[0], [x20]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "add x22, x22, x12\n"
+    "st1 { v24.s }[0], [x19]\n"
+    "ldr x21, [x17, #0x48]\n"
+    "add x21, x21, x12\n"
+    "st1 { v23.s }[0], [x22]\n"
+    "ldr x20, [x17, #0x50]\n"
+    "add x20, x20, x12\n"
+    "st1 { v22.s }[0], [x21]\n"
+    "ldr x19, [x17, #0x58]\n"
+    "add x19, x19, x12\n"
+    "st1 { v21.s }[0], [x20]\n"
+    "ldr x22, [x17, #0x60]\n"
+    "add x22, x22, x12\n"
+    "st1 { v20.s }[0], [x19]\n"
+    "ldr x21, [x17, #0x68]\n"
+    "add x21, x21, x12\n"
+    "st1 { v19.s }[0], [x22]\n"
+    "ldr x20, [x17, #0x70]\n"
+    "add x20, x20, x12\n"
+    "st1 { v18.s }[0], [x21]\n"
+    "ldr x19, [x17, #0x78]\n"
+    "add x19, x19, x12\n"
+    "st1 { v17.s }[0], [x20]\n"
+    "add x12, x12, #0x4\n"
+    "st1 { v16.s }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 71f\n"
+    "ldr x22, [x17, #0x0]\n"
+    "ldr x21, [x17, #0x8]\n"
+    "add x22, x22, x12\n"
+    "ldr x20, [x17, #0x10]\n"
+    "ldr x19, [x17, #0x18]\n"
+    "add x21, x21, x12\n"
+    "st1 { v31.h }[2], [x22]\n"
+    "add x20, x20, x12\n"
+    "st1 { v30.h }[2], [x21]\n"
+    "ldr x22, [x17, #0x20]\n"
+    "add x19, x19, x12\n"
+    "st1 { v29.h }[2], [x20]\n"
+    "add x22, x22, x12\n"
+    "st1 { v28.h }[2], [x19]\n"
+    "ldr x21, [x17, #0x28]\n"
+    "add x21, x21, x12\n"
+    "st1 { v27.h }[2], [x22]\n"
+    "ldr x20, [x17, #0x30]\n"
+    "add x20, x20, x12\n"
+    "st1 { v26.h }[2], [x21]\n"
+    "ldr x19, [x17, #0x38]\n"
+    "add x19, x19, x12\n"
+    "st1 { v25.h }[2], [x20]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "add x22, x22, x12\n"
+    "st1 { v24.h }[2], [x19]\n"
+    "ldr x21, [x17, #0x48]\n"
+    "add x21, x21, x12\n"
+    "st1 { v23.h }[2], [x22]\n"
+    "ldr x20, [x17, #0x50]\n"
+    "add x20, x20, x12\n"
+    "st1 { v22.h }[2], [x21]\n"
+    "ldr x19, [x17, #0x58]\n"
+    "add x19, x19, x12\n"
+    "st1 { v21.h }[2], [x20]\n"
+    "ldr x22, [x17, #0x60]\n"
+    "add x22, x22, x12\n"
+    "st1 { v20.h }[2], [x19]\n"
+    "ldr x21, [x17, #0x68]\n"
+    "add x21, x21, x12\n"
+    "st1 { v19.h }[2], [x22]\n"
+    "ldr x20, [x17, #0x70]\n"
+    "add x20, x20, x12\n"
+    "st1 { v18.h }[2], [x21]\n"
+    "ldr x19, [x17, #0x78]\n"
+    "add x19, x19, x12\n"
+    "st1 { v17.h }[2], [x20]\n"
+    "st1 { v16.h }[2], [x19]\n"
+    "b 71f\n"
+    "70:"  // Oddments: Store: Bit 1: Unset
+    "ldr x22, [x17, #0x0]\n"
+    "add x22, x22, x12\n"
+    "ldr x21, [x17, #0x8]\n"
+    "ldr x20, [x17, #0x10]\n"
+    "add x21, x21, x12\n"
+    "st1 { v31.h }[0], [x22]\n"
+    "ldr x19, [x17, #0x18]\n"
+    "add x20, x20, x12\n"
+    "st1 { v30.h }[0], [x21]\n"
+    "add x19, x19, x12\n"
+    "st1 { v29.h }[0], [x20]\n"
+    "ldr x22, [x17, #0x20]\n"
+    "add x22, x22, x12\n"
+    "st1 { v28.h }[0], [x19]\n"
+    "ldr x21, [x17, #0x28]\n"
+    "add x21, x21, x12\n"
+    "st1 { v27.h }[0], [x22]\n"
+    "ldr x20, [x17, #0x30]\n"
+    "add x20, x20, x12\n"
+    "st1 { v26.h }[0], [x21]\n"
+    "ldr x19, [x17, #0x38]\n"
+    "add x19, x19, x12\n"
+    "st1 { v25.h }[0], [x20]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "add x22, x22, x12\n"
+    "st1 { v24.h }[0], [x19]\n"
+    "ldr x21, [x17, #0x48]\n"
+    "add x21, x21, x12\n"
+    "st1 { v23.h }[0], [x22]\n"
+    "ldr x20, [x17, #0x50]\n"
+    "add x20, x20, x12\n"
+    "st1 { v22.h }[0], [x21]\n"
+    "ldr x19, [x17, #0x58]\n"
+    "add x19, x19, x12\n"
+    "st1 { v21.h }[0], [x20]\n"
+    "ldr x22, [x17, #0x60]\n"
+    "add x22, x22, x12\n"
+    "st1 { v20.h }[0], [x19]\n"
+    "ldr x21, [x17, #0x68]\n"
+    "add x21, x21, x12\n"
+    "st1 { v19.h }[0], [x22]\n"
+    "ldr x20, [x17, #0x70]\n"
+    "add x20, x20, x12\n"
+    "st1 { v18.h }[0], [x21]\n"
+    "ldr x19, [x17, #0x78]\n"
+    "add x19, x19, x12\n"
+    "st1 { v17.h }[0], [x20]\n"
+    "st1 { v16.h }[0], [x19]\n"
+    "71:"  // Oddments: Store: Bit 1: End
+
+    "72:"  // End
+
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..ca367cc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
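+  // The input tile size follows from the geometry above:
+  // input_rows = (output_rows - 1) * stride_rows + kernel_rows = (2 - 1) * 2 + 3 = 5.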
+
+  indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+  a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..32a6fb9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,616 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
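+    // tile_i/tile_j live in the struct, rather than in registers, so the
+    // assembly below can spill and reload the tile counters through
+    // params_struct across the tile loop.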
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
+    "mov x6, #0x0\n"
+    "mov x27, #0x0\n"
+    "1:"  // Tile loop
+    "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x26, #0x4\n"
+    "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x25, #0x2\n"
+    "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x24, %x[params_struct], %[offsetof_args_min]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "add x21, %x[params_struct], %[offsetof_args_max]\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mov x22, #0x0\n"
+    "ldr x17, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x6, x23\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x27, x8, x19\n" // offset += tile_j * ld_input_col
+    "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+    "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x17, x17, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ld1r { v19.8h }, [x24]\n"
+    "add x14, x17, x23, LSL #1\n"
+    "ld1r { v18.8h }, [x21]\n"
+    "add x13, x14, x23, LSL #1\n"
+    "lsl x8, x8, #0x1\n"
+    "add x12, x13, x23, LSL #1\n"
+    "add x11, x12, x23, LSL #1\n"
+    "add x10, x8, x8\n"
+    "add x9, x10, x8\n"
+    "add x28, x9, x8\n"
+    "mul x19, x6, x20\n" // offset = tile_i * ld_output_row
+    "madd x19, x27, x16, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x25\n" // offset *= output_tile_size
+    "add x15, x15, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "add x27, x15, x20, LSL #1\n"
+    "lsl x16, x16, #0x1\n"
+    "mov x21, #0x10\n" // cntb _, ALL, #1
+    "sub x20, XZR, x21\n"
+    "lsr x19, %x[n_channels], #0x3\n"
+    "cbz x19, 4f\n"
+    "ldr q17, [x7, #0x0]\n"
+    "ldr q0, [x7, #0x10]\n"
+    "cmp x21, x19, LSL #4\n"
+    "ldr q1, [x7, #0x20]\n"
+    "ldr q2, [x7, #0x30]\n"
+    "ldr q3, [x7, #0x40]\n"
+    "ldr q4, [x7, #0x50]\n"
+    "ldr q5, [x7, #0x60]\n"
+    "ldr q6, [x7, #0x70]\n"
+    "ldr q7, [x7, #0x80]\n"
+    "ldr q8, [x7, #0x90]\n"
+    "add x7, x7, #0xa0\n"
+    "ldr q9, [x13, x10]\n"
+    "ld1 { v10.8h }, [x17]\n"
+    "ldr q11, [x17, x8]\n"
+    "ldr q12, [x17, x9]\n"
+    "ldr q13, [x17, x28]\n"
+    "ld1 { v14.8h }, [x14]\n"
+    "ldr q15, [x14, x8]\n"
+    "ldr q16, [x17, x10]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "add x20, x20, #0x10\n"
+    "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+    "add x22, x22, #0x10\n"
+    "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+    "add x17, x17, #0x10\n"
+    "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+    "ldr q17, [x7, #0x0]\n"
+    "add x21, x21, #0x10\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ld1 { v10.8h }, [x17]\n"
+    "cmp x21, x19, LSL #4\n"
+    "fmla v30.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x14, x28]\n"
+    "fmla v31.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x14, x9]\n"
+    "fmla v30.8h, v2.8h, v13.8h\n"
+    "ldr q13, [x14, x10]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.8h, v3.8h, v14.8h\n"
+    "ld1 { v14.8h }, [x12]\n"
+    "fmla v30.8h, v0.8h, v16.8h\n"
+    "fmla v31.8h, v4.8h, v15.8h\n"
+    "ld1 { v15.8h }, [x13]\n"
+    "fmla v29.8h, v3.8h, v14.8h\n"
+    "ldr q14, [x12, x28]\n"
+    "fmla v30.8h, v4.8h, v11.8h\n"
+    "ldr q11, [x12, x8]\n"
+    "fmla v31.8h, v2.8h, v16.8h\n"
+    "ldr q16, [x13, x8]\n"
+    "fmla v29.8h, v0.8h, v15.8h\n"
+    "ldr q0, [x7, #0x10]\n"
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "ldr q12, [x13, x9]\n"
+    "fmla v31.8h, v5.8h, v13.8h\n"
+    "fmla v29.8h, v4.8h, v11.8h\n"
+    "ldr q11, [x13, x28]\n"
+    "add x13, x13, #0x10\n"
+    "fmla v30.8h, v3.8h, v13.8h\n"
+    "ldr q13, [x12, x9]\n"
+    "ldr q9, [x13, x10]\n"
+    "fmla v31.8h, v6.8h, v15.8h\n"
+    "ld1 { v15.8h }, [x11]\n"
+    "fmla v29.8h, v1.8h, v16.8h\n"
+    "fmla v28.8h, v4.8h, v13.8h\n"
+    "ldr q13, [x11, x8]\n"
+    "fmla v30.8h, v7.8h, v12.8h\n"
+    "ldr q4, [x7, #0x50]\n"
+    "fmla v31.8h, v7.8h, v16.8h\n"
+    "ldr q16, [x12, x10]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v29.8h, v6.8h, v15.8h\n"
+    "ldr q15, [x11, x10]\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x17, x9]\n"
+    "fmla v30.8h, v8.8h, v11.8h\n"
+    "ldr q1, [x7, #0x20]\n"
+    "fmax v31.8h, v31.8h, v19.8h\n"
+    "fmla v29.8h, v7.8h, v13.8h\n"
+    "ldr q13, [x17, x28]\n"
+    "fmla v28.8h, v5.8h, v14.8h\n"
+    "ldr q14, [x11, x9]\n"
+    "fmax v30.8h, v30.8h, v19.8h\n"
+    "fmin v31.8h, v31.8h, v18.8h\n"
+    "st1 { v31.8h }, [x15]\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v5.8h, v16.8h\n"
+    "ldr q11, [x11, x28]\n"
+    "add x11, x11, #0x10\n"
+    "fmin v30.8h, v30.8h, v18.8h\n"
+    "ldr q2, [x7, #0x30]\n"
+    "ldr q5, [x7, #0x60]\n"
+    "fmla v28.8h, v3.8h, v16.8h\n"
+    "ldr q16, [x17, x10]\n"
+    "fmla v29.8h, v8.8h, v15.8h\n"
+    "str q30, [x15, x16]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v28.8h, v7.8h, v14.8h\n"
+    "ld1 { v14.8h }, [x14]\n"
+    "fmax v29.8h, v29.8h, v19.8h\n"
+    "ldr q3, [x7, #0x40]\n"
+    "ldr q7, [x7, #0x80]\n"
+    "fmin v29.8h, v29.8h, v18.8h\n"
+    "st1 { v29.8h }, [x27]\n"
+    "fmla v28.8h, v6.8h, v15.8h\n"
+    "ldr q15, [x14, x8]\n"
+    "fmla v28.8h, v8.8h, v11.8h\n"
+    "ldr q11, [x17, x8]\n"
+    "ldr q6, [x7, #0x70]\n"
+    "fmax v28.8h, v28.8h, v19.8h\n"
+    "ldr q8, [x7, #0x90]\n"
+    "add x7, x7, #0xa0\n"
+    "fmin v28.8h, v28.8h, v18.8h\n"
+    "str q28, [x27, x16]\n"
+    "add x27, x27, #0x10\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "add x17, x17, #0x10\n"
+    "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+    "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+    "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "fmla v30.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x14, x28]\n"
+    "fmla v31.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x14, x9]\n"
+    "fmla v30.8h, v2.8h, v13.8h\n"
+    "ldr q13, [x14, x10]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.8h, v3.8h, v14.8h\n"
+    "ld1 { v14.8h }, [x12]\n"
+    "fmla v30.8h, v0.8h, v16.8h\n"
+    "fmla v31.8h, v4.8h, v15.8h\n"
+    "ld1 { v15.8h }, [x13]\n"
+    "fmla v30.8h, v4.8h, v11.8h\n"
+    "ldr q11, [x12, x8]\n"
+    "fmla v29.8h, v3.8h, v14.8h\n"
+    "ldr q14, [x12, x28]\n"
+    "fmla v31.8h, v2.8h, v16.8h\n"
+    "ldr q16, [x13, x8]\n"
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "ldr q12, [x13, x9]\n"
+    "fmla v29.8h, v0.8h, v15.8h\n"
+    "fmla v31.8h, v5.8h, v13.8h\n"
+    "fmla v30.8h, v3.8h, v13.8h\n"
+    "ldr q13, [x12, x9]\n"
+    "fmla v29.8h, v4.8h, v11.8h\n"
+    "ldr q11, [x13, x28]\n"
+    "add x13, x13, #0x10\n"
+    "fmla v31.8h, v6.8h, v15.8h\n"
+    "ld1 { v15.8h }, [x11]\n"
+    "fmla v30.8h, v7.8h, v12.8h\n"
+    "fmla v29.8h, v1.8h, v16.8h\n"
+    "fmla v28.8h, v4.8h, v13.8h\n"
+    "ldr q13, [x11, x8]\n"
+    "fmla v31.8h, v7.8h, v16.8h\n"
+    "ldr q16, [x12, x10]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v29.8h, v6.8h, v15.8h\n"
+    "ldr q15, [x11, x10]\n"
+    "fmla v30.8h, v8.8h, v11.8h\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "fmax v31.8h, v31.8h, v19.8h\n"
+    "fmla v29.8h, v7.8h, v13.8h\n"
+    "fmax v30.8h, v30.8h, v19.8h\n"
+    "fmla v28.8h, v5.8h, v14.8h\n"
+    "ldr q14, [x11, x9]\n"
+    "fmin v31.8h, v31.8h, v18.8h\n"
+    "st1 { v31.8h }, [x15]\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v5.8h, v16.8h\n"
+    "ldr q11, [x11, x28]\n"
+    "add x11, x11, #0x10\n"
+    "fmin v30.8h, v30.8h, v18.8h\n"
+    "str q30, [x15, x16]\n"
+    "fmla v28.8h, v3.8h, v16.8h\n"
+    "add x15, x15, #0x10\n"
+    "fmla v29.8h, v8.8h, v15.8h\n"
+    "fmla v28.8h, v7.8h, v14.8h\n"
+    "fmax v29.8h, v29.8h, v19.8h\n"
+    "fmla v28.8h, v6.8h, v15.8h\n"
+    "fmin v29.8h, v29.8h, v18.8h\n"
+    "st1 { v29.8h }, [x27]\n"
+    "fmla v28.8h, v8.8h, v11.8h\n"
+    "fmax v28.8h, v28.8h, v19.8h\n"
+    "fmin v28.8h, v28.8h, v18.8h\n"
+    "str q28, [x27, x16]\n"
+    "add x27, x27, #0x10\n"
+    "4:"  // Tile loop: Oddments
+    "tst %x[n_channels], #0x1\n"
+    "beq 43f\n"
+    "ldr q17, [x7, #0x0]\n"
+    "ldr q0, [x7, #0x10]\n"
+    "add x26, x13, x10\n"
+    "ldr q1, [x7, #0x20]\n"
+    "add x25, x17, XZR\n"
+    "ldr q2, [x7, #0x30]\n"
+    "add x24, x17, x8\n"
+    "ldr q3, [x7, #0x40]\n"
+    "add x23, x17, x9\n"
+    "ldr q4, [x7, #0x50]\n"
+    "add x22, x17, x28\n"
+    "ldr q5, [x7, #0x60]\n"
+    "add x21, x14, XZR\n"
+    "ldr q6, [x7, #0x70]\n"
+    "add x20, x14, x8\n"
+    "ldr q7, [x7, #0x80]\n"
+    "add x19, x17, x10\n"
+    "ldr q8, [x7, #0x90]\n"
+    "tbz %x[n_channels], #1, 5f\n"
+    "ldr s9, [x26], #0x4\n"
+    "ldr s10, [x25], #0x4\n"
+    "ldr s11, [x24], #0x4\n"
+    "ldr s12, [x23], #0x4\n"
+    "ldr s13, [x22], #0x4\n"
+    "ldr s14, [x21], #0x4\n"
+    "ldr s15, [x20], #0x4\n"
+    "ldr s16, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v9.h }[2], [x26]\n"
+    "ld1 { v10.h }[2], [x25]\n"
+    "ld1 { v11.h }[2], [x24]\n"
+    "ld1 { v12.h }[2], [x23]\n"
+    "ld1 { v13.h }[2], [x22]\n"
+    "ld1 { v14.h }[2], [x21]\n"
+    "ld1 { v15.h }[2], [x20]\n"
+    "ld1 { v16.h }[2], [x19]\n"
+    "b 6f\n"
+    "5:"  // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+    "ldr h9, [x26, #0x0]\n"
+    "ldr h10, [x25, #0x0]\n"
+    "ldr h11, [x24, #0x0]\n"
+    "ldr h12, [x23, #0x0]\n"
+    "ldr h13, [x22, #0x0]\n"
+    "ldr h14, [x21, #0x0]\n"
+    "ldr h15, [x20, #0x0]\n"
+    "ldr h16, [x19, #0x0]\n"
+    "6:"  // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+    "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "add x19, x14, x9\n"
+    "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+    "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+    "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "fmla v30.8h, v1.8h, v12.8h\n"
+    "fmla v31.8h, v1.8h, v11.8h\n"
+    "fmla v30.8h, v2.8h, v13.8h\n"
+    "fmla v31.8h, v3.8h, v14.8h\n"
+    "fmla v30.8h, v0.8h, v16.8h\n"
+    "fmla v31.8h, v4.8h, v15.8h\n"
+    "fmla v31.8h, v2.8h, v16.8h\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 8f\n"
+    "7:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "8:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+    "fmla v30.8h, v4.8h, v11.8h\n"
+    "add x19, x14, x28\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 10f\n"
+    "9:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "10:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "add x19, x14, x10\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 12f\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 12f\n"
+    "11:"  // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+    "ldr h13, [x19, #0x0]\n"
+    "12:"  // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+    "fmla v31.8h, v5.8h, v13.8h\n"
+    "add x19, x12, XZR\n"
+    "fmla v30.8h, v3.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr s14, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v14.h }[2], [x19]\n"
+    "b 14f\n"
+    "13:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+    "ldr h14, [x19, #0x0]\n"
+    "14:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+    "fmla v29.8h, v3.8h, v14.8h\n"
+    "add x19, x13, XZR\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ldr s15, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "ld1 { v15.h }[2], [x19]\n"
+    "b 16f\n"
+    "15:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+    "ldr h15, [x19, #0x0]\n"
+    "16:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+    "fmla v31.8h, v6.8h, v15.8h\n"
+    "add x19, x12, x8\n"
+    "fmla v29.8h, v0.8h, v15.8h\n"
+    "tbz %x[n_channels], #1, 17f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 18f\n"
+    "17:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "18:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+    "fmla v29.8h, v4.8h, v11.8h\n"
+    "add x19, x13, x8\n"
+    "tbz %x[n_channels], #1, 19f\n"
+    "ldr s16, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v16.h }[2], [x19]\n"
+    "b 20f\n"
+    "19:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+    "ldr h16, [x19, #0x0]\n"
+    "20:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+    "fmla v31.8h, v7.8h, v16.8h\n"
+    "add x19, x12, x9\n"
+    "fmla v29.8h, v1.8h, v16.8h\n"
+    "tbz %x[n_channels], #1, 21f\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 22f\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 22f\n"
+    "21:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+    "ldr h13, [x19, #0x0]\n"
+    "22:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+    "fmla v28.8h, v4.8h, v13.8h\n"
+    "add x19, x13, x9\n"
+    "tbz %x[n_channels], #1, 23f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 24f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 24f\n"
+    "23:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "24:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+    "fmla v30.8h, v7.8h, v12.8h\n"
+    "add x19, x12, x28\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 25f\n"
+    "ldr s14, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 26f\n"
+    "ld1 { v14.h }[2], [x19]\n"
+    "b 26f\n"
+    "25:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+    "ldr h14, [x19, #0x0]\n"
+    "26:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+    "fmla v28.8h, v5.8h, v14.8h\n"
+    "add x19, x11, XZR\n"
+    "tbz %x[n_channels], #1, 27f\n"
+    "ldr s15, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 28f\n"
+    "ld1 { v15.h }[2], [x19]\n"
+    "b 28f\n"
+    "27:"  // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+    "ldr h15, [x19, #0x0]\n"
+    "28:"  // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+    "fmla v29.8h, v6.8h, v15.8h\n"
+    "add x19, x13, x28\n"
+    "tbz %x[n_channels], #1, 29f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 30f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 30f\n"
+    "29:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "30:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+    "fmla v30.8h, v8.8h, v11.8h\n"
+    "add x19, x11, x8\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 31f\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 32f\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 32f\n"
+    "31:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+    "ldr h13, [x19, #0x0]\n"
+    "32:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+    "fmla v29.8h, v7.8h, v13.8h\n"
+    "add x19, x12, x10\n"
+    "tbz %x[n_channels], #1, 33f\n"
+    "ldr s16, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 34f\n"
+    "ld1 { v16.h }[2], [x19]\n"
+    "b 34f\n"
+    "33:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+    "ldr h16, [x19, #0x0]\n"
+    "34:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+    "fmla v29.8h, v5.8h, v16.8h\n"
+    "add x19, x11, x9\n"
+    "fmla v28.8h, v3.8h, v16.8h\n"
+    "tbz %x[n_channels], #1, 35f\n"
+    "ldr s14, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 36f\n"
+    "ld1 { v14.h }[2], [x19]\n"
+    "b 36f\n"
+    "35:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+    "ldr h14, [x19, #0x0]\n"
+    "36:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+    "fmla v28.8h, v7.8h, v14.8h\n"
+    "add x19, x11, x10\n"
+    "tbz %x[n_channels], #1, 37f\n"
+    "ldr s15, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 38f\n"
+    "ld1 { v15.h }[2], [x19]\n"
+    "b 38f\n"
+    "37:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+    "ldr h15, [x19, #0x0]\n"
+    "38:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+    "fmla v29.8h, v8.8h, v15.8h\n"
+    "add x19, x11, x28\n"
+    "fmla v28.8h, v6.8h, v15.8h\n"
+    "tbz %x[n_channels], #1, 39f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 40f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 40f\n"
+    "39:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "40:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+    "fmla v28.8h, v8.8h, v11.8h\n"
+    "fmax v31.8h, v31.8h, v19.8h\n"
+    "fmax v30.8h, v30.8h, v19.8h\n"
+    "fmax v29.8h, v29.8h, v19.8h\n"
+    "fmin v31.8h, v31.8h, v18.8h\n"
+    "fmin v30.8h, v30.8h, v18.8h\n"
+    "fmin v29.8h, v29.8h, v18.8h\n"
+    "fmax v28.8h, v28.8h, v19.8h\n"
+    "fmin v28.8h, v28.8h, v18.8h\n"
+    "tbz %x[n_channels], #1, 41f\n"
+    "mov x19, x15\n"
+    "st1 { v31.s }[0], [x19], x16\n"
+    "add x15, x15, #0x4\n"
+    "st1 { v30.s }[0], [x19]\n"
+    "mov x19, x27\n"
+    "st1 { v29.s }[0], [x19], x16\n"
+    "add x27, x27, #0x4\n"
+    "st1 { v28.s }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 42f\n"
+    "mov x20, x15\n"
+    "st1 { v31.h }[2], [x20], x16\n"
+    "mov x19, x27\n"
+    "st1 { v30.h }[2], [x20]\n"
+    "st1 { v29.h }[2], [x19], x16\n"
+    "st1 { v28.h }[2], [x19]\n"
+    "b 42f\n"
+    "41:"  // Tile loop: Oddments: Store: Bit 1: Unset
+    "mov x20, x15\n"
+    "st1 { v31.h }[0], [x20], x16\n"
+    "mov x19, x27\n"
+    "st1 { v30.h }[0], [x20]\n"
+    "st1 { v29.h }[0], [x19], x16\n"
+    "st1 { v28.h }[0], [x19]\n"
+    "42:"  // Tile loop: Oddments: Store: Bit 1: End
+
+    "43:"  // Tile loop: End
+    "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x6, #0x1\n"
+    "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x27, x27, #0x1\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x27, x19\n"
+    "csel x27, x27, XZR, LT\n"
+    "csel x6, x6, x21, LT\n"
+    "cmp x6, x20\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..f071e21
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[25];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
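+      // Permute the 25 input pointers (the 5x5 input patch read by a 3x3
+      // stride-2 kernel producing a 2x2 output block) into the order the
+      // assembly below consumes them; inptrs[0] is the patch centre (2, 2),
+      // which contributes to all four output elements.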
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[1];
+      inptrs[3] = input_ptrs[3];
+      inptrs[4] = input_ptrs[4];
+      inptrs[5] = input_ptrs[5];
+      inptrs[6] = input_ptrs[6];
+      inptrs[7] = input_ptrs[2];
+      inptrs[8] = input_ptrs[8];
+      inptrs[9] = input_ptrs[9];
+      inptrs[10] = input_ptrs[7];
+      inptrs[11] = input_ptrs[15];
+      inptrs[12] = input_ptrs[10];
+      inptrs[13] = input_ptrs[16];
+      inptrs[14] = input_ptrs[11];
+      inptrs[15] = input_ptrs[18];
+      inptrs[16] = input_ptrs[13];
+      inptrs[17] = input_ptrs[19];
+      inptrs[18] = input_ptrs[20];
+      inptrs[19] = input_ptrs[14];
+      inptrs[20] = input_ptrs[21];
+      inptrs[21] = input_ptrs[17];
+      inptrs[22] = input_ptrs[23];
+      inptrs[23] = input_ptrs[22];
+      inptrs[24] = input_ptrs[24];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
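+  // Structure of the assembly below: the channel loop processes eight fp16
+  // channels per iteration (one 128-bit q register per tap) while keeping
+  // the next block's inputs in flight; the channel tail runs the final full
+  // block without further block loads; the oddments path loads and stores
+  // any leftover channels lane by lane, driven by tbz tests on the low bits
+  // of n_channels.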
+  __asm__ __volatile__(
+    "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "add x19, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v19.8h }, [x20]\n"
+    "ld1r { v18.8h }, [x19]\n"
+    "mov x14, #0x0\n"
+    "ldp x13, x12, [x21, #0x0]\n"
+    "mov x11, #0x10\n" // cntb _, ALL, #1
+    "ldp x10, x9, [x21, #0x10]\n"
+    "sub x28, XZR, x11\n"
+    "lsr x27, %x[n_channels], #0x3\n"
+    "cbz x27, 3f\n"
+    "ldr q17, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "cmp x11, x27, LSL #4\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    "ldp x22, x21, [x16, #0x20]\n"
+    "ldr q9, [x26, x14]\n"
+    "ldr q10, [x25, x14]\n"
+    "ldr q11, [x24, x14]\n"
+    "ldr q12, [x23, x14]\n"
+    "ldr q13, [x22, x14]\n"
+    "ldr q14, [x21, x14]\n"
+    "ldp x20, x19, [x16, #0x30]\n"
+    "ldr q15, [x20, x14]\n"
+    "ldr q16, [x19, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "ldr x26, [x16, #0x40]\n"
+    "add x28, x28, #0x10\n"
+    "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+    "ldr x25, [x16, #0x48]\n"
+    "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+    "ldr x24, [x16, #0x50]\n"
+    "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+    "ldr x23, [x16, #0x58]\n"
+    "ldr x22, [x16, #0x60]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v30.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x25, x14]\n"
+    "fmla v31.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x26, x14]\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v30.8h, v2.8h, v13.8h\n"
+    "ldr q13, [x24, x14]\n"
+    "fmla v31.8h, v3.8h, v14.8h\n"
+    "ldr q14, [x23, x14]\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v30.8h, v0.8h, v16.8h\n"
+    "ldr x26, [x16, #0x80]\n"
+    "fmla v31.8h, v4.8h, v15.8h\n"
+    "ldr q15, [x22, x14]\n"
+    "fmla v29.8h, v3.8h, v14.8h\n"
+    "ldr x25, [x16, #0x88]\n"
+    "fmla v30.8h, v4.8h, v11.8h\n"
+    "ldr q11, [x21, x14]\n"
+    "ldr x24, [x16, #0x90]\n"
+    "fmla v31.8h, v2.8h, v16.8h\n"
+    "ldr q16, [x20, x14]\n"
+    "fmla v29.8h, v0.8h, v15.8h\n"
+    "ldr q14, [x25, x14]\n"
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "ldr q12, [x26, x14]\n"
+    "ldr x23, [x16, #0x98]\n"
+    "fmla v31.8h, v5.8h, v13.8h\n"
+    "ldr x22, [x16, #0xa0]\n"
+    "fmla v29.8h, v4.8h, v11.8h\n"
+    "ldr q11, [x23, x14]\n"
+    "fmla v30.8h, v3.8h, v13.8h\n"
+    "ldr q13, [x19, x14]\n"
+    "ldr x21, [x16, #0xa8]\n"
+    "fmla v31.8h, v6.8h, v15.8h\n"
+    "ldr q15, [x24, x14]\n"
+    "fmla v29.8h, v1.8h, v16.8h\n"
+    "ldr x20, [x16, #0xb0]\n"
+    "fmla v30.8h, v7.8h, v12.8h\n"
+    "ldr x19, [x16, #0xb8]\n"
+    "fmla v28.8h, v4.8h, v13.8h\n"
+    "ldr q13, [x22, x14]\n"
+    "ldr x26, [x16, #0xc0]\n"
+    "fmla v31.8h, v7.8h, v16.8h\n"
+    "fmla v29.8h, v6.8h, v15.8h\n"
+    "ldr q16, [x21, x14]\n"
+    "fmla v30.8h, v8.8h, v11.8h\n"
+    "ldr q15, [x19, x14]\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "ldr q17, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "fmla v29.8h, v7.8h, v13.8h\n"
+    "fmax v31.8h, v31.8h, v19.8h\n"
+    "ldr q1, [x15, #0x20]\n"
+    "fmax v30.8h, v30.8h, v19.8h\n"
+    "ldr q4, [x15, #0x50]\n"
+    "fmla v28.8h, v5.8h, v14.8h\n"
+    "ldr q14, [x20, x14]\n"
+    "fmin v31.8h, v31.8h, v18.8h\n"
+    "str q31, [x13, x28]\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v5.8h, v16.8h\n"
+    "ldr q11, [x26, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmin v30.8h, v30.8h, v18.8h\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    "fmla v28.8h, v3.8h, v16.8h\n"
+    "ldp x22, x21, [x16, #0x20]\n"
+    "fmla v29.8h, v8.8h, v15.8h\n"
+    "ldr q9, [x26, x11]\n"
+    "ldr q10, [x25, x11]\n"
+    "fmla v28.8h, v7.8h, v14.8h\n"
+    "ldr q12, [x23, x11]\n"
+    "fmax v29.8h, v29.8h, v19.8h\n"
+    "ldr q13, [x22, x11]\n"
+    "ldr q14, [x21, x11]\n"
+    "fmin v29.8h, v29.8h, v18.8h\n"
+    "ldp x20, x19, [x16, #0x30]\n"
+    "str q30, [x12, x28]\n"
+    "fmla v28.8h, v6.8h, v15.8h\n"
+    "ldr q2, [x15, #0x30]\n"
+    "fmla v28.8h, v8.8h, v11.8h\n"
+    "ldr q11, [x24, x11]\n"
+    "ldr q15, [x20, x11]\n"
+    "fmax v28.8h, v28.8h, v19.8h\n"
+    "ldr q16, [x19, x11]\n"
+    "add x11, x11, #0x10\n"
+    "fmin v28.8h, v28.8h, v18.8h\n"
+    "str q29, [x10, x28]\n"
+    "cmp x11, x27, LSL #4\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "str q28, [x9, x28]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "ldr x26, [x16, #0x40]\n"
+    "add x28, x28, #0x10\n"
+    "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+    "ldr x25, [x16, #0x48]\n"
+    "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+    "ldr x24, [x16, #0x50]\n"
+    "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+    "ldr x23, [x16, #0x58]\n"
+    "ldr x22, [x16, #0x60]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v30.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x25, x14]\n"
+    "fmla v31.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x26, x14]\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v30.8h, v2.8h, v13.8h\n"
+    "ldr q13, [x24, x14]\n"
+    "fmla v31.8h, v3.8h, v14.8h\n"
+    "ldr q14, [x23, x14]\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v30.8h, v0.8h, v16.8h\n"
+    "ldr x26, [x16, #0x80]\n"
+    "fmla v31.8h, v4.8h, v15.8h\n"
+    "ldr q15, [x22, x14]\n"
+    "fmla v29.8h, v3.8h, v14.8h\n"
+    "ldr x25, [x16, #0x88]\n"
+    "fmla v30.8h, v4.8h, v11.8h\n"
+    "ldr q11, [x21, x14]\n"
+    "ldr x24, [x16, #0x90]\n"
+    "fmla v31.8h, v2.8h, v16.8h\n"
+    "ldr q16, [x20, x14]\n"
+    "fmla v29.8h, v0.8h, v15.8h\n"
+    "ldr q14, [x25, x14]\n"
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "ldr q12, [x26, x14]\n"
+    "ldr x23, [x16, #0x98]\n"
+    "fmla v31.8h, v5.8h, v13.8h\n"
+    "ldr x22, [x16, #0xa0]\n"
+    "fmla v29.8h, v4.8h, v11.8h\n"
+    "ldr q11, [x23, x14]\n"
+    "fmla v30.8h, v3.8h, v13.8h\n"
+    "ldr q13, [x19, x14]\n"
+    "ldr x21, [x16, #0xa8]\n"
+    "fmla v31.8h, v6.8h, v15.8h\n"
+    "ldr q15, [x24, x14]\n"
+    "fmla v29.8h, v1.8h, v16.8h\n"
+    "ldr x20, [x16, #0xb0]\n"
+    "fmla v30.8h, v7.8h, v12.8h\n"
+    "ldr x19, [x16, #0xb8]\n"
+    "fmla v28.8h, v4.8h, v13.8h\n"
+    "ldr q13, [x22, x14]\n"
+    "ldr x26, [x16, #0xc0]\n"
+    "fmla v31.8h, v7.8h, v16.8h\n"
+    "fmla v29.8h, v6.8h, v15.8h\n"
+    "ldr q16, [x21, x14]\n"
+    "fmla v30.8h, v8.8h, v11.8h\n"
+    "ldr q15, [x19, x14]\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "fmla v29.8h, v7.8h, v13.8h\n"
+    "fmax v31.8h, v31.8h, v19.8h\n"
+    "fmax v30.8h, v30.8h, v19.8h\n"
+    "fmla v28.8h, v5.8h, v14.8h\n"
+    "ldr q14, [x20, x14]\n"
+    "fmin v31.8h, v31.8h, v18.8h\n"
+    "str q31, [x13, x28]\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v5.8h, v16.8h\n"
+    "ldr q11, [x26, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmin v30.8h, v30.8h, v18.8h\n"
+    "str q30, [x12, x28]\n"
+    "fmla v28.8h, v3.8h, v16.8h\n"
+    "fmla v29.8h, v8.8h, v15.8h\n"
+    "fmla v28.8h, v7.8h, v14.8h\n"
+    "fmax v29.8h, v29.8h, v19.8h\n"
+    "fmin v29.8h, v29.8h, v18.8h\n"
+    "str q29, [x10, x28]\n"
+    "fmla v28.8h, v6.8h, v15.8h\n"
+    "fmla v28.8h, v8.8h, v11.8h\n"
+    "fmax v28.8h, v28.8h, v19.8h\n"
+    "fmin v28.8h, v28.8h, v18.8h\n"
+    "str q28, [x9, x28]\n"
+    "3:"  // Oddments
+    "tst %x[n_channels], #0x1\n"
+    "beq 42f\n"
+    "ldr q17, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "mov x28, x14\n"
+    "ldr q1, [x15, #0x20]\n"
+    "add x13, x13, x28\n"
+    "ldr q2, [x15, #0x30]\n"
+    "add x12, x12, x28\n"
+    "ldr q3, [x15, #0x40]\n"
+    "add x10, x10, x28\n"
+    "ldr q4, [x15, #0x50]\n"
+    "add x9, x9, x28\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "ldr x26, [x16, #0x0]\n"
+    "ldr x25, [x16, #0x8]\n"
+    "ldr x24, [x16, #0x10]\n"
+    "add x26, x26, x14\n"
+    "ldr x23, [x16, #0x18]\n"
+    "add x25, x25, x14\n"
+    "ldr x22, [x16, #0x20]\n"
+    "add x24, x24, x14\n"
+    "ldr x21, [x16, #0x28]\n"
+    "add x23, x23, x14\n"
+    "ldr x20, [x16, #0x30]\n"
+    "add x22, x22, x14\n"
+    "ldr x19, [x16, #0x38]\n"
+    "add x21, x21, x14\n"
+    "add x20, x20, x14\n"
+    "add x19, x19, x14\n"
+    "tbz %x[n_channels], #1, 4f\n"
+    "ld1 { v9.s }[0], [x26], #0x4\n"
+    "ld1 { v10.s }[0], [x25], #0x4\n"
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "ld1 { v12.s }[0], [x23], #0x4\n"
+    "ld1 { v13.s }[0], [x22], #0x4\n"
+    "ld1 { v14.s }[0], [x21], #0x4\n"
+    "ld1 { v15.s }[0], [x20], #0x4\n"
+    "ld1 { v16.s }[0], [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 5f\n"
+    "ld1 { v9.h }[2], [x26], #0x2\n"
+    "ld1 { v10.h }[2], [x25], #0x2\n"
+    "ld1 { v11.h }[2], [x24], #0x2\n"
+    "ld1 { v12.h }[2], [x23], #0x2\n"
+    "ld1 { v13.h }[2], [x22], #0x2\n"
+    "ld1 { v14.h }[2], [x21], #0x2\n"
+    "ld1 { v15.h }[2], [x20], #0x2\n"
+    "ld1 { v16.h }[2], [x19], #0x2\n"
+    "b 5f\n"
+    "4:"  // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+    "ld1 { v9.h }[0], [x26], #0x2\n"
+    "ld1 { v10.h }[0], [x25], #0x2\n"
+    "ld1 { v11.h }[0], [x24], #0x2\n"
+    "ld1 { v12.h }[0], [x23], #0x2\n"
+    "ld1 { v13.h }[0], [x22], #0x2\n"
+    "ld1 { v14.h }[0], [x21], #0x2\n"
+    "ld1 { v15.h }[0], [x20], #0x2\n"
+    "ld1 { v16.h }[0], [x19], #0x2\n"
+    "5:"  // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+    "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "ldr x26, [x16, #0x40]\n"
+    "add x26, x26, x14\n"
+    "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+    "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+    "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "fmla v30.8h, v1.8h, v12.8h\n"
+    "fmla v31.8h, v1.8h, v11.8h\n"
+    "fmla v30.8h, v2.8h, v13.8h\n"
+    "fmla v31.8h, v3.8h, v14.8h\n"
+    "fmla v30.8h, v0.8h, v16.8h\n"
+    "fmla v31.8h, v4.8h, v15.8h\n"
+    "fmla v31.8h, v2.8h, v16.8h\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v11.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v11.h }[2], [x26], #0x2\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load input (1, 3): Bit 1: Unset
+    "ld1 { v11.h }[0], [x26], #0x2\n"
+    "7:"  // Oddments: Load input (1, 3): Bit 1: End
+    "fmla v30.8h, v4.8h, v11.8h\n"
+    "ldr x25, [x16, #0x48]\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v12.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v12.h }[2], [x25], #0x2\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load input (1, 4): Bit 1: Unset
+    "ld1 { v12.h }[0], [x25], #0x2\n"
+    "9:"  // Oddments: Load input (1, 4): Bit 1: End
+    "fmla v30.8h, v5.8h, v12.8h\n"
+    "ldr x24, [x16, #0x50]\n"
+    "add x24, x24, x14\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v13.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v13.h }[2], [x24], #0x2\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load input (1, 2): Bit 1: Unset
+    "ld1 { v13.h }[0], [x24], #0x2\n"
+    "11:"  // Oddments: Load input (1, 2): Bit 1: End
+    "fmla v31.8h, v5.8h, v13.8h\n"
+    "ldr x23, [x16, #0x58]\n"
+    "fmla v30.8h, v3.8h, v13.8h\n"
+    "add x23, x23, x14\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ld1 { v14.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v14.h }[2], [x23], #0x2\n"
+    "b 13f\n"
+    "12:"  // Oddments: Load input (3, 0): Bit 1: Unset
+    "ld1 { v14.h }[0], [x23], #0x2\n"
+    "13:"  // Oddments: Load input (3, 0): Bit 1: End
+    "fmla v29.8h, v3.8h, v14.8h\n"
+    "ldr x22, [x16, #0x60]\n"
+    "add x22, x22, x14\n"
+    "tbz %x[n_channels], #1, 14f\n"
+    "ld1 { v15.s }[0], [x22], #0x4\n"
+    "tbz %x[n_channels], #0, 15f\n"
+    "ld1 { v15.h }[2], [x22], #0x2\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load input (2, 0): Bit 1: Unset
+    "ld1 { v15.h }[0], [x22], #0x2\n"
+    "15:"  // Oddments: Load input (2, 0): Bit 1: End
+    "fmla v31.8h, v6.8h, v15.8h\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v29.8h, v0.8h, v15.8h\n"
+    "add x21, x21, x14\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v11.s }[0], [x21], #0x4\n"
+    "tbz %x[n_channels], #0, 17f\n"
+    "ld1 { v11.h }[2], [x21], #0x2\n"
+    "b 17f\n"
+    "16:"  // Oddments: Load input (3, 1): Bit 1: Unset
+    "ld1 { v11.h }[0], [x21], #0x2\n"
+    "17:"  // Oddments: Load input (3, 1): Bit 1: End
+    "fmla v29.8h, v4.8h, v11.8h\n"
+    "ldr x20, [x16, #0x70]\n"
+    "add x20, x20, x14\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v16.s }[0], [x20], #0x4\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v16.h }[2], [x20], #0x2\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load input (2, 1): Bit 1: Unset
+    "ld1 { v16.h }[0], [x20], #0x2\n"
+    "19:"  // Oddments: Load input (2, 1): Bit 1: End
+    "fmla v31.8h, v7.8h, v16.8h\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v29.8h, v1.8h, v16.8h\n"
+    "add x19, x19, x14\n"
+    "tbz %x[n_channels], #1, 20f\n"
+    "ld1 { v13.s }[0], [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 21f\n"
+    "ld1 { v13.h }[2], [x19], #0x2\n"
+    "b 21f\n"
+    "20:"  // Oddments: Load input (3, 3): Bit 1: Unset
+    "ld1 { v13.h }[0], [x19], #0x2\n"
+    "21:"  // Oddments: Load input (3, 3): Bit 1: End
+    "fmla v28.8h, v4.8h, v13.8h\n"
+    "ldr x26, [x16, #0x80]\n"
+    "add x26, x26, x14\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "ld1 { v12.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "ld1 { v12.h }[2], [x26], #0x2\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load input (2, 3): Bit 1: Unset
+    "ld1 { v12.h }[0], [x26], #0x2\n"
+    "23:"  // Oddments: Load input (2, 3): Bit 1: End
+    "fmla v30.8h, v7.8h, v12.8h\n"
+    "ldr x25, [x16, #0x88]\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 24f\n"
+    "ld1 { v14.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 25f\n"
+    "ld1 { v14.h }[2], [x25], #0x2\n"
+    "b 25f\n"
+    "24:"  // Oddments: Load input (3, 4): Bit 1: Unset
+    "ld1 { v14.h }[0], [x25], #0x2\n"
+    "25:"  // Oddments: Load input (3, 4): Bit 1: End
+    "fmla v28.8h, v5.8h, v14.8h\n"
+    "ldr x24, [x16, #0x90]\n"
+    "add x24, x24, x14\n"
+    "tbz %x[n_channels], #1, 26f\n"
+    "ld1 { v15.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 27f\n"
+    "ld1 { v15.h }[2], [x24], #0x2\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load input (4, 0): Bit 1: Unset
+    "ld1 { v15.h }[0], [x24], #0x2\n"
+    "27:"  // Oddments: Load input (4, 0): Bit 1: End
+    "fmla v29.8h, v6.8h, v15.8h\n"
+    "ldr x23, [x16, #0x98]\n"
+    "add x23, x23, x14\n"
+    "tbz %x[n_channels], #1, 28f\n"
+    "ld1 { v11.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 29f\n"
+    "ld1 { v11.h }[2], [x23], #0x2\n"
+    "b 29f\n"
+    "28:"  // Oddments: Load input (2, 4): Bit 1: Unset
+    "ld1 { v11.h }[0], [x23], #0x2\n"
+    "29:"  // Oddments: Load input (2, 4): Bit 1: End
+    "fmla v30.8h, v8.8h, v11.8h\n"
+    "ldr x22, [x16, #0xa0]\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "add x22, x22, x14\n"
+    "tbz %x[n_channels], #1, 30f\n"
+    "ld1 { v13.s }[0], [x22], #0x4\n"
+    "tbz %x[n_channels], #0, 31f\n"
+    "ld1 { v13.h }[2], [x22], #0x2\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load input (4, 1): Bit 1: Unset
+    "ld1 { v13.h }[0], [x22], #0x2\n"
+    "31:"  // Oddments: Load input (4, 1): Bit 1: End
+    "fmla v29.8h, v7.8h, v13.8h\n"
+    "ldr x21, [x16, #0xa8]\n"
+    "add x21, x21, x14\n"
+    "tbz %x[n_channels], #1, 32f\n"
+    "ld1 { v16.s }[0], [x21], #0x4\n"
+    "tbz %x[n_channels], #0, 33f\n"
+    "ld1 { v16.h }[2], [x21], #0x2\n"
+    "b 33f\n"
+    "32:"  // Oddments: Load input (3, 2): Bit 1: Unset
+    "ld1 { v16.h }[0], [x21], #0x2\n"
+    "33:"  // Oddments: Load input (3, 2): Bit 1: End
+    "fmla v29.8h, v5.8h, v16.8h\n"
+    "ldr x20, [x16, #0xb0]\n"
+    "fmla v28.8h, v3.8h, v16.8h\n"
+    "add x20, x20, x14\n"
+    "tbz %x[n_channels], #1, 34f\n"
+    "ld1 { v14.s }[0], [x20], #0x4\n"
+    "tbz %x[n_channels], #0, 35f\n"
+    "ld1 { v14.h }[2], [x20], #0x2\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load input (4, 3): Bit 1: Unset
+    "ld1 { v14.h }[0], [x20], #0x2\n"
+    "35:"  // Oddments: Load input (4, 3): Bit 1: End
+    "fmla v28.8h, v7.8h, v14.8h\n"
+    "ldr x19, [x16, #0xb8]\n"
+    "add x19, x19, x14\n"
+    "tbz %x[n_channels], #1, 36f\n"
+    "ld1 { v15.s }[0], [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 37f\n"
+    "ld1 { v15.h }[2], [x19], #0x2\n"
+    "b 37f\n"
+    "36:"  // Oddments: Load input (4, 2): Bit 1: Unset
+    "ld1 { v15.h }[0], [x19], #0x2\n"
+    "37:"  // Oddments: Load input (4, 2): Bit 1: End
+    "fmla v29.8h, v8.8h, v15.8h\n"
+    "ldr x26, [x16, #0xc0]\n"
+    "fmla v28.8h, v6.8h, v15.8h\n"
+    "add x26, x26, x14\n"
+    "tbz %x[n_channels], #1, 38f\n"
+    "ld1 { v11.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 39f\n"
+    "ld1 { v11.h }[2], [x26], #0x2\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load input (4, 4): Bit 1: Unset
+    "ld1 { v11.h }[0], [x26], #0x2\n"
+    "39:"  // Oddments: Load input (4, 4): Bit 1: End
+    "fmla v28.8h, v8.8h, v11.8h\n"
+    "fmax v31.8h, v31.8h, v19.8h\n"
+    "fmax v30.8h, v30.8h, v19.8h\n"
+    "fmax v29.8h, v29.8h, v19.8h\n"
+    "fmin v31.8h, v31.8h, v18.8h\n"
+    "fmin v30.8h, v30.8h, v18.8h\n"
+    "fmin v29.8h, v29.8h, v18.8h\n"
+    "fmax v28.8h, v28.8h, v19.8h\n"
+    "fmin v28.8h, v28.8h, v18.8h\n"
+    "tbz %x[n_channels], #1, 40f\n"
+    "st1 { v31.s }[0], [x13], #0x4\n"
+    "st1 { v30.s }[0], [x12], #0x4\n"
+    "st1 { v29.s }[0], [x10], #0x4\n"
+    "st1 { v28.s }[0], [x9], #0x4\n"
+    "tbz %x[n_channels], #0, 41f\n"
+    "st1 { v31.h }[2], [x13], #0x2\n"
+    "st1 { v30.h }[2], [x12], #0x2\n"
+    "st1 { v29.h }[2], [x10], #0x2\n"
+    "st1 { v28.h }[2], [x9], #0x2\n"
+    "b 41f\n"
+    "40:"  // Oddments: Store: Bit 1: Unset
+    "st1 { v31.h }[0], [x13], #0x2\n"
+    "st1 { v30.h }[0], [x12], #0x2\n"
+    "st1 { v29.h }[0], [x10], #0x2\n"
+    "st1 { v28.h }[0], [x9], #0x2\n"
+    "41:"  // Oddments: Store: Bit 1: End
+
+    "42:"  // End
+
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
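
For orientation, not part of the generated sources: a minimal sketch of how an indirect kernel such as the one above can be driven for a single 2x2 output block, assuming a parameter blob already produced by the library's weight/bias packing step (its layout is opaque at this level) and strides expressed in elements. Every name other than the kernel entry point is hypothetical.

    #include <cstddef>

    // Signature as defined in generic_indirect.cpp above.
    namespace arm_conv { namespace depthwise {
    void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
      const __fp16 *const *, __fp16 *const *, const void *,
      unsigned int, const __fp16, const __fp16);
    } }

    // Hypothetical harness: gather the 5x5 patch of per-element input
    // pointers consumed by the 3x3 stride-2 kernel, and the 2x2 block of
    // output pointers, then invoke the kernel over all channels.
    void run_block(const __fp16 *in, __fp16 *out, unsigned int n_channels,
                   std::size_t ld_in_row, std::size_t ld_in_col,
                   std::size_t ld_out_row, std::size_t ld_out_col,
                   const void *packed_params)
    {
      const __fp16 *inptrs[25];  // row-major 5x5 patch
      for (int i = 0; i < 5; i++)
        for (int j = 0; j < 5; j++)
          inptrs[i * 5 + j] = in + i * ld_in_row + j * ld_in_col;

      __fp16 *outptrs[4];        // row-major 2x2 output block
      for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++)
          outptrs[i * 2 + j] = out + i * ld_out_row + j * ld_out_col;

      // No activation here: pass the widest finite fp16 bounds.
      arm_conv::depthwise::a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
        inptrs, outptrs, packed_params, n_channels,
        (__fp16)-65504.0f, (__fp16)65504.0f);
    }
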
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..53d2a3a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
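+
+// The indirect variant takes per-element input/output pointer arrays, so the
+// caller can redirect out-of-range elements (e.g. for padding); the direct
+// variant walks a dense grid of tiles using the row/column strides.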
+
+struct a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
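+  // Input patch per tile: (output_rows - 1) * stride_rows + kernel_rows
+  // = (2 - 1) * 1 + 5 = 6, and likewise for columns.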
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  indirect_kern_type indirect_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+  a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..ec5f97a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
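+  // Structure of the assembly below: an outer loop walks the 2x2 output
+  // tiles, with tile_i and tile_j kept in the Args struct across iterations,
+  // and derives input/output base pointers from the row/column strides;
+  // within each tile the channel loop, channel tail and oddments paths
+  // mirror the indirect variant.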
+  __asm__ __volatile__(
+    "mov x28, #0x0\n"
+    "mov x27, #0x0\n"
+    "1:"  // Tile loop
+    "str x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x26, #0x2\n"
+    "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x25, #0x2\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x24, %x[params_struct], %[offsetof_args_min]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "add x21, %x[params_struct], %[offsetof_args_max]\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mov x22, #0x0\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x28, x23\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x27, x4, x19\n" // offset += tile_j * ld_input_col
+    "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+    "ldr x7, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x5, x5, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ld1r { v18.8h }, [x24]\n"
+    "add x8, x5, x23, LSL #1\n"
+    "ld1r { v17.8h }, [x21]\n"
+    "add x17, x8, x23, LSL #1\n"
+    "lsl x4, x4, #0x1\n"
+    "add x16, x17, x23, LSL #1\n"
+    "add x15, x16, x23, LSL #1\n"
+    "add x14, x15, x23, LSL #1\n"
+    "add x13, x4, x4\n"
+    "add x12, x13, x4\n"
+    "add x11, x12, x4\n"
+    "add x10, x11, x4\n"
+    "mul x19, x28, x20\n" // offset = tile_i * ld_output_row
+    "madd x19, x27, x6, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x25\n" // offset *= output_tile_size
+    "add x7, x7, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "add x9, x7, x20, LSL #1\n"
+    "lsl x6, x6, #0x1\n"
+    "mov x21, #0x10\n" // cntb _, ALL, #1
+    "sub x20, XZR, x21\n"
+    "lsr x19, %x[n_channels], #0x3\n"
+    "cbz x19, 4f\n"
+    "ldr q16, [x3, #0x0]\n"
+    "ldr q0, [x3, #0x10]\n"
+    "cmp x21, x19, LSL #4\n"
+    "ldr q1, [x3, #0x20]\n"
+    "ldr q2, [x3, #0x30]\n"
+    "ldr q3, [x3, #0x40]\n"
+    "ldr q4, [x3, #0x50]\n"
+    "add x3, x3, #0x60\n"
+    "ld1 { v5.8h }, [x5]\n"
+    "ldr q6, [x5, x4]\n"
+    "ld1 { v7.8h }, [x8]\n"
+    "ldr q8, [x8, x4]\n"
+    "ldr q9, [x5, x13]\n"
+    "ldr q13, [x8, x13]\n"
+    "ldr q11, [x5, x12]\n"
+    "ldr q12, [x5, x11]\n"
+    "ldr q10, [x8, x10]\n"
+    "ld1 { v14.8h }, [x17]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+    "ldr q5, [x8, x12]\n"
+    "add x20, x20, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+    "add x22, x22, #0x10\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+    "add x21, x21, #0x10\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+    "ldr q0, [x3, #0x0]\n"
+    "cmp x21, x19, LSL #4\n"
+    "fmla v31.8h, v1.8h, v6.8h\n"
+    "ldr q6, [x8, x11]\n"
+    "add x8, x8, #0x10\n"
+    "fmla v30.8h, v1.8h, v9.8h\n"
+    "ldr q16, [x3, #0x140]\n"
+    "fmla v29.8h, v1.8h, v8.8h\n"
+    "fmla v28.8h, v1.8h, v13.8h\n"
+    "ldr q1, [x3, #0x10]\n"
+    "fmla v31.8h, v2.8h, v9.8h\n"
+    "ldr q9, [x5, x10]\n"
+    "add x5, x5, #0x10\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v13.8h\n"
+    "fmla v28.8h, v2.8h, v5.8h\n"
+    "ldr q2, [x3, #0x20]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x17, x4]\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v29.8h, v3.8h, v5.8h\n"
+    "fmla v28.8h, v3.8h, v6.8h\n"
+    "ldr q3, [x3, #0x30]\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "ldr q12, [x17, x13]\n"
+    "fmla v30.8h, v4.8h, v9.8h\n"
+    "ldr q9, [x17, x12]\n"
+    "fmla v29.8h, v4.8h, v6.8h\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "ldr q4, [x3, #0x40]\n"
+    "fmla v31.8h, v0.8h, v7.8h\n"
+    "ld1 { v7.8h }, [x8]\n"
+    "fmla v30.8h, v0.8h, v8.8h\n"
+    "fmla v29.8h, v0.8h, v14.8h\n"
+    "fmla v28.8h, v0.8h, v11.8h\n"
+    "ldr q0, [x3, #0x50]\n"
+    "fmla v31.8h, v1.8h, v8.8h\n"
+    "ldr q8, [x17, x10]\n"
+    "fmla v30.8h, v1.8h, v13.8h\n"
+    "fmla v29.8h, v1.8h, v11.8h\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "ldr q1, [x3, #0x60]\n"
+    "fmla v31.8h, v2.8h, v13.8h\n"
+    "ldr q13, [x17, x11]\n"
+    "add x17, x17, #0x10\n"
+    "fmla v30.8h, v2.8h, v5.8h\n"
+    "fmla v29.8h, v2.8h, v12.8h\n"
+    "fmla v28.8h, v2.8h, v9.8h\n"
+    "ldr q2, [x3, #0x70]\n"
+    "fmla v31.8h, v3.8h, v5.8h\n"
+    "ld1 { v5.8h }, [x16]\n"
+    "fmla v30.8h, v3.8h, v6.8h\n"
+    "fmla v29.8h, v3.8h, v9.8h\n"
+    "fmla v28.8h, v3.8h, v13.8h\n"
+    "ldr q3, [x3, #0x80]\n"
+    "fmla v31.8h, v4.8h, v6.8h\n"
+    "ldr q6, [x16, x4]\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "ldr q10, [x16, x13]\n"
+    "fmla v29.8h, v4.8h, v13.8h\n"
+    "fmla v28.8h, v4.8h, v8.8h\n"
+    "ldr q4, [x3, #0x90]\n"
+    "fmla v31.8h, v0.8h, v14.8h\n"
+    "ldr q14, [x16, x10]\n"
+    "fmla v30.8h, v0.8h, v11.8h\n"
+    "fmla v29.8h, v0.8h, v5.8h\n"
+    "fmla v28.8h, v0.8h, v6.8h\n"
+    "ldr q0, [x3, #0xa0]\n"
+    "fmla v31.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x16, x12]\n"
+    "fmla v30.8h, v1.8h, v12.8h\n"
+    "fmla v29.8h, v1.8h, v6.8h\n"
+    "fmla v28.8h, v1.8h, v10.8h\n"
+    "ldr q1, [x3, #0xb0]\n"
+    "fmla v31.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x16, x11]\n"
+    "add x16, x16, #0x10\n"
+    "fmla v30.8h, v2.8h, v9.8h\n"
+    "fmla v29.8h, v2.8h, v10.8h\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr q2, [x3, #0xc0]\n"
+    "fmla v31.8h, v3.8h, v9.8h\n"
+    "ld1 { v9.8h }, [x15]\n"
+    "fmla v30.8h, v3.8h, v13.8h\n"
+    "fmla v29.8h, v3.8h, v11.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "ldr q3, [x3, #0xd0]\n"
+    "fmla v31.8h, v4.8h, v13.8h\n"
+    "ldr q13, [x15, x4]\n"
+    "fmla v30.8h, v4.8h, v8.8h\n"
+    "ldr q8, [x15, x11]\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "fmla v28.8h, v4.8h, v14.8h\n"
+    "ldr q4, [x3, #0xe0]\n"
+    "fmla v31.8h, v0.8h, v5.8h\n"
+    "ldr q5, [x15, x13]\n"
+    "fmla v30.8h, v0.8h, v6.8h\n"
+    "fmla v29.8h, v0.8h, v9.8h\n"
+    "fmla v28.8h, v0.8h, v13.8h\n"
+    "ldr q0, [x3, #0xf0]\n"
+    "fmla v31.8h, v1.8h, v6.8h\n"
+    "ldr q6, [x15, x12]\n"
+    "fmla v30.8h, v1.8h, v10.8h\n"
+    "fmla v29.8h, v1.8h, v13.8h\n"
+    "fmla v28.8h, v1.8h, v5.8h\n"
+    "ldr q1, [x3, #0x100]\n"
+    "fmla v31.8h, v2.8h, v10.8h\n"
+    "ldr q10, [x15, x10]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v5.8h\n"
+    "fmla v28.8h, v2.8h, v6.8h\n"
+    "ldr q2, [x3, #0x110]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ld1 { v11.8h }, [x14]\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v29.8h, v3.8h, v6.8h\n"
+    "fmla v28.8h, v3.8h, v8.8h\n"
+    "ldr q3, [x3, #0x120]\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "ldr q12, [x14, x4]\n"
+    "fmla v30.8h, v4.8h, v14.8h\n"
+    "ld1 { v14.8h }, [x17]\n"
+    "fmla v29.8h, v4.8h, v8.8h\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "ldr q4, [x3, #0x130]\n"
+    "fmla v31.8h, v0.8h, v9.8h\n"
+    "ldr q9, [x14, x13]\n"
+    "fmla v30.8h, v0.8h, v13.8h\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x14, x12]\n"
+    "fmla v28.8h, v0.8h, v12.8h\n"
+    "ldr q0, [x3, #0x150]\n"
+    "fmla v31.8h, v1.8h, v13.8h\n"
+    "ldr q13, [x8, x13]\n"
+    "fmla v30.8h, v1.8h, v5.8h\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x14, x11]\n"
+    "fmla v28.8h, v1.8h, v9.8h\n"
+    "ldr q1, [x3, #0x160]\n"
+    "fmla v31.8h, v2.8h, v5.8h\n"
+    "ld1 { v5.8h }, [x5]\n"
+    "fmla v30.8h, v2.8h, v6.8h\n"
+    "fmla v29.8h, v2.8h, v9.8h\n"
+    "ldr q9, [x14, x10]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr q2, [x3, #0x170]\n"
+    "fmla v31.8h, v3.8h, v6.8h\n"
+    "ldr q6, [x5, x4]\n"
+    "fmla v30.8h, v3.8h, v8.8h\n"
+    "fmla v29.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x5, x12]\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "ldr q3, [x3, #0x180]\n"
+    "fmla v31.8h, v4.8h, v8.8h\n"
+    "ldr q8, [x8, x4]\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "ldr q10, [x8, x10]\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "ldr q12, [x5, x11]\n"
+    "fmla v28.8h, v4.8h, v9.8h\n"
+    "ldr q9, [x5, x13]\n"
+    "ldr q4, [x3, #0x190]\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "add x3, x3, #0x1a0\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "st1 { v31.8h }, [x7]\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "str q30, [x7, x6]\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "add x7, x7, #0x10\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "st1 { v29.8h }, [x9]\n"
+    "str q28, [x9, x6]\n"
+    "add x9, x9, #0x10\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+    "ldr q5, [x8, x12]\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+    "ldr q0, [x3, #0x0]\n"
+    "fmla v31.8h, v1.8h, v6.8h\n"
+    "ldr q6, [x8, x11]\n"
+    "add x8, x8, #0x10\n"
+    "fmla v30.8h, v1.8h, v9.8h\n"
+    "fmla v29.8h, v1.8h, v8.8h\n"
+    "fmla v28.8h, v1.8h, v13.8h\n"
+    "ldr q1, [x3, #0x10]\n"
+    "fmla v31.8h, v2.8h, v9.8h\n"
+    "ldr q9, [x5, x10]\n"
+    "add x5, x5, #0x10\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v13.8h\n"
+    "fmla v28.8h, v2.8h, v5.8h\n"
+    "ldr q2, [x3, #0x20]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x17, x4]\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v29.8h, v3.8h, v5.8h\n"
+    "fmla v28.8h, v3.8h, v6.8h\n"
+    "ldr q3, [x3, #0x30]\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "ldr q12, [x17, x13]\n"
+    "fmla v30.8h, v4.8h, v9.8h\n"
+    "ldr q9, [x17, x12]\n"
+    "fmla v29.8h, v4.8h, v6.8h\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "ldr q4, [x3, #0x40]\n"
+    "fmla v31.8h, v0.8h, v7.8h\n"
+    "fmla v30.8h, v0.8h, v8.8h\n"
+    "fmla v29.8h, v0.8h, v14.8h\n"
+    "fmla v28.8h, v0.8h, v11.8h\n"
+    "ldr q0, [x3, #0x50]\n"
+    "fmla v31.8h, v1.8h, v8.8h\n"
+    "ldr q8, [x17, x10]\n"
+    "fmla v30.8h, v1.8h, v13.8h\n"
+    "fmla v29.8h, v1.8h, v11.8h\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "ldr q1, [x3, #0x60]\n"
+    "fmla v31.8h, v2.8h, v13.8h\n"
+    "ldr q13, [x17, x11]\n"
+    "add x17, x17, #0x10\n"
+    "fmla v30.8h, v2.8h, v5.8h\n"
+    "fmla v29.8h, v2.8h, v12.8h\n"
+    "fmla v28.8h, v2.8h, v9.8h\n"
+    "ldr q2, [x3, #0x70]\n"
+    "fmla v31.8h, v3.8h, v5.8h\n"
+    "ld1 { v5.8h }, [x16]\n"
+    "fmla v30.8h, v3.8h, v6.8h\n"
+    "fmla v29.8h, v3.8h, v9.8h\n"
+    "fmla v28.8h, v3.8h, v13.8h\n"
+    "ldr q3, [x3, #0x80]\n"
+    "fmla v31.8h, v4.8h, v6.8h\n"
+    "ldr q6, [x16, x4]\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "ldr q10, [x16, x13]\n"
+    "fmla v29.8h, v4.8h, v13.8h\n"
+    "fmla v28.8h, v4.8h, v8.8h\n"
+    "ldr q4, [x3, #0x90]\n"
+    "fmla v31.8h, v0.8h, v14.8h\n"
+    "ldr q14, [x16, x10]\n"
+    "fmla v30.8h, v0.8h, v11.8h\n"
+    "fmla v29.8h, v0.8h, v5.8h\n"
+    "fmla v28.8h, v0.8h, v6.8h\n"
+    "ldr q0, [x3, #0xa0]\n"
+    "fmla v31.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x16, x12]\n"
+    "fmla v30.8h, v1.8h, v12.8h\n"
+    "fmla v29.8h, v1.8h, v6.8h\n"
+    "fmla v28.8h, v1.8h, v10.8h\n"
+    "ldr q1, [x3, #0xb0]\n"
+    "fmla v31.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x16, x11]\n"
+    "add x16, x16, #0x10\n"
+    "fmla v30.8h, v2.8h, v9.8h\n"
+    "fmla v29.8h, v2.8h, v10.8h\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr q2, [x3, #0xc0]\n"
+    "fmla v31.8h, v3.8h, v9.8h\n"
+    "ld1 { v9.8h }, [x15]\n"
+    "fmla v30.8h, v3.8h, v13.8h\n"
+    "fmla v29.8h, v3.8h, v11.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "ldr q3, [x3, #0xd0]\n"
+    "fmla v31.8h, v4.8h, v13.8h\n"
+    "ldr q13, [x15, x4]\n"
+    "fmla v30.8h, v4.8h, v8.8h\n"
+    "ldr q8, [x15, x11]\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "fmla v28.8h, v4.8h, v14.8h\n"
+    "ldr q4, [x3, #0xe0]\n"
+    "fmla v31.8h, v0.8h, v5.8h\n"
+    "ldr q5, [x15, x13]\n"
+    "fmla v30.8h, v0.8h, v6.8h\n"
+    "fmla v29.8h, v0.8h, v9.8h\n"
+    "fmla v28.8h, v0.8h, v13.8h\n"
+    "ldr q0, [x3, #0xf0]\n"
+    "fmla v31.8h, v1.8h, v6.8h\n"
+    "ldr q6, [x15, x12]\n"
+    "fmla v30.8h, v1.8h, v10.8h\n"
+    "fmla v29.8h, v1.8h, v13.8h\n"
+    "fmla v28.8h, v1.8h, v5.8h\n"
+    "ldr q1, [x3, #0x100]\n"
+    "fmla v31.8h, v2.8h, v10.8h\n"
+    "ldr q10, [x15, x10]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v5.8h\n"
+    "fmla v28.8h, v2.8h, v6.8h\n"
+    "ldr q2, [x3, #0x110]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ld1 { v11.8h }, [x14]\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v29.8h, v3.8h, v6.8h\n"
+    "fmla v28.8h, v3.8h, v8.8h\n"
+    "ldr q3, [x3, #0x120]\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "ldr q12, [x14, x4]\n"
+    "fmla v30.8h, v4.8h, v14.8h\n"
+    "fmla v29.8h, v4.8h, v8.8h\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "ldr q4, [x3, #0x130]\n"
+    "add x3, x3, #0x140\n"
+    "fmla v31.8h, v0.8h, v9.8h\n"
+    "ldr q9, [x14, x13]\n"
+    "fmla v30.8h, v0.8h, v13.8h\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x14, x12]\n"
+    "fmla v28.8h, v0.8h, v12.8h\n"
+    "fmla v31.8h, v1.8h, v13.8h\n"
+    "fmla v30.8h, v1.8h, v5.8h\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x14, x11]\n"
+    "fmla v28.8h, v1.8h, v9.8h\n"
+    "fmla v31.8h, v2.8h, v5.8h\n"
+    "fmla v30.8h, v2.8h, v6.8h\n"
+    "fmla v29.8h, v2.8h, v9.8h\n"
+    "ldr q9, [x14, x10]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "fmla v31.8h, v3.8h, v6.8h\n"
+    "fmla v30.8h, v3.8h, v8.8h\n"
+    "fmla v29.8h, v3.8h, v11.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "fmla v31.8h, v4.8h, v8.8h\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "fmla v28.8h, v4.8h, v9.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "st1 { v31.8h }, [x7]\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "str q30, [x7, x6]\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "add x7, x7, #0x10\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "st1 { v29.8h }, [x9]\n"
+    "str q28, [x9, x6]\n"
+    "add x9, x9, #0x10\n"
+    "4:"  // Tile loop: Oddments
+    "tst %x[n_channels], #0x1\n"
+    "beq 61f\n"
+    "ldr q16, [x3, #0x0]\n"
+    "ldr q0, [x3, #0x10]\n"
+    "add x28, x5, XZR\n"
+    "ldr q1, [x3, #0x20]\n"
+    "add x27, x5, x4\n"
+    "ldr q2, [x3, #0x30]\n"
+    "add x26, x8, XZR\n"
+    "ldr q3, [x3, #0x40]\n"
+    "add x25, x8, x4\n"
+    "ldr q4, [x3, #0x50]\n"
+    "add x24, x5, x13\n"
+    "add x23, x8, x13\n"
+    "add x22, x5, x12\n"
+    "add x21, x5, x11\n"
+    "add x20, x8, x10\n"
+    "add x19, x17, XZR\n"
+    "add x3, x3, #0x60\n"
+    "tbz %x[n_channels], #1, 5f\n"
+    "ldr s5, [x28], #0x4\n"
+    "ldr s6, [x27], #0x4\n"
+    "ldr s7, [x26], #0x4\n"
+    "ldr s8, [x25], #0x4\n"
+    "ldr s9, [x24], #0x4\n"
+    "ldr s13, [x23], #0x4\n"
+    "ldr s11, [x22], #0x4\n"
+    "ldr s12, [x21], #0x4\n"
+    "ldr s10, [x20], #0x4\n"
+    "ldr s14, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v5.h }[2], [x28]\n"
+    "ld1 { v6.h }[2], [x27]\n"
+    "ld1 { v7.h }[2], [x26]\n"
+    "ld1 { v8.h }[2], [x25]\n"
+    "ld1 { v9.h }[2], [x24]\n"
+    "ld1 { v13.h }[2], [x23]\n"
+    "ld1 { v11.h }[2], [x22]\n"
+    "ld1 { v12.h }[2], [x21]\n"
+    "ld1 { v10.h }[2], [x20]\n"
+    "ld1 { v14.h }[2], [x19]\n"
+    "b 6f\n"
+    "5:"  // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+    "ldr h5, [x28, #0x0]\n"
+    "ldr h6, [x27, #0x0]\n"
+    "ldr h7, [x26, #0x0]\n"
+    "ldr h8, [x25, #0x0]\n"
+    "ldr h9, [x24, #0x0]\n"
+    "ldr h13, [x23, #0x0]\n"
+    "ldr h11, [x22, #0x0]\n"
+    "ldr h12, [x21, #0x0]\n"
+    "ldr h10, [x20, #0x0]\n"
+    "ldr h14, [x19, #0x0]\n"
+    "6:"  // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+    "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+    "add x19, x8, x12\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+    "fmla v31.8h, v1.8h, v6.8h\n"
+    "fmla v30.8h, v1.8h, v9.8h\n"
+    "fmla v29.8h, v1.8h, v8.8h\n"
+    "fmla v28.8h, v1.8h, v13.8h\n"
+    "fmla v31.8h, v2.8h, v9.8h\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ldr s5, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v5.h }[2], [x19]\n"
+    "b 8f\n"
+    "7:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+    "ldr h5, [x19, #0x0]\n"
+    "8:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+    "fmla v28.8h, v2.8h, v5.8h\n"
+    "add x19, x8, x11\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v29.8h, v3.8h, v5.8h\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr s6, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v6.h }[2], [x19]\n"
+    "b 10f\n"
+    "9:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+    "ldr h6, [x19, #0x0]\n"
+    "10:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+    "fmla v28.8h, v3.8h, v6.8h\n"
+    "add x19, x5, x10\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ldr s9, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 12f\n"
+    "ld1 { v9.h }[2], [x19]\n"
+    "b 12f\n"
+    "11:"  // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: Unset
+    "ldr h9, [x19, #0x0]\n"
+    "12:"  // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: End
+    "fmla v30.8h, v4.8h, v9.8h\n"
+    "ldr h0, [x3, #0xc]\n"
+    "add x19, x17, x4\n"
+    "fmla v29.8h, v4.8h, v6.8h\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "fmla v31.8h, v0.8h, v7.8h\n"
+    "fmla v30.8h, v0.8h, v8.8h\n"
+    "fmla v29.8h, v0.8h, v14.8h\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 14f\n"
+    "13:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "14:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+    "fmla v28.8h, v0.8h, v11.8h\n"
+    "ldr h1, [x3, #0xe]\n"
+    "add x19, x17, x13\n"
+    "fmla v31.8h, v1.8h, v8.8h\n"
+    "fmla v30.8h, v1.8h, v13.8h\n"
+    "fmla v29.8h, v1.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 16f\n"
+    "15:"  // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "16:"  // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "ldr h2, [x3, #0x10]\n"
+    "add x19, x17, x12\n"
+    "fmla v31.8h, v2.8h, v13.8h\n"
+    "fmla v30.8h, v2.8h, v5.8h\n"
+    "fmla v29.8h, v2.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 17f\n"
+    "ldr s9, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v9.h }[2], [x19]\n"
+    "b 18f\n"
+    "17:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+    "ldr h9, [x19, #0x0]\n"
+    "18:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+    "fmla v28.8h, v2.8h, v9.8h\n"
+    "ldr h3, [x3, #0x12]\n"
+    "add x19, x17, x11\n"
+    "fmla v31.8h, v3.8h, v5.8h\n"
+    "fmla v30.8h, v3.8h, v6.8h\n"
+    "fmla v29.8h, v3.8h, v9.8h\n"
+    "tbz %x[n_channels], #1, 19f\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 20f\n"
+    "19:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+    "ldr h13, [x19, #0x0]\n"
+    "20:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+    "fmla v28.8h, v3.8h, v13.8h\n"
+    "ldr h4, [x3, #0x14]\n"
+    "add x19, x17, x10\n"
+    "fmla v31.8h, v4.8h, v6.8h\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v4.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 21f\n"
+    "ldr s8, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 22f\n"
+    "ld1 { v8.h }[2], [x19]\n"
+    "b 22f\n"
+    "21:"  // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+    "ldr h8, [x19, #0x0]\n"
+    "22:"  // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+    "fmla v28.8h, v4.8h, v8.8h\n"
+    "ldr h0, [x3, #0x16]\n"
+    "add x19, x16, XZR\n"
+    "fmla v31.8h, v0.8h, v14.8h\n"
+    "fmla v30.8h, v0.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 23f\n"
+    "ldr s5, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 24f\n"
+    "ld1 { v5.h }[2], [x19]\n"
+    "b 24f\n"
+    "23:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+    "ldr h5, [x19, #0x0]\n"
+    "24:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+    "fmla v29.8h, v0.8h, v5.8h\n"
+    "add x19, x16, x4\n"
+    "tbz %x[n_channels], #1, 25f\n"
+    "ldr s6, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 26f\n"
+    "ld1 { v6.h }[2], [x19]\n"
+    "b 26f\n"
+    "25:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+    "ldr h6, [x19, #0x0]\n"
+    "26:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+    "fmla v28.8h, v0.8h, v6.8h\n"
+    "ldr h1, [x3, #0x18]\n"
+    "add x19, x16, x13\n"
+    "fmla v31.8h, v1.8h, v11.8h\n"
+    "fmla v30.8h, v1.8h, v12.8h\n"
+    "fmla v29.8h, v1.8h, v6.8h\n"
+    "tbz %x[n_channels], #1, 27f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 28f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 28f\n"
+    "27:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "28:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+    "fmla v28.8h, v1.8h, v10.8h\n"
+    "ldr h2, [x3, #0x1a]\n"
+    "add x19, x16, x12\n"
+    "fmla v31.8h, v2.8h, v12.8h\n"
+    "fmla v30.8h, v2.8h, v9.8h\n"
+    "fmla v29.8h, v2.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 29f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 30f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 30f\n"
+    "29:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "30:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr h3, [x3, #0x1c]\n"
+    "add x19, x16, x11\n"
+    "fmla v31.8h, v3.8h, v9.8h\n"
+    "fmla v30.8h, v3.8h, v13.8h\n"
+    "fmla v29.8h, v3.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 31f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 32f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 32f\n"
+    "31:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "32:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "ldr h4, [x3, #0x1e]\n"
+    "add x19, x16, x10\n"
+    "fmla v31.8h, v4.8h, v13.8h\n"
+    "fmla v30.8h, v4.8h, v8.8h\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 33f\n"
+    "ldr s14, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 34f\n"
+    "ld1 { v14.h }[2], [x19]\n"
+    "b 34f\n"
+    "33:"  // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+    "ldr h14, [x19, #0x0]\n"
+    "34:"  // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+    "fmla v28.8h, v4.8h, v14.8h\n"
+    "ldr h0, [x3, #0x20]\n"
+    "add x19, x15, XZR\n"
+    "fmla v31.8h, v0.8h, v5.8h\n"
+    "fmla v30.8h, v0.8h, v6.8h\n"
+    "tbz %x[n_channels], #1, 35f\n"
+    "ldr s9, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 36f\n"
+    "ld1 { v9.h }[2], [x19]\n"
+    "b 36f\n"
+    "35:"  // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+    "ldr h9, [x19, #0x0]\n"
+    "36:"  // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+    "fmla v29.8h, v0.8h, v9.8h\n"
+    "add x19, x15, x4\n"
+    "tbz %x[n_channels], #1, 37f\n"
+    "ldr s13, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 38f\n"
+    "ld1 { v13.h }[2], [x19]\n"
+    "b 38f\n"
+    "37:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+    "ldr h13, [x19, #0x0]\n"
+    "38:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+    "fmla v28.8h, v0.8h, v13.8h\n"
+    "ldr h1, [x3, #0x22]\n"
+    "add x19, x15, x13\n"
+    "fmla v31.8h, v1.8h, v6.8h\n"
+    "fmla v30.8h, v1.8h, v10.8h\n"
+    "fmla v29.8h, v1.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 39f\n"
+    "ldr s5, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 40f\n"
+    "ld1 { v5.h }[2], [x19]\n"
+    "b 40f\n"
+    "39:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+    "ldr h5, [x19, #0x0]\n"
+    "40:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+    "fmla v28.8h, v1.8h, v5.8h\n"
+    "ldr h2, [x3, #0x24]\n"
+    "add x19, x15, x12\n"
+    "fmla v31.8h, v2.8h, v10.8h\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v5.8h\n"
+    "tbz %x[n_channels], #1, 41f\n"
+    "ldr s6, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 42f\n"
+    "ld1 { v6.h }[2], [x19]\n"
+    "b 42f\n"
+    "41:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+    "ldr h6, [x19, #0x0]\n"
+    "42:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+    "fmla v28.8h, v2.8h, v6.8h\n"
+    "ldr h3, [x3, #0x26]\n"
+    "add x19, x15, x11\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v29.8h, v3.8h, v6.8h\n"
+    "tbz %x[n_channels], #1, 43f\n"
+    "ldr s8, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 44f\n"
+    "ld1 { v8.h }[2], [x19]\n"
+    "b 44f\n"
+    "43:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+    "ldr h8, [x19, #0x0]\n"
+    "44:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+    "fmla v28.8h, v3.8h, v8.8h\n"
+    "ldr h4, [x3, #0x28]\n"
+    "add x19, x15, x10\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "fmla v30.8h, v4.8h, v14.8h\n"
+    "fmla v29.8h, v4.8h, v8.8h\n"
+    "tbz %x[n_channels], #1, 45f\n"
+    "ldr s10, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 46f\n"
+    "ld1 { v10.h }[2], [x19]\n"
+    "b 46f\n"
+    "45:"  // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+    "ldr h10, [x19, #0x0]\n"
+    "46:"  // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "ldr h0, [x3, #0x2a]\n"
+    "add x19, x14, XZR\n"
+    "fmla v31.8h, v0.8h, v9.8h\n"
+    "fmla v30.8h, v0.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 47f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 48f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 48f\n"
+    "47:"  // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "48:"  // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "add x19, x14, x4\n"
+    "tbz %x[n_channels], #1, 49f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 50f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 50f\n"
+    "49:"  // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "50:"  // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+    "fmla v28.8h, v0.8h, v12.8h\n"
+    "ldr h1, [x3, #0x2c]\n"
+    "add x19, x14, x13\n"
+    "fmla v31.8h, v1.8h, v13.8h\n"
+    "fmla v30.8h, v1.8h, v5.8h\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 51f\n"
+    "ldr s9, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 52f\n"
+    "ld1 { v9.h }[2], [x19]\n"
+    "b 52f\n"
+    "51:"  // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+    "ldr h9, [x19, #0x0]\n"
+    "52:"  // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+    "fmla v28.8h, v1.8h, v9.8h\n"
+    "ldr h2, [x3, #0x2e]\n"
+    "add x19, x14, x12\n"
+    "fmla v31.8h, v2.8h, v5.8h\n"
+    "fmla v30.8h, v2.8h, v6.8h\n"
+    "fmla v29.8h, v2.8h, v9.8h\n"
+    "tbz %x[n_channels], #1, 53f\n"
+    "ldr s11, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 54f\n"
+    "ld1 { v11.h }[2], [x19]\n"
+    "b 54f\n"
+    "53:"  // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+    "ldr h11, [x19, #0x0]\n"
+    "54:"  // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr h3, [x3, #0x30]\n"
+    "add x19, x14, x11\n"
+    "fmla v31.8h, v3.8h, v6.8h\n"
+    "fmla v30.8h, v3.8h, v8.8h\n"
+    "fmla v29.8h, v3.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 55f\n"
+    "ldr s12, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 56f\n"
+    "ld1 { v12.h }[2], [x19]\n"
+    "b 56f\n"
+    "55:"  // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+    "ldr h12, [x19, #0x0]\n"
+    "56:"  // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "ldr h4, [x3, #0x32]\n"
+    "add x19, x14, x10\n"
+    "fmla v31.8h, v4.8h, v8.8h\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 57f\n"
+    "ldr s9, [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 58f\n"
+    "ld1 { v9.h }[2], [x19]\n"
+    "b 58f\n"
+    "57:"  // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+    "ldr h9, [x19, #0x0]\n"
+    "58:"  // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+    "fmla v28.8h, v4.8h, v9.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "tbz %x[n_channels], #1, 59f\n"
+    "mov x19, x7\n"
+    "st1 { v31.s }[0], [x19], x6\n"
+    "add x7, x7, #0x4\n"
+    "st1 { v30.s }[0], [x19]\n"
+    "mov x19, x9\n"
+    "st1 { v29.s }[0], [x19], x6\n"
+    "add x9, x9, #0x4\n"
+    "st1 { v28.s }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 60f\n"
+    "mov x20, x7\n"
+    "st1 { v31.h }[2], [x20], x6\n"
+    "mov x19, x9\n"
+    "st1 { v30.h }[2], [x20]\n"
+    "st1 { v29.h }[2], [x19], x6\n"
+    "st1 { v28.h }[2], [x19]\n"
+    "b 60f\n"
+    "59:"  // Tile loop: Oddments: Store: Bit 1: Unset
+    "mov x20, x7\n"
+    "st1 { v31.h }[0], [x20], x6\n"
+    "mov x19, x9\n"
+    "st1 { v30.h }[0], [x20]\n"
+    "st1 { v29.h }[0], [x19], x6\n"
+    "st1 { v28.h }[0], [x19]\n"
+    "60:"  // Tile loop: Oddments: Store: Bit 1: End
+
+    "61:"  // Tile loop: End
+    "ldr x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x28, #0x1\n"
+    "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x27, x27, #0x1\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x27, x19\n"
+    "csel x27, x27, XZR, LT\n"
+    "csel x28, x28, x21, LT\n"
+    "cmp x28, x20\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..96e1ae4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1022 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[36];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
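+      // Permute the first 14 input pointers into the order in which the assembly expects to consume them; the remaining pointers are passed through unchanged.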
+      inptrs[0] = input_ptrs[0];
+      inptrs[1] = input_ptrs[1];
+      inptrs[2] = input_ptrs[6];
+      inptrs[3] = input_ptrs[7];
+      inptrs[4] = input_ptrs[2];
+      inptrs[5] = input_ptrs[8];
+      inptrs[6] = input_ptrs[3];
+      inptrs[7] = input_ptrs[4];
+      inptrs[8] = input_ptrs[11];
+      inptrs[9] = input_ptrs[12];
+      inptrs[10] = input_ptrs[9];
+      inptrs[11] = input_ptrs[10];
+      inptrs[12] = input_ptrs[5];
+      inptrs[13] = input_ptrs[13];
+      inptrs[14] = input_ptrs[14];
+      inptrs[15] = input_ptrs[15];
+      inptrs[16] = input_ptrs[16];
+      inptrs[17] = input_ptrs[17];
+      inptrs[18] = input_ptrs[18];
+      inptrs[19] = input_ptrs[19];
+      inptrs[20] = input_ptrs[20];
+      inptrs[21] = input_ptrs[21];
+      inptrs[22] = input_ptrs[22];
+      inptrs[23] = input_ptrs[23];
+      inptrs[24] = input_ptrs[24];
+      inptrs[25] = input_ptrs[25];
+      inptrs[26] = input_ptrs[26];
+      inptrs[27] = input_ptrs[27];
+      inptrs[28] = input_ptrs[28];
+      inptrs[29] = input_ptrs[29];
+      inptrs[30] = input_ptrs[30];
+      inptrs[31] = input_ptrs[31];
+      inptrs[32] = input_ptrs[32];
+      inptrs[33] = input_ptrs[33];
+      inptrs[34] = input_ptrs[34];
+      inptrs[35] = input_ptrs[35];
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "add x19, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v18.8h }, [x20]\n"
+    "ld1r { v17.8h }, [x19]\n"
+    "mov x14, #0x0\n"
+    "ldp x13, x12, [x21, #0x0]\n"
+    "mov x11, #0x10\n" // cntb _, ALL, #1
+    "ldp x10, x9, [x21, #0x10]\n"
+    "sub x28, XZR, x11\n"
+    "lsr x27, %x[n_channels], #0x3\n"
+    "cbz x27, 3f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "cmp x11, x27, LSL #4\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "add x15, x15, #0x60\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    "ldp x22, x21, [x16, #0x20]\n"
+    "ldr q5, [x26, x14]\n"
+    "ldr q6, [x25, x14]\n"
+    "ldr q7, [x24, x14]\n"
+    "ldr q8, [x23, x14]\n"
+    "ldr q9, [x22, x14]\n"
+    "ldr q13, [x21, x14]\n"
+    "ldp x20, x19, [x16, #0x30]\n"
+    "ldp x26, x25, [x16, #0x40]\n"
+    "ldr q11, [x20, x14]\n"
+    "ldr q12, [x19, x14]\n"
+    "ldr q10, [x26, x14]\n"
+    "ldr q14, [x25, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+    "ldr x24, [x16, #0x50]\n"
+    "add x28, x28, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+    "ldr x23, [x16, #0x58]\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+    "ldr x22, [x16, #0x60]\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+    "ldr q5, [x24, x14]\n"
+    "ldr q0, [x15, #0x0]\n"
+    "fmla v31.8h, v1.8h, v6.8h\n"
+    "ldr q6, [x23, x14]\n"
+    "fmla v30.8h, v1.8h, v9.8h\n"
+    "fmla v29.8h, v1.8h, v8.8h\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v28.8h, v1.8h, v13.8h\n"
+    "ldr q1, [x15, #0x10]\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v31.8h, v2.8h, v9.8h\n"
+    "ldr q9, [x22, x14]\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v13.8h\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v28.8h, v2.8h, v5.8h\n"
+    "ldr q2, [x15, #0x20]\n"
+    "ldr x26, [x16, #0x80]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x21, x14]\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v29.8h, v3.8h, v5.8h\n"
+    "ldr x25, [x16, #0x88]\n"
+    "fmla v28.8h, v3.8h, v6.8h\n"
+    "ldr q3, [x15, #0x30]\n"
+    "ldr x24, [x16, #0x90]\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "ldr q12, [x20, x14]\n"
+    "fmla v30.8h, v4.8h, v9.8h\n"
+    "fmla v29.8h, v4.8h, v6.8h\n"
+    "ldr q9, [x19, x14]\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "ldr q4, [x15, #0x40]\n"
+    "ldr x23, [x16, #0x98]\n"
+    "fmla v31.8h, v0.8h, v7.8h\n"
+    "ldr x22, [x16, #0xa0]\n"
+    "fmla v30.8h, v0.8h, v8.8h\n"
+    "fmla v29.8h, v0.8h, v14.8h\n"
+    "ldr x21, [x16, #0xa8]\n"
+    "fmla v28.8h, v0.8h, v11.8h\n"
+    "ldr q0, [x15, #0x50]\n"
+    "ldr x20, [x16, #0xb0]\n"
+    "fmla v31.8h, v1.8h, v8.8h\n"
+    "ldr q8, [x25, x14]\n"
+    "fmla v30.8h, v1.8h, v13.8h\n"
+    "fmla v29.8h, v1.8h, v11.8h\n"
+    "ldr x19, [x16, #0xb8]\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "ldr q1, [x15, #0x60]\n"
+    "ldr x25, [x16, #0xc8]\n"
+    "fmla v31.8h, v2.8h, v13.8h\n"
+    "ldr q13, [x26, x14]\n"
+    "fmla v30.8h, v2.8h, v5.8h\n"
+    "fmla v29.8h, v2.8h, v12.8h\n"
+    "ldr x26, [x16, #0xc0]\n"
+    "fmla v28.8h, v2.8h, v9.8h\n"
+    "ldr q2, [x15, #0x70]\n"
+    "ldr q16, [x15, #0x140]\n"
+    "fmla v31.8h, v3.8h, v5.8h\n"
+    "ldr q5, [x24, x14]\n"
+    "fmla v30.8h, v3.8h, v6.8h\n"
+    "fmla v29.8h, v3.8h, v9.8h\n"
+    "ldr x24, [x16, #0xd0]\n"
+    "fmla v28.8h, v3.8h, v13.8h\n"
+    "ldr q3, [x15, #0x80]\n"
+    "fmla v31.8h, v4.8h, v6.8h\n"
+    "ldr q6, [x23, x14]\n"
+    "ldr x23, [x16, #0xd8]\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v4.8h, v13.8h\n"
+    "ldr q10, [x22, x14]\n"
+    "fmla v28.8h, v4.8h, v8.8h\n"
+    "ldr q4, [x15, #0x90]\n"
+    "ldr x22, [x16, #0xe0]\n"
+    "fmla v31.8h, v0.8h, v14.8h\n"
+    "ldr q14, [x19, x14]\n"
+    "fmla v30.8h, v0.8h, v11.8h\n"
+    "fmla v29.8h, v0.8h, v5.8h\n"
+    "ldr x19, [x16, #0xf8]\n"
+    "fmla v28.8h, v0.8h, v6.8h\n"
+    "ldr q0, [x15, #0xa0]\n"
+    "fmla v31.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x21, x14]\n"
+    "ldr x21, [x16, #0xe8]\n"
+    "fmla v30.8h, v1.8h, v12.8h\n"
+    "fmla v29.8h, v1.8h, v6.8h\n"
+    "fmla v28.8h, v1.8h, v10.8h\n"
+    "ldr q1, [x15, #0xb0]\n"
+    "fmla v31.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x20, x14]\n"
+    "ldr x20, [x16, #0xf0]\n"
+    "fmla v30.8h, v2.8h, v9.8h\n"
+    "fmla v29.8h, v2.8h, v10.8h\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr q2, [x15, #0xc0]\n"
+    "fmla v31.8h, v3.8h, v9.8h\n"
+    "ldr q9, [x26, x14]\n"
+    "ldr x26, [x16, #0x100]\n"
+    "fmla v30.8h, v3.8h, v13.8h\n"
+    "fmla v29.8h, v3.8h, v11.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "ldr q3, [x15, #0xd0]\n"
+    "fmla v31.8h, v4.8h, v13.8h\n"
+    "ldr q13, [x25, x14]\n"
+    "ldr x25, [x16, #0x108]\n"
+    "fmla v30.8h, v4.8h, v8.8h\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "ldr q8, [x22, x14]\n"
+    "fmla v28.8h, v4.8h, v14.8h\n"
+    "ldr q4, [x15, #0xe0]\n"
+    "fmla v31.8h, v0.8h, v5.8h\n"
+    "ldr q5, [x24, x14]\n"
+    "ldr x24, [x16, #0x110]\n"
+    "fmla v30.8h, v0.8h, v6.8h\n"
+    "fmla v29.8h, v0.8h, v9.8h\n"
+    "fmla v28.8h, v0.8h, v13.8h\n"
+    "ldr q0, [x15, #0xf0]\n"
+    "fmla v31.8h, v1.8h, v6.8h\n"
+    "ldr q6, [x23, x14]\n"
+    "ldr x23, [x16, #0x118]\n"
+    "fmla v30.8h, v1.8h, v10.8h\n"
+    "fmla v29.8h, v1.8h, v13.8h\n"
+    "fmla v28.8h, v1.8h, v5.8h\n"
+    "ldr q1, [x15, #0x100]\n"
+    "fmla v31.8h, v2.8h, v10.8h\n"
+    "ldr q10, [x21, x14]\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v5.8h\n"
+    "fmla v28.8h, v2.8h, v6.8h\n"
+    "ldr q2, [x15, #0x110]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x20, x14]\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v29.8h, v3.8h, v6.8h\n"
+    "fmla v28.8h, v3.8h, v8.8h\n"
+    "ldr q3, [x15, #0x120]\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "ldr q12, [x19, x14]\n"
+    "fmla v30.8h, v4.8h, v14.8h\n"
+    "fmla v29.8h, v4.8h, v8.8h\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "ldr q4, [x15, #0x130]\n"
+    "fmla v31.8h, v0.8h, v9.8h\n"
+    "ldr q9, [x26, x14]\n"
+    "fmla v30.8h, v0.8h, v13.8h\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x25, x14]\n"
+    "fmla v28.8h, v0.8h, v12.8h\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    "ldr q0, [x15, #0x150]\n"
+    "fmla v31.8h, v1.8h, v13.8h\n"
+    "fmla v30.8h, v1.8h, v5.8h\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x24, x14]\n"
+    "fmla v28.8h, v1.8h, v9.8h\n"
+    "ldr q1, [x15, #0x160]\n"
+    "fmla v31.8h, v2.8h, v5.8h\n"
+    "ldr q5, [x26, x11]\n"
+    "fmla v30.8h, v2.8h, v6.8h\n"
+    "fmla v29.8h, v2.8h, v9.8h\n"
+    "ldr q9, [x23, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    "ldp x22, x21, [x16, #0x20]\n"
+    "fmla v31.8h, v3.8h, v6.8h\n"
+    "ldr q6, [x25, x11]\n"
+    "fmla v30.8h, v3.8h, v8.8h\n"
+    "fmla v29.8h, v3.8h, v11.8h\n"
+    "ldr q7, [x24, x11]\n"
+    "ldr q13, [x21, x11]\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "ldp x20, x19, [x16, #0x30]\n"
+    "fmla v31.8h, v4.8h, v8.8h\n"
+    "ldr q8, [x23, x11]\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "ldr q11, [x20, x11]\n"
+    "ldr q12, [x19, x11]\n"
+    "fmla v28.8h, v4.8h, v9.8h\n"
+    "ldr q9, [x22, x11]\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "ldp x26, x25, [x16, #0x40]\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "ldr q2, [x15, #0x170]\n"
+    "ldr q3, [x15, #0x180]\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "ldr q10, [x26, x11]\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "ldr q14, [x25, x11]\n"
+    "add x11, x11, #0x10\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "str q31, [x13, x28]\n"
+    "cmp x11, x27, LSL #4\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "str q30, [x12, x28]\n"
+    "ldr q4, [x15, #0x190]\n"
+    "add x15, x15, #0x1a0\n"
+    "str q29, [x10, x28]\n"
+    "str q28, [x9, x28]\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+    "ldr x24, [x16, #0x50]\n"
+    "add x28, x28, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+    "ldr x23, [x16, #0x58]\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+    "ldr x22, [x16, #0x60]\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+    "ldr q5, [x24, x14]\n"
+    "ldr q0, [x15, #0x0]\n"
+    "fmla v31.8h, v1.8h, v6.8h\n"
+    "ldr q6, [x23, x14]\n"
+    "fmla v30.8h, v1.8h, v9.8h\n"
+    "fmla v29.8h, v1.8h, v8.8h\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v28.8h, v1.8h, v13.8h\n"
+    "ldr q1, [x15, #0x10]\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v31.8h, v2.8h, v9.8h\n"
+    "ldr q9, [x22, x14]\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v13.8h\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v28.8h, v2.8h, v5.8h\n"
+    "ldr q2, [x15, #0x20]\n"
+    "ldr x26, [x16, #0x80]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x21, x14]\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v29.8h, v3.8h, v5.8h\n"
+    "ldr x25, [x16, #0x88]\n"
+    "fmla v28.8h, v3.8h, v6.8h\n"
+    "ldr q3, [x15, #0x30]\n"
+    "ldr x24, [x16, #0x90]\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "ldr q12, [x20, x14]\n"
+    "fmla v30.8h, v4.8h, v9.8h\n"
+    "fmla v29.8h, v4.8h, v6.8h\n"
+    "ldr q9, [x19, x14]\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "ldr q4, [x15, #0x40]\n"
+    "ldr x23, [x16, #0x98]\n"
+    "fmla v31.8h, v0.8h, v7.8h\n"
+    "ldr x22, [x16, #0xa0]\n"
+    "fmla v30.8h, v0.8h, v8.8h\n"
+    "fmla v29.8h, v0.8h, v14.8h\n"
+    "ldr x21, [x16, #0xa8]\n"
+    "fmla v28.8h, v0.8h, v11.8h\n"
+    "ldr q0, [x15, #0x50]\n"
+    "ldr x20, [x16, #0xb0]\n"
+    "fmla v31.8h, v1.8h, v8.8h\n"
+    "ldr q8, [x25, x14]\n"
+    "fmla v30.8h, v1.8h, v13.8h\n"
+    "fmla v29.8h, v1.8h, v11.8h\n"
+    "ldr x19, [x16, #0xb8]\n"
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "ldr q1, [x15, #0x60]\n"
+    "ldr x25, [x16, #0xc8]\n"
+    "fmla v31.8h, v2.8h, v13.8h\n"
+    "ldr q13, [x26, x14]\n"
+    "fmla v30.8h, v2.8h, v5.8h\n"
+    "fmla v29.8h, v2.8h, v12.8h\n"
+    "ldr x26, [x16, #0xc0]\n"
+    "fmla v28.8h, v2.8h, v9.8h\n"
+    "ldr q2, [x15, #0x70]\n"
+    "fmla v31.8h, v3.8h, v5.8h\n"
+    "ldr q5, [x24, x14]\n"
+    "ldr x24, [x16, #0xd0]\n"
+    "fmla v30.8h, v3.8h, v6.8h\n"
+    "fmla v29.8h, v3.8h, v9.8h\n"
+    "fmla v28.8h, v3.8h, v13.8h\n"
+    "ldr q3, [x15, #0x80]\n"
+    "fmla v31.8h, v4.8h, v6.8h\n"
+    "ldr q6, [x23, x14]\n"
+    "ldr x23, [x16, #0xd8]\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v4.8h, v13.8h\n"
+    "ldr q10, [x22, x14]\n"
+    "fmla v28.8h, v4.8h, v8.8h\n"
+    "ldr q4, [x15, #0x90]\n"
+    "ldr x22, [x16, #0xe0]\n"
+    "fmla v31.8h, v0.8h, v14.8h\n"
+    "ldr q14, [x19, x14]\n"
+    "fmla v30.8h, v0.8h, v11.8h\n"
+    "fmla v29.8h, v0.8h, v5.8h\n"
+    "ldr x19, [x16, #0xf8]\n"
+    "fmla v28.8h, v0.8h, v6.8h\n"
+    "ldr q0, [x15, #0xa0]\n"
+    "fmla v31.8h, v1.8h, v11.8h\n"
+    "ldr q11, [x21, x14]\n"
+    "ldr x21, [x16, #0xe8]\n"
+    "fmla v30.8h, v1.8h, v12.8h\n"
+    "fmla v29.8h, v1.8h, v6.8h\n"
+    "fmla v28.8h, v1.8h, v10.8h\n"
+    "ldr q1, [x15, #0xb0]\n"
+    "fmla v31.8h, v2.8h, v12.8h\n"
+    "ldr q12, [x20, x14]\n"
+    "ldr x20, [x16, #0xf0]\n"
+    "fmla v30.8h, v2.8h, v9.8h\n"
+    "fmla v29.8h, v2.8h, v10.8h\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr q2, [x15, #0xc0]\n"
+    "fmla v31.8h, v3.8h, v9.8h\n"
+    "ldr q9, [x26, x14]\n"
+    "ldr x26, [x16, #0x100]\n"
+    "fmla v30.8h, v3.8h, v13.8h\n"
+    "fmla v29.8h, v3.8h, v11.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "ldr q3, [x15, #0xd0]\n"
+    "fmla v31.8h, v4.8h, v13.8h\n"
+    "ldr q13, [x25, x14]\n"
+    "ldr x25, [x16, #0x108]\n"
+    "fmla v30.8h, v4.8h, v8.8h\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "ldr q8, [x22, x14]\n"
+    "fmla v28.8h, v4.8h, v14.8h\n"
+    "ldr q4, [x15, #0xe0]\n"
+    "fmla v31.8h, v0.8h, v5.8h\n"
+    "ldr q5, [x24, x14]\n"
+    "ldr x24, [x16, #0x110]\n"
+    "fmla v30.8h, v0.8h, v6.8h\n"
+    "fmla v29.8h, v0.8h, v9.8h\n"
+    "fmla v28.8h, v0.8h, v13.8h\n"
+    "ldr q0, [x15, #0xf0]\n"
+    "fmla v31.8h, v1.8h, v6.8h\n"
+    "ldr q6, [x23, x14]\n"
+    "ldr x23, [x16, #0x118]\n"
+    "fmla v30.8h, v1.8h, v10.8h\n"
+    "fmla v29.8h, v1.8h, v13.8h\n"
+    "fmla v28.8h, v1.8h, v5.8h\n"
+    "ldr q1, [x15, #0x100]\n"
+    "fmla v31.8h, v2.8h, v10.8h\n"
+    "ldr q10, [x21, x14]\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v5.8h\n"
+    "fmla v28.8h, v2.8h, v6.8h\n"
+    "ldr q2, [x15, #0x110]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr q11, [x20, x14]\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v29.8h, v3.8h, v6.8h\n"
+    "fmla v28.8h, v3.8h, v8.8h\n"
+    "ldr q3, [x15, #0x120]\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "ldr q12, [x19, x14]\n"
+    "fmla v30.8h, v4.8h, v14.8h\n"
+    "fmla v29.8h, v4.8h, v8.8h\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "ldr q4, [x15, #0x130]\n"
+    "add x15, x15, #0x140\n"
+    "fmla v31.8h, v0.8h, v9.8h\n"
+    "ldr q9, [x26, x14]\n"
+    "fmla v30.8h, v0.8h, v13.8h\n"
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "ldr q11, [x25, x14]\n"
+    "fmla v28.8h, v0.8h, v12.8h\n"
+    "fmla v31.8h, v1.8h, v13.8h\n"
+    "fmla v30.8h, v1.8h, v5.8h\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "ldr q12, [x24, x14]\n"
+    "fmla v28.8h, v1.8h, v9.8h\n"
+    "fmla v31.8h, v2.8h, v5.8h\n"
+    "fmla v30.8h, v2.8h, v6.8h\n"
+    "fmla v29.8h, v2.8h, v9.8h\n"
+    "ldr q9, [x23, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "fmla v31.8h, v3.8h, v6.8h\n"
+    "fmla v30.8h, v3.8h, v8.8h\n"
+    "fmla v29.8h, v3.8h, v11.8h\n"
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "fmla v31.8h, v4.8h, v8.8h\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "fmla v28.8h, v4.8h, v9.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "str q31, [x13, x28]\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "str q30, [x12, x28]\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "str q29, [x10, x28]\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "str q28, [x9, x28]\n"
+    "3:"  // Oddments
+    "tst %x[n_channels], #0x1\n"
+    "beq 60f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "mov x28, x14\n"
+    "ldr q1, [x15, #0x20]\n"
+    "add x13, x13, x28\n"
+    "ldr q2, [x15, #0x30]\n"
+    "add x12, x12, x28\n"
+    "ldr q3, [x15, #0x40]\n"
+    "add x10, x10, x28\n"
+    "ldr q4, [x15, #0x50]\n"
+    "add x9, x9, x28\n"
+    "ldr x24, [x16, #0x10]\n"
+    "ldr x23, [x16, #0x18]\n"
+    "ldr x22, [x16, #0x20]\n"
+    "add x24, x24, x14\n"
+    "ldr x21, [x16, #0x28]\n"
+    "add x23, x23, x14\n"
+    "ldr x20, [x16, #0x30]\n"
+    "add x22, x22, x14\n"
+    "ldr x19, [x16, #0x38]\n"
+    "add x21, x21, x14\n"
+    "ldr x26, [x16, #0x40]\n"
+    "add x20, x20, x14\n"
+    "ldr x25, [x16, #0x48]\n"
+    "add x19, x19, x14\n"
+    "add x26, x26, x14\n"
+    "add x25, x25, x14\n"
+    "add x15, x15, #0x60\n"
+    "tbz %x[n_channels], #1, 4f\n"
+    "ld1 { v5.s }[0], [x26], #0x4\n"
+    "ld1 { v6.s }[0], [x25], #0x4\n"
+    "ld1 { v7.s }[0], [x24], #0x4\n"
+    "ld1 { v8.s }[0], [x23], #0x4\n"
+    "ld1 { v9.s }[0], [x22], #0x4\n"
+    "ld1 { v13.s }[0], [x21], #0x4\n"
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "ld1 { v12.s }[0], [x19], #0x4\n"
+    "ld1 { v10.s }[0], [x26], #0x4\n"
+    "ld1 { v14.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 5f\n"
+    "ld1 { v7.h }[2], [x24], #0x2\n"
+    "ld1 { v8.h }[2], [x23], #0x2\n"
+    "ld1 { v5.h }[2], [x26], #0x2\n"
+    "ld1 { v6.h }[2], [x25], #0x2\n"
+    "ld1 { v9.h }[2], [x22], #0x2\n"
+    "ld1 { v13.h }[2], [x21], #0x2\n"
+    "ld1 { v11.h }[2], [x20], #0x2\n"
+    "ld1 { v12.h }[2], [x19], #0x2\n"
+    "ld1 { v10.h }[2], [x26], #0x2\n"
+    "ld1 { v14.h }[2], [x25], #0x2\n"
+    "b 5f\n"
+    "4:"  // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+    "ld1 { v5.h }[0], [x26], #0x2\n"
+    "ld1 { v6.h }[0], [x25], #0x2\n"
+    "ld1 { v7.h }[0], [x24], #0x2\n"
+    "ld1 { v8.h }[0], [x23], #0x2\n"
+    "ld1 { v9.h }[0], [x22], #0x2\n"
+    "ld1 { v13.h }[0], [x21], #0x2\n"
+    "ld1 { v11.h }[0], [x20], #0x2\n"
+    "ld1 { v12.h }[0], [x19], #0x2\n"
+    "ld1 { v10.h }[0], [x26], #0x2\n"
+    "ld1 { v14.h }[0], [x25], #0x2\n"
+    "5:"  // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+    "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+    "ldr x24, [x16, #0x50]\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+    "add x24, x24, x14\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+    "fmla v31.8h, v1.8h, v6.8h\n"
+    "fmla v30.8h, v1.8h, v9.8h\n"
+    "fmla v29.8h, v1.8h, v8.8h\n"
+    "fmla v28.8h, v1.8h, v13.8h\n"
+    "fmla v31.8h, v2.8h, v9.8h\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v5.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v5.h }[2], [x24], #0x2\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load input (1, 3): Bit 1: Unset
+    "ld1 { v5.h }[0], [x24], #0x2\n"
+    "7:"  // Oddments: Load input (1, 3): Bit 1: End
+    "fmla v28.8h, v2.8h, v5.8h\n"
+    "ldr x23, [x16, #0x58]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "add x23, x23, x14\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v29.8h, v3.8h, v5.8h\n"
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v6.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v6.h }[2], [x23], #0x2\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load input (1, 4): Bit 1: Unset
+    "ld1 { v6.h }[0], [x23], #0x2\n"
+    "9:"  // Oddments: Load input (1, 4): Bit 1: End
+    "fmla v28.8h, v3.8h, v6.8h\n"
+    "ldr x22, [x16, #0x60]\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "add x22, x22, x14\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v9.s }[0], [x22], #0x4\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v9.h }[2], [x22], #0x2\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load input (0, 5): Bit 1: Unset
+    "ld1 { v9.h }[0], [x22], #0x2\n"
+    "11:"  // Oddments: Load input (0, 5): Bit 1: End
+    "fmla v30.8h, v4.8h, v9.8h\n"
+    "ldr h0, [x15, #0xc]\n"
+    "fmla v29.8h, v4.8h, v6.8h\n"
+    "ldr x21, [x16, #0x68]\n"
+    "add x21, x21, x14\n"
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "fmla v31.8h, v0.8h, v7.8h\n"
+    "fmla v30.8h, v0.8h, v8.8h\n"
+    "fmla v29.8h, v0.8h, v14.8h\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ld1 { v11.s }[0], [x21], #0x4\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v11.h }[2], [x21], #0x2\n"
+    "b 13f\n"
+    "12:"  // Oddments: Load input (2, 1): Bit 1: Unset
+    "ld1 { v11.h }[0], [x21], #0x2\n"
+    "13:"  // Oddments: Load input (2, 1): Bit 1: End
+    "fmla v28.8h, v0.8h, v11.8h\n"
+    "ldr h1, [x15, #0xe]\n"
+    "fmla v31.8h, v1.8h, v8.8h\n"
+    "ldr x20, [x16, #0x70]\n"
+    "add x20, x20, x14\n"
+    "fmla v30.8h, v1.8h, v13.8h\n"
+    "fmla v29.8h, v1.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 14f\n"
+    "ld1 { v12.s }[0], [x20], #0x4\n"
+    "tbz %x[n_channels], #0, 15f\n"
+    "ld1 { v12.h }[2], [x20], #0x2\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load input (2, 2): Bit 1: Unset
+    "ld1 { v12.h }[0], [x20], #0x2\n"
+    "15:"  // Oddments: Load input (2, 2): Bit 1: End
+    "fmla v28.8h, v1.8h, v12.8h\n"
+    "ldr h2, [x15, #0x10]\n"
+    "fmla v31.8h, v2.8h, v13.8h\n"
+    "ldr x19, [x16, #0x78]\n"
+    "add x19, x19, x14\n"
+    "fmla v30.8h, v2.8h, v5.8h\n"
+    "fmla v29.8h, v2.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v9.s }[0], [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 17f\n"
+    "ld1 { v9.h }[2], [x19], #0x2\n"
+    "b 17f\n"
+    "16:"  // Oddments: Load input (2, 3): Bit 1: Unset
+    "ld1 { v9.h }[0], [x19], #0x2\n"
+    "17:"  // Oddments: Load input (2, 3): Bit 1: End
+    "fmla v28.8h, v2.8h, v9.8h\n"
+    "ldr h3, [x15, #0x12]\n"
+    "fmla v31.8h, v3.8h, v5.8h\n"
+    "ldr x26, [x16, #0x80]\n"
+    "add x26, x26, x14\n"
+    "fmla v30.8h, v3.8h, v6.8h\n"
+    "fmla v29.8h, v3.8h, v9.8h\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v13.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v13.h }[2], [x26], #0x2\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load input (2, 4): Bit 1: Unset
+    "ld1 { v13.h }[0], [x26], #0x2\n"
+    "19:"  // Oddments: Load input (2, 4): Bit 1: End
+    "fmla v28.8h, v3.8h, v13.8h\n"
+    "ldr h4, [x15, #0x14]\n"
+    "fmla v31.8h, v4.8h, v6.8h\n"
+    "ldr x25, [x16, #0x88]\n"
+    "add x25, x25, x14\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v4.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 20f\n"
+    "ld1 { v8.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 21f\n"
+    "ld1 { v8.h }[2], [x25], #0x2\n"
+    "b 21f\n"
+    "20:"  // Oddments: Load input (2, 5): Bit 1: Unset
+    "ld1 { v8.h }[0], [x25], #0x2\n"
+    "21:"  // Oddments: Load input (2, 5): Bit 1: End
+    "fmla v28.8h, v4.8h, v8.8h\n"
+    "ldr h0, [x15, #0x16]\n"
+    "fmla v31.8h, v0.8h, v14.8h\n"
+    "ldr x24, [x16, #0x90]\n"
+    "add x24, x24, x14\n"
+    "fmla v30.8h, v0.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "ld1 { v5.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "ld1 { v5.h }[2], [x24], #0x2\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load input (3, 0): Bit 1: Unset
+    "ld1 { v5.h }[0], [x24], #0x2\n"
+    "23:"  // Oddments: Load input (3, 0): Bit 1: End
+    "fmla v29.8h, v0.8h, v5.8h\n"
+    "ldr x23, [x16, #0x98]\n"
+    "add x23, x23, x14\n"
+    "tbz %x[n_channels], #1, 24f\n"
+    "ld1 { v6.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 25f\n"
+    "ld1 { v6.h }[2], [x23], #0x2\n"
+    "b 25f\n"
+    "24:"  // Oddments: Load input (3, 1): Bit 1: Unset
+    "ld1 { v6.h }[0], [x23], #0x2\n"
+    "25:"  // Oddments: Load input (3, 1): Bit 1: End
+    "fmla v28.8h, v0.8h, v6.8h\n"
+    "ldr h1, [x15, #0x18]\n"
+    "fmla v31.8h, v1.8h, v11.8h\n"
+    "ldr x22, [x16, #0xa0]\n"
+    "add x22, x22, x14\n"
+    "fmla v30.8h, v1.8h, v12.8h\n"
+    "fmla v29.8h, v1.8h, v6.8h\n"
+    "tbz %x[n_channels], #1, 26f\n"
+    "ld1 { v10.s }[0], [x22], #0x4\n"
+    "tbz %x[n_channels], #0, 27f\n"
+    "ld1 { v10.h }[2], [x22], #0x2\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load input (3, 2): Bit 1: Unset
+    "ld1 { v10.h }[0], [x22], #0x2\n"
+    "27:"  // Oddments: Load input (3, 2): Bit 1: End
+    "fmla v28.8h, v1.8h, v10.8h\n"
+    "ldr h2, [x15, #0x1a]\n"
+    "fmla v31.8h, v2.8h, v12.8h\n"
+    "ldr x21, [x16, #0xa8]\n"
+    "add x21, x21, x14\n"
+    "fmla v30.8h, v2.8h, v9.8h\n"
+    "fmla v29.8h, v2.8h, v10.8h\n"
+    "tbz %x[n_channels], #1, 28f\n"
+    "ld1 { v11.s }[0], [x21], #0x4\n"
+    "tbz %x[n_channels], #0, 29f\n"
+    "ld1 { v11.h }[2], [x21], #0x2\n"
+    "b 29f\n"
+    "28:"  // Oddments: Load input (3, 3): Bit 1: Unset
+    "ld1 { v11.h }[0], [x21], #0x2\n"
+    "29:"  // Oddments: Load input (3, 3): Bit 1: End
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr h3, [x15, #0x1c]\n"
+    "fmla v31.8h, v3.8h, v9.8h\n"
+    "ldr x20, [x16, #0xb0]\n"
+    "add x20, x20, x14\n"
+    "fmla v30.8h, v3.8h, v13.8h\n"
+    "fmla v29.8h, v3.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 30f\n"
+    "ld1 { v12.s }[0], [x20], #0x4\n"
+    "tbz %x[n_channels], #0, 31f\n"
+    "ld1 { v12.h }[2], [x20], #0x2\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load input (3, 4): Bit 1: Unset
+    "ld1 { v12.h }[0], [x20], #0x2\n"
+    "31:"  // Oddments: Load input (3, 4): Bit 1: End
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "ldr h4, [x15, #0x1e]\n"
+    "fmla v31.8h, v4.8h, v13.8h\n"
+    "ldr x19, [x16, #0xb8]\n"
+    "add x19, x19, x14\n"
+    "fmla v30.8h, v4.8h, v8.8h\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 32f\n"
+    "ld1 { v14.s }[0], [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 33f\n"
+    "ld1 { v14.h }[2], [x19], #0x2\n"
+    "b 33f\n"
+    "32:"  // Oddments: Load input (3, 5): Bit 1: Unset
+    "ld1 { v14.h }[0], [x19], #0x2\n"
+    "33:"  // Oddments: Load input (3, 5): Bit 1: End
+    "fmla v28.8h, v4.8h, v14.8h\n"
+    "ldr h0, [x15, #0x20]\n"
+    "fmla v31.8h, v0.8h, v5.8h\n"
+    "ldr x26, [x16, #0xc0]\n"
+    "add x26, x26, x14\n"
+    "fmla v30.8h, v0.8h, v6.8h\n"
+    "tbz %x[n_channels], #1, 34f\n"
+    "ld1 { v9.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 35f\n"
+    "ld1 { v9.h }[2], [x26], #0x2\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load input (4, 0): Bit 1: Unset
+    "ld1 { v9.h }[0], [x26], #0x2\n"
+    "35:"  // Oddments: Load input (4, 0): Bit 1: End
+    "fmla v29.8h, v0.8h, v9.8h\n"
+    "ldr x25, [x16, #0xc8]\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 36f\n"
+    "ld1 { v13.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 37f\n"
+    "ld1 { v13.h }[2], [x25], #0x2\n"
+    "b 37f\n"
+    "36:"  // Oddments: Load input (4, 1): Bit 1: Unset
+    "ld1 { v13.h }[0], [x25], #0x2\n"
+    "37:"  // Oddments: Load input (4, 1): Bit 1: End
+    "fmla v28.8h, v0.8h, v13.8h\n"
+    "ldr h1, [x15, #0x22]\n"
+    "fmla v31.8h, v1.8h, v6.8h\n"
+    "ldr x24, [x16, #0xd0]\n"
+    "add x24, x24, x14\n"
+    "fmla v30.8h, v1.8h, v10.8h\n"
+    "fmla v29.8h, v1.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 38f\n"
+    "ld1 { v5.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 39f\n"
+    "ld1 { v5.h }[2], [x24], #0x2\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load input (4, 2): Bit 1: Unset
+    "ld1 { v5.h }[0], [x24], #0x2\n"
+    "39:"  // Oddments: Load input (4, 2): Bit 1: End
+    "fmla v28.8h, v1.8h, v5.8h\n"
+    "ldr h2, [x15, #0x24]\n"
+    "fmla v31.8h, v2.8h, v10.8h\n"
+    "ldr x23, [x16, #0xd8]\n"
+    "add x23, x23, x14\n"
+    "fmla v30.8h, v2.8h, v11.8h\n"
+    "fmla v29.8h, v2.8h, v5.8h\n"
+    "tbz %x[n_channels], #1, 40f\n"
+    "ld1 { v6.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 41f\n"
+    "ld1 { v6.h }[2], [x23], #0x2\n"
+    "b 41f\n"
+    "40:"  // Oddments: Load input (4, 3): Bit 1: Unset
+    "ld1 { v6.h }[0], [x23], #0x2\n"
+    "41:"  // Oddments: Load input (4, 3): Bit 1: End
+    "fmla v28.8h, v2.8h, v6.8h\n"
+    "ldr h3, [x15, #0x26]\n"
+    "fmla v31.8h, v3.8h, v11.8h\n"
+    "ldr x22, [x16, #0xe0]\n"
+    "add x22, x22, x14\n"
+    "fmla v30.8h, v3.8h, v12.8h\n"
+    "fmla v29.8h, v3.8h, v6.8h\n"
+    "tbz %x[n_channels], #1, 42f\n"
+    "ld1 { v8.s }[0], [x22], #0x4\n"
+    "tbz %x[n_channels], #0, 43f\n"
+    "ld1 { v8.h }[2], [x22], #0x2\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load input (4, 4): Bit 1: Unset
+    "ld1 { v8.h }[0], [x22], #0x2\n"
+    "43:"  // Oddments: Load input (4, 4): Bit 1: End
+    "fmla v28.8h, v3.8h, v8.8h\n"
+    "ldr h4, [x15, #0x28]\n"
+    "fmla v31.8h, v4.8h, v12.8h\n"
+    "ldr x21, [x16, #0xe8]\n"
+    "add x21, x21, x14\n"
+    "fmla v30.8h, v4.8h, v14.8h\n"
+    "fmla v29.8h, v4.8h, v8.8h\n"
+    "tbz %x[n_channels], #1, 44f\n"
+    "ld1 { v10.s }[0], [x21], #0x4\n"
+    "tbz %x[n_channels], #0, 45f\n"
+    "ld1 { v10.h }[2], [x21], #0x2\n"
+    "b 45f\n"
+    "44:"  // Oddments: Load input (4, 5): Bit 1: Unset
+    "ld1 { v10.h }[0], [x21], #0x2\n"
+    "45:"  // Oddments: Load input (4, 5): Bit 1: End
+    "fmla v28.8h, v4.8h, v10.8h\n"
+    "ldr h0, [x15, #0x2a]\n"
+    "fmla v31.8h, v0.8h, v9.8h\n"
+    "ldr x20, [x16, #0xf0]\n"
+    "add x20, x20, x14\n"
+    "fmla v30.8h, v0.8h, v13.8h\n"
+    "tbz %x[n_channels], #1, 46f\n"
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "tbz %x[n_channels], #0, 47f\n"
+    "ld1 { v11.h }[2], [x20], #0x2\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load input (5, 0): Bit 1: Unset
+    "ld1 { v11.h }[0], [x20], #0x2\n"
+    "47:"  // Oddments: Load input (5, 0): Bit 1: End
+    "fmla v29.8h, v0.8h, v11.8h\n"
+    "ldr x19, [x16, #0xf8]\n"
+    "add x19, x19, x14\n"
+    "tbz %x[n_channels], #1, 48f\n"
+    "ld1 { v12.s }[0], [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 49f\n"
+    "ld1 { v12.h }[2], [x19], #0x2\n"
+    "b 49f\n"
+    "48:"  // Oddments: Load input (5, 1): Bit 1: Unset
+    "ld1 { v12.h }[0], [x19], #0x2\n"
+    "49:"  // Oddments: Load input (5, 1): Bit 1: End
+    "fmla v28.8h, v0.8h, v12.8h\n"
+    "ldr h1, [x15, #0x2c]\n"
+    "fmla v31.8h, v1.8h, v13.8h\n"
+    "ldr x26, [x16, #0x100]\n"
+    "add x26, x26, x14\n"
+    "fmla v30.8h, v1.8h, v5.8h\n"
+    "fmla v29.8h, v1.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 50f\n"
+    "ld1 { v9.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #0, 51f\n"
+    "ld1 { v9.h }[2], [x26], #0x2\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load input (5, 2): Bit 1: Unset
+    "ld1 { v9.h }[0], [x26], #0x2\n"
+    "51:"  // Oddments: Load input (5, 2): Bit 1: End
+    "fmla v28.8h, v1.8h, v9.8h\n"
+    "ldr h2, [x15, #0x2e]\n"
+    "fmla v31.8h, v2.8h, v5.8h\n"
+    "ldr x25, [x16, #0x108]\n"
+    "add x25, x25, x14\n"
+    "fmla v30.8h, v2.8h, v6.8h\n"
+    "fmla v29.8h, v2.8h, v9.8h\n"
+    "tbz %x[n_channels], #1, 52f\n"
+    "ld1 { v11.s }[0], [x25], #0x4\n"
+    "tbz %x[n_channels], #0, 53f\n"
+    "ld1 { v11.h }[2], [x25], #0x2\n"
+    "b 53f\n"
+    "52:"  // Oddments: Load input (5, 3): Bit 1: Unset
+    "ld1 { v11.h }[0], [x25], #0x2\n"
+    "53:"  // Oddments: Load input (5, 3): Bit 1: End
+    "fmla v28.8h, v2.8h, v11.8h\n"
+    "ldr h3, [x15, #0x30]\n"
+    "fmla v31.8h, v3.8h, v6.8h\n"
+    "ldr x24, [x16, #0x110]\n"
+    "add x24, x24, x14\n"
+    "fmla v30.8h, v3.8h, v8.8h\n"
+    "fmla v29.8h, v3.8h, v11.8h\n"
+    "tbz %x[n_channels], #1, 54f\n"
+    "ld1 { v12.s }[0], [x24], #0x4\n"
+    "tbz %x[n_channels], #0, 55f\n"
+    "ld1 { v12.h }[2], [x24], #0x2\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load input (5, 4): Bit 1: Unset
+    "ld1 { v12.h }[0], [x24], #0x2\n"
+    "55:"  // Oddments: Load input (5, 4): Bit 1: End
+    "fmla v28.8h, v3.8h, v12.8h\n"
+    "ldr h4, [x15, #0x32]\n"
+    "fmla v31.8h, v4.8h, v8.8h\n"
+    "ldr x23, [x16, #0x118]\n"
+    "add x23, x23, x14\n"
+    "fmla v30.8h, v4.8h, v10.8h\n"
+    "fmla v29.8h, v4.8h, v12.8h\n"
+    "tbz %x[n_channels], #1, 56f\n"
+    "ld1 { v9.s }[0], [x23], #0x4\n"
+    "tbz %x[n_channels], #0, 57f\n"
+    "ld1 { v9.h }[2], [x23], #0x2\n"
+    "b 57f\n"
+    "56:"  // Oddments: Load input (5, 5): Bit 1: Unset
+    "ld1 { v9.h }[0], [x23], #0x2\n"
+    "57:"  // Oddments: Load input (5, 5): Bit 1: End
+    "fmla v28.8h, v4.8h, v9.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "tbz %x[n_channels], #1, 58f\n"
+    "st1 { v31.s }[0], [x13], #0x4\n"
+    "st1 { v30.s }[0], [x12], #0x4\n"
+    "st1 { v29.s }[0], [x10], #0x4\n"
+    "st1 { v28.s }[0], [x9], #0x4\n"
+    "tbz %x[n_channels], #0, 59f\n"
+    "st1 { v31.h }[2], [x13], #0x2\n"
+    "st1 { v30.h }[2], [x12], #0x2\n"
+    "st1 { v29.h }[2], [x10], #0x2\n"
+    "st1 { v28.h }[2], [x9], #0x2\n"
+    "b 59f\n"
+    "58:"  // Oddments: Store: Bit 1: Unset
+    "st1 { v31.h }[0], [x13], #0x2\n"
+    "st1 { v30.h }[0], [x12], #0x2\n"
+    "st1 { v29.h }[0], [x10], #0x2\n"
+    "st1 { v28.h }[0], [x9], #0x2\n"
+    "59:"  // Oddments: Store: Bit 1: End
+
+    "60:"  // End
+
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000..3468b70
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
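+// Strategy descriptor for the depthfirst driver: fp16 throughout, no SVE vector-length dependence, nine output points per kernel call.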
+struct a64_fp16_nhwc_generic_output9_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int n_output_points = 9;
+
+  kern_type kernel = a64_fp16_nhwc_generic_output9_mla_depthfirst_impl;
+
+  a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..8ac79f8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
+  const __fp16 *const *const inptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  const void *bias,
+  const unsigned int n_points,
+  const unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  const __fp16 minmax_vals[2] = { activation_min, activation_max };
+
+  __asm__ __volatile__(
+    "ld1r { v4.8h }, [%x[minmax_vals]]\n"
+    "add x19, %x[minmax_vals], #0x2\n"
+    "mov x11, #0x0\n"
+    "ld1r { v3.8h }, [x19]\n"
+    "lsr x10, %x[n_channels], #0x3\n"
+    "cbz x10, 5f\n"
+    "1:"  // Channel loop
+    "movi v25.16b, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "ldr q25, [%x[bias], x11]\n"
+    "2:"  // Channel loop: Load bias: Done
+    "mov v24.16b, v25.16b\n"
+    "ldr q23, [%x[params], #0x0]\n"
+    "mov x20, %x[inptrs]\n"
+    "mov v22.16b, v25.16b\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "subs x19, %x[n_points], #0x1\n"
+    "mov v21.16b, v25.16b\n"
+    "ldr q2, [x9, x11]\n"
+    "mov v20.16b, v25.16b\n"
+    "add %x[params], %x[params], #0x10\n"
+    "mov v19.16b, v25.16b\n"
+    "ldr q1, [x28, x11]\n"
+    "mov v18.16b, v25.16b\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "mov v17.16b, v25.16b\n"
+    "ldr q0, [x27, x11]\n"
+    "mov v16.16b, v25.16b\n"
+    "ldr q31, [x26, x11]\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "ldr q30, [x25, x11]\n"
+    "ldr q29, [x24, x11]\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "ldr q28, [x23, x11]\n"
+    "ldr q27, [x22, x11]\n"
+    "ldr x21, [x20], #0x8\n"
+    "ldr q26, [x21, x11]\n"
+    "ble 4f\n"
+    "3:"  // Channel loop: Planar loop
+    "fmla v25.8h, v2.8h, v23.8h\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "subs x19, x19, #0x1\n"
+    "fmla v24.8h, v1.8h, v23.8h\n"
+    "ldr q2, [x9, x11]\n"
+    "fmla v22.8h, v0.8h, v23.8h\n"
+    "fmla v21.8h, v31.8h, v23.8h\n"
+    "ldr q1, [x28, x11]\n"
+    "fmla v20.8h, v30.8h, v23.8h\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "fmla v19.8h, v29.8h, v23.8h\n"
+    "fmla v18.8h, v28.8h, v23.8h\n"
+    "ldr q0, [x27, x11]\n"
+    "fmla v17.8h, v27.8h, v23.8h\n"
+    "fmla v16.8h, v26.8h, v23.8h\n"
+    "ldr q23, [%x[params], #0x0]\n"
+    "add %x[params], %x[params], #0x10\n"
+    "ldr q31, [x26, x11]\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "ldr q30, [x25, x11]\n"
+    "ldr q29, [x24, x11]\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "ldr q28, [x23, x11]\n"
+    "ldr q27, [x22, x11]\n"
+    "ldr x21, [x20], #0x8\n"
+    "ldr q26, [x21, x11]\n"
+    "bgt 3b\n"
+    "4:"  // Channel loop: Planar tail
+    "fmla v25.8h, v2.8h, v23.8h\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    "fmla v24.8h, v1.8h, v23.8h\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "fmla v22.8h, v0.8h, v23.8h\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    "fmla v21.8h, v31.8h, v23.8h\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    "fmla v20.8h, v30.8h, v23.8h\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "fmla v19.8h, v29.8h, v23.8h\n"
+    "fmla v18.8h, v28.8h, v23.8h\n"
+    "fmla v17.8h, v27.8h, v23.8h\n"
+    "fmla v16.8h, v26.8h, v23.8h\n"
+    "fmax v25.8h, v25.8h, v4.8h\n"
+    "fmax v24.8h, v24.8h, v4.8h\n"
+    "fmax v22.8h, v22.8h, v4.8h\n"
+    "fmin v25.8h, v25.8h, v3.8h\n"
+    "str q25, [x27, x11]\n"
+    "fmin v24.8h, v24.8h, v3.8h\n"
+    "fmin v22.8h, v22.8h, v3.8h\n"
+    "str q24, [x26, x11]\n"
+    "fmax v21.8h, v21.8h, v4.8h\n"
+    "fmax v20.8h, v20.8h, v4.8h\n"
+    "str q22, [x25, x11]\n"
+    "fmax v19.8h, v19.8h, v4.8h\n"
+    "fmax v18.8h, v18.8h, v4.8h\n"
+    "fmin v21.8h, v21.8h, v3.8h\n"
+    "str q21, [x24, x11]\n"
+    "fmin v20.8h, v20.8h, v3.8h\n"
+    "fmin v19.8h, v19.8h, v3.8h\n"
+    "str q20, [x23, x11]\n"
+    "fmin v18.8h, v18.8h, v3.8h\n"
+    "fmax v17.8h, v17.8h, v4.8h\n"
+    "str q19, [x22, x11]\n"
+    "fmax v16.8h, v16.8h, v4.8h\n"
+    "str q18, [x21, x11]\n"
+    "fmin v17.8h, v17.8h, v3.8h\n"
+    "fmin v16.8h, v16.8h, v3.8h\n"
+    "str q17, [x20, x11]\n"
+    "str q16, [x19, x11]\n"
+    "add x11, x11, #0x10\n"
+    "cmp x11, x10, LSL #4\n"
+    "blt 1b\n"
+    "5:"  // Oddments
+    "tst %x[n_channels], #0x7\n"
+    "beq 25f\n"
+    "movi v25.16b, #0x0\n"
+    "cbz %x[bias], 10f\n"
+    "add x19, %x[bias], x11\n"
+    "tbz %x[n_channels], #2, 7f\n"
+    "ld1 { v25.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v25.s }[2], [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v25.h }[6], [x19], #0x2\n"
+    "b 9f\n"
+    "6:"  // Oddments: Load bias: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v25.h }[4], [x19], #0x2\n"
+    "b 9f\n"
+    "7:"  // Oddments: Load bias: Bit 2: Unset
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v25.s }[0], [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v25.h }[2], [x19], #0x2\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v25.h }[0], [x19], #0x2\n"
+    "9:"  // Oddments: Load bias: Bit 2: End
+
+    "10:"  // Oddments: Load bias: Done
+    "mov v24.16b, v25.16b\n"
+    "ldr q23, [%x[params], #0x0]\n"
+    "mov x20, %x[inptrs]\n"
+    "mov v22.16b, v25.16b\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "add %x[params], %x[params], #0x10\n"
+    "mov v21.16b, v25.16b\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "mov v20.16b, v25.16b\n"
+    "add x9, x9, x11\n"
+    "mov v19.16b, v25.16b\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "mov v18.16b, v25.16b\n"
+    "add x28, x28, x11\n"
+    "mov v17.16b, v25.16b\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "mov v16.16b, v25.16b\n"
+    "add x27, x27, x11\n"
+    "ldr x21, [x20], #0x8\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "tbz %x[n_channels], #2, 12f\n"
+    "ldr d2, [x9], #0x8\n"
+    "ldr d1, [x28], #0x8\n"
+    "ldr d0, [x27], #0x8\n"
+    "ldr d31, [x26], #0x8\n"
+    "ldr d30, [x25], #0x8\n"
+    "ldr d29, [x24], #0x8\n"
+    "ldr d28, [x23], #0x8\n"
+    "ldr d27, [x22], #0x8\n"
+    "ldr d26, [x21], #0x8\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ld1 { v2.s }[2], [x9], #0x4\n"
+    "ld1 { v1.s }[2], [x28], #0x4\n"
+    "ld1 { v0.s }[2], [x27], #0x4\n"
+    "ld1 { v31.s }[2], [x26], #0x4\n"
+    "ld1 { v30.s }[2], [x25], #0x4\n"
+    "ld1 { v29.s }[2], [x24], #0x4\n"
+    "ld1 { v28.s }[2], [x23], #0x4\n"
+    "ld1 { v27.s }[2], [x22], #0x4\n"
+    "ld1 { v26.s }[2], [x21], #0x4\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v2.h }[6], [x9], #0x2\n"
+    "ld1 { v1.h }[6], [x28], #0x2\n"
+    "ld1 { v0.h }[6], [x27], #0x2\n"
+    "ld1 { v31.h }[6], [x26], #0x2\n"
+    "ld1 { v30.h }[6], [x25], #0x2\n"
+    "ld1 { v29.h }[6], [x24], #0x2\n"
+    "ld1 { v28.h }[6], [x23], #0x2\n"
+    "ld1 { v27.h }[6], [x22], #0x2\n"
+    "ld1 { v26.h }[6], [x21], #0x2\n"
+    "b 14f\n"
+    "11:"  // Oddments: Load: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v2.h }[4], [x9], #0x2\n"
+    "ld1 { v1.h }[4], [x28], #0x2\n"
+    "ld1 { v0.h }[4], [x27], #0x2\n"
+    "ld1 { v31.h }[4], [x26], #0x2\n"
+    "ld1 { v30.h }[4], [x25], #0x2\n"
+    "ld1 { v29.h }[4], [x24], #0x2\n"
+    "ld1 { v28.h }[4], [x23], #0x2\n"
+    "ld1 { v27.h }[4], [x22], #0x2\n"
+    "ld1 { v26.h }[4], [x21], #0x2\n"
+    "b 14f\n"
+    "12:"  // Oddments: Load: Bit 2: Unset
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr s2, [x9], #0x4\n"
+    "ldr s1, [x28], #0x4\n"
+    "ldr s0, [x27], #0x4\n"
+    "ldr s31, [x26], #0x4\n"
+    "ldr s30, [x25], #0x4\n"
+    "ldr s29, [x24], #0x4\n"
+    "ldr s28, [x23], #0x4\n"
+    "ldr s27, [x22], #0x4\n"
+    "ldr s26, [x21], #0x4\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v2.h }[2], [x9], #0x2\n"
+    "ld1 { v1.h }[2], [x28], #0x2\n"
+    "ld1 { v0.h }[2], [x27], #0x2\n"
+    "ld1 { v31.h }[2], [x26], #0x2\n"
+    "ld1 { v30.h }[2], [x25], #0x2\n"
+    "ld1 { v29.h }[2], [x24], #0x2\n"
+    "ld1 { v28.h }[2], [x23], #0x2\n"
+    "ld1 { v27.h }[2], [x22], #0x2\n"
+    "ld1 { v26.h }[2], [x21], #0x2\n"
+    "b 14f\n"
+    "13:"  // Oddments: Load: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 14f\n"
+    "ldr h2, [x9], #0x2\n"
+    "ldr h1, [x28], #0x2\n"
+    "ldr h0, [x27], #0x2\n"
+    "ldr h31, [x26], #0x2\n"
+    "ldr h30, [x25], #0x2\n"
+    "ldr h29, [x24], #0x2\n"
+    "ldr h28, [x23], #0x2\n"
+    "ldr h27, [x22], #0x2\n"
+    "ldr h26, [x21], #0x2\n"
+    "14:"  // Oddments: Load: Bit 2: End
+    "subs x19, %x[n_points], #0x1\n"
+    "ble 20f\n"
+    "15:"  // Oddments: Planar loop
+    "fmla v25.8h, v2.8h, v23.8h\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "add x9, x9, x11\n"
+    "fmla v24.8h, v1.8h, v23.8h\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "fmla v22.8h, v0.8h, v23.8h\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "fmla v21.8h, v31.8h, v23.8h\n"
+    "add x28, x28, x11\n"
+    "fmla v20.8h, v30.8h, v23.8h\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "fmla v19.8h, v29.8h, v23.8h\n"
+    "add x27, x27, x11\n"
+    "fmla v18.8h, v28.8h, v23.8h\n"
+    "ldr x21, [x20], #0x8\n"
+    "fmla v17.8h, v27.8h, v23.8h\n"
+    "add x26, x26, x11\n"
+    "fmla v16.8h, v26.8h, v23.8h\n"
+    "ldr q23, [%x[params], #0x0]\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "add %x[params], %x[params], #0x10\n"
+    "tbz %x[n_channels], #2, 17f\n"
+    "ldr d2, [x9], #0x8\n"
+    "ldr d1, [x28], #0x8\n"
+    "ldr d0, [x27], #0x8\n"
+    "ldr d31, [x26], #0x8\n"
+    "ldr d30, [x25], #0x8\n"
+    "ldr d29, [x24], #0x8\n"
+    "ldr d28, [x23], #0x8\n"
+    "ldr d27, [x22], #0x8\n"
+    "ldr d26, [x21], #0x8\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v2.s }[2], [x9], #0x4\n"
+    "ld1 { v1.s }[2], [x28], #0x4\n"
+    "ld1 { v0.s }[2], [x27], #0x4\n"
+    "ld1 { v31.s }[2], [x26], #0x4\n"
+    "ld1 { v30.s }[2], [x25], #0x4\n"
+    "ld1 { v29.s }[2], [x24], #0x4\n"
+    "ld1 { v28.s }[2], [x23], #0x4\n"
+    "ld1 { v27.s }[2], [x22], #0x4\n"
+    "ld1 { v26.s }[2], [x21], #0x4\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v2.h }[6], [x9], #0x2\n"
+    "ld1 { v1.h }[6], [x28], #0x2\n"
+    "ld1 { v0.h }[6], [x27], #0x2\n"
+    "ld1 { v31.h }[6], [x26], #0x2\n"
+    "ld1 { v30.h }[6], [x25], #0x2\n"
+    "ld1 { v29.h }[6], [x24], #0x2\n"
+    "ld1 { v28.h }[6], [x23], #0x2\n"
+    "ld1 { v27.h }[6], [x22], #0x2\n"
+    "ld1 { v26.h }[6], [x21], #0x2\n"
+    "b 19f\n"
+    "16:"  // Oddments: Planar loop: Load: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v2.h }[4], [x9], #0x2\n"
+    "ld1 { v1.h }[4], [x28], #0x2\n"
+    "ld1 { v0.h }[4], [x27], #0x2\n"
+    "ld1 { v31.h }[4], [x26], #0x2\n"
+    "ld1 { v30.h }[4], [x25], #0x2\n"
+    "ld1 { v29.h }[4], [x24], #0x2\n"
+    "ld1 { v28.h }[4], [x23], #0x2\n"
+    "ld1 { v27.h }[4], [x22], #0x2\n"
+    "ld1 { v26.h }[4], [x21], #0x2\n"
+    "b 19f\n"
+    "17:"  // Oddments: Planar loop: Load: Bit 2: Unset
+    "tbz %x[n_channels], #1, 18f\n"
+    "ldr s2, [x9], #0x4\n"
+    "ldr s1, [x28], #0x4\n"
+    "ldr s0, [x27], #0x4\n"
+    "ldr s31, [x26], #0x4\n"
+    "ldr s30, [x25], #0x4\n"
+    "ldr s29, [x24], #0x4\n"
+    "ldr s28, [x23], #0x4\n"
+    "ldr s27, [x22], #0x4\n"
+    "ldr s26, [x21], #0x4\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v2.h }[2], [x9], #0x2\n"
+    "ld1 { v1.h }[2], [x28], #0x2\n"
+    "ld1 { v0.h }[2], [x27], #0x2\n"
+    "ld1 { v31.h }[2], [x26], #0x2\n"
+    "ld1 { v30.h }[2], [x25], #0x2\n"
+    "ld1 { v29.h }[2], [x24], #0x2\n"
+    "ld1 { v28.h }[2], [x23], #0x2\n"
+    "ld1 { v27.h }[2], [x22], #0x2\n"
+    "ld1 { v26.h }[2], [x21], #0x2\n"
+    "b 19f\n"
+    "18:"  // Oddments: Planar loop: Load: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 19f\n"
+    "ldr h2, [x9], #0x2\n"
+    "ldr h1, [x28], #0x2\n"
+    "ldr h0, [x27], #0x2\n"
+    "ldr h31, [x26], #0x2\n"
+    "ldr h30, [x25], #0x2\n"
+    "ldr h29, [x24], #0x2\n"
+    "ldr h28, [x23], #0x2\n"
+    "ldr h27, [x22], #0x2\n"
+    "ldr h26, [x21], #0x2\n"
+    "19:"  // Oddments: Planar loop: Load: Bit 2: End
+    "subs x19, x19, #0x1\n"
+    "bgt 15b\n"
+    "20:"  // Oddments: Planar tail
+    "fmla v25.8h, v2.8h, v23.8h\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    "add x27, x27, x11\n"
+    "fmla v24.8h, v1.8h, v23.8h\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "fmla v22.8h, v0.8h, v23.8h\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    "add x26, x26, x11\n"
+    "fmla v21.8h, v31.8h, v23.8h\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    "fmla v20.8h, v30.8h, v23.8h\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x25, x25, x11\n"
+    "fmla v19.8h, v29.8h, v23.8h\n"
+    "add x24, x24, x11\n"
+    "fmla v18.8h, v28.8h, v23.8h\n"
+    "add x23, x23, x11\n"
+    "fmla v17.8h, v27.8h, v23.8h\n"
+    "add x22, x22, x11\n"
+    "fmla v16.8h, v26.8h, v23.8h\n"
+    "add x21, x21, x11\n"
+    "fmax v25.8h, v25.8h, v4.8h\n"
+    "add x20, x20, x11\n"
+    "fmax v24.8h, v24.8h, v4.8h\n"
+    "add x19, x19, x11\n"
+    "fmax v22.8h, v22.8h, v4.8h\n"
+    "fmin v25.8h, v25.8h, v3.8h\n"
+    "fmin v24.8h, v24.8h, v3.8h\n"
+    "fmin v22.8h, v22.8h, v3.8h\n"
+    "fmax v21.8h, v21.8h, v4.8h\n"
+    "fmax v20.8h, v20.8h, v4.8h\n"
+    "fmax v19.8h, v19.8h, v4.8h\n"
+    "fmin v21.8h, v21.8h, v3.8h\n"
+    "fmin v20.8h, v20.8h, v3.8h\n"
+    "fmin v19.8h, v19.8h, v3.8h\n"
+    "fmax v18.8h, v18.8h, v4.8h\n"
+    "fmax v17.8h, v17.8h, v4.8h\n"
+    "fmax v16.8h, v16.8h, v4.8h\n"
+    "fmin v18.8h, v18.8h, v3.8h\n"
+    "fmin v17.8h, v17.8h, v3.8h\n"
+    "fmin v16.8h, v16.8h, v3.8h\n"
+    "tbz %x[n_channels], #2, 22f\n"
+    "st1 { v25.d }[0], [x27], #0x8\n"
+    "st1 { v24.d }[0], [x26], #0x8\n"
+    "st1 { v22.d }[0], [x25], #0x8\n"
+    "st1 { v21.d }[0], [x24], #0x8\n"
+    "st1 { v20.d }[0], [x23], #0x8\n"
+    "st1 { v19.d }[0], [x22], #0x8\n"
+    "st1 { v18.d }[0], [x21], #0x8\n"
+    "st1 { v17.d }[0], [x20], #0x8\n"
+    "st1 { v16.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #1, 21f\n"
+    "st1 { v25.s }[2], [x27], #0x4\n"
+    "st1 { v24.s }[2], [x26], #0x4\n"
+    "st1 { v22.s }[2], [x25], #0x4\n"
+    "st1 { v21.s }[2], [x24], #0x4\n"
+    "st1 { v20.s }[2], [x23], #0x4\n"
+    "st1 { v19.s }[2], [x22], #0x4\n"
+    "st1 { v18.s }[2], [x21], #0x4\n"
+    "st1 { v17.s }[2], [x20], #0x4\n"
+    "st1 { v16.s }[2], [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 24f\n"
+    "st1 { v25.h }[6], [x27], #0x2\n"
+    "st1 { v24.h }[6], [x26], #0x2\n"
+    "st1 { v22.h }[6], [x25], #0x2\n"
+    "st1 { v21.h }[6], [x24], #0x2\n"
+    "st1 { v20.h }[6], [x23], #0x2\n"
+    "st1 { v19.h }[6], [x22], #0x2\n"
+    "st1 { v18.h }[6], [x21], #0x2\n"
+    "st1 { v17.h }[6], [x20], #0x2\n"
+    "st1 { v16.h }[6], [x19], #0x2\n"
+    "b 24f\n"
+    "21:"  // Oddments: Store: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 24f\n"
+    "st1 { v25.h }[4], [x27], #0x2\n"
+    "st1 { v24.h }[4], [x26], #0x2\n"
+    "st1 { v22.h }[4], [x25], #0x2\n"
+    "st1 { v21.h }[4], [x24], #0x2\n"
+    "st1 { v20.h }[4], [x23], #0x2\n"
+    "st1 { v19.h }[4], [x22], #0x2\n"
+    "st1 { v18.h }[4], [x21], #0x2\n"
+    "st1 { v17.h }[4], [x20], #0x2\n"
+    "st1 { v16.h }[4], [x19], #0x2\n"
+    "b 24f\n"
+    "22:"  // Oddments: Store: Bit 2: Unset
+    "tbz %x[n_channels], #1, 23f\n"
+    "st1 { v25.s }[0], [x27], #0x4\n"
+    "st1 { v24.s }[0], [x26], #0x4\n"
+    "st1 { v22.s }[0], [x25], #0x4\n"
+    "st1 { v21.s }[0], [x24], #0x4\n"
+    "st1 { v20.s }[0], [x23], #0x4\n"
+    "st1 { v19.s }[0], [x22], #0x4\n"
+    "st1 { v18.s }[0], [x21], #0x4\n"
+    "st1 { v17.s }[0], [x20], #0x4\n"
+    "st1 { v16.s }[0], [x19], #0x4\n"
+    "tbz %x[n_channels], #0, 24f\n"
+    "st1 { v25.h }[2], [x27], #0x2\n"
+    "st1 { v24.h }[2], [x26], #0x2\n"
+    "st1 { v22.h }[2], [x25], #0x2\n"
+    "st1 { v21.h }[2], [x24], #0x2\n"
+    "st1 { v20.h }[2], [x23], #0x2\n"
+    "st1 { v19.h }[2], [x22], #0x2\n"
+    "st1 { v18.h }[2], [x21], #0x2\n"
+    "st1 { v17.h }[2], [x20], #0x2\n"
+    "st1 { v16.h }[2], [x19], #0x2\n"
+    "b 24f\n"
+    "23:"  // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 24f\n"
+    "st1 { v25.h }[0], [x27], #0x2\n"
+    "st1 { v24.h }[0], [x26], #0x2\n"
+    "st1 { v22.h }[0], [x25], #0x2\n"
+    "st1 { v21.h }[0], [x24], #0x2\n"
+    "st1 { v20.h }[0], [x23], #0x2\n"
+    "st1 { v19.h }[0], [x22], #0x2\n"
+    "st1 { v18.h }[0], [x21], #0x2\n"
+    "st1 { v17.h }[0], [x20], #0x2\n"
+    "st1 { v16.h }[0], [x19], #0x2\n"
+    "24:"  // Oddments: Store: Bit 2: End
+
+    "25:"  // End
+
+    : [params] "+&r" (params)
+    : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
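
Note on the "Oddments" paths above: the channel remainder (n_channels mod 8) is handled without ever touching memory beyond the tensor by decomposing the remainder bit by bit, so at most one 4-element, one 2-element and one 1-element lane access is issued per pointer. A minimal scalar sketch of the same decomposition, assuming an 8-lane fp16 vector (the helper and its names are illustrative, not part of the kernel):

    // __fp16 is a built-in type on AArch64 GCC/Clang; no include needed.
    // Copy the trailing (n_channels & 7) fp16 values using the same
    // bit-decomposition as the tbz chains above: 4, then 2, then 1.
    static void copy_oddments(const __fp16 *src, __fp16 *dst, unsigned int n_channels)
    {
      const unsigned int rem = n_channels & 7u;
      if (rem & 4u) { for (int i = 0; i < 4; i++) *dst++ = *src++; }
      if (rem & 2u) { for (int i = 0; i < 2; i++) *dst++ = *src++; }
      if (rem & 1u) { *dst++ = *src++; }
    }
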
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000..a02a2b2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*kern_type)(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int output_rows(void) { return 2; }
+  constexpr static unsigned int output_cols(void) { return 8; }
+
+  constexpr static unsigned int output_col_regs(void) { return 1; }
+
+  kern_type kernel = a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+  a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
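
As with the other depthwise kernels in this patch, the struct above is a lightweight strategy descriptor: it fixes the element types, the 2x8 output block shape and a kern_type pointer the framework dispatches through. A sketch of how a caller might drive it — the wrapper and its parameter names are assumptions, only the struct members come from this file:

    // Hypothetical dispatch helper; CPUInfo comes from the library's own headers.
    void run_fp16_packed_generic(const __fp16 *const *inptrs, __fp16 *const *outptrs,
                                 const __fp16 *weights, const __fp16 *bias,
                                 unsigned int kernel_points, unsigned int n_output_channels,
                                 __fp16 act_min, __fp16 act_max, const CPUInfo *ci)
    {
      using Strat = arm_conv::depthwise::a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst;
      static_assert(Strat::output_rows() == 2 && Strat::output_cols() == 8,
                    "the kernel produces a 2x8 block of output points");
      Strat strat(ci);
      strat.kernel(inptrs, outptrs, weights, bias, kernel_points,
                   n_output_channels, act_min, act_max);
    }
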
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..7ed7c52
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1049 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+  const __fp16 *const *const inptrs,
+  __fp16 *const *const outptrs,
+  const __fp16 *weights,
+  const __fp16 *bias,
+  const unsigned int kernel_points,
+  const unsigned int n_output_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  const __fp16 minmax_vals[2] = { activation_min, activation_max };
+
+  __asm__ __volatile__(
+    "ld1r { v7.8h }, [%x[minmax_vals]]\n"
+    "mov x10, #0x0\n"
+    "add x19, %x[minmax_vals], #0x2\n"
+    "ld1r { v6.8h }, [x19]\n"
+    "lsr x9, %x[n_output_channels], #0x3\n"
+    "cbz x9, 8f\n"
+    "1:"  // Output channel loop
+    "movi v16.16b, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "lsl x19, x10, #0x1\n"
+    "ldr q16, [%x[bias], x19]\n"
+    "2:"  // Output channel loop: Load bias: Done
+    "mov v5.16b, v16.16b\n"
+    "ldr q4, [%x[weights], #0x0]\n"
+    "mov x19, %x[inptrs]\n"
+    "mov v31.16b, v16.16b\n"
+    "ldp x25, x28, [x19], #0x10\n"
+    "lsr x20, %x[kernel_points], #0x1\n"
+    "mov v30.16b, v16.16b\n"
+    "ldr q3, [x25, #0x0]\n"
+    "mov v29.16b, v16.16b\n"
+    "add %x[weights], %x[weights], #0x10\n"
+    "mov v28.16b, v16.16b\n"
+    "ldr q2, [x28, #0x0]\n"
+    "mov v27.16b, v16.16b\n"
+    "mov v26.16b, v16.16b\n"
+    "mov v25.16b, v16.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "mov v23.16b, v16.16b\n"
+    "mov v22.16b, v16.16b\n"
+    "mov v21.16b, v16.16b\n"
+    "mov v20.16b, v16.16b\n"
+    "mov v19.16b, v16.16b\n"
+    "mov v18.16b, v16.16b\n"
+    "mov v17.16b, v16.16b\n"
+    "cbz x20, 6f\n"
+    "ldp x25, x28, [x19], #0x10\n"
+    "ldr q16, [%x[weights], #0x0]\n"
+    "subs x20, x20, #0x1\n"
+    "add %x[weights], %x[weights], #0x10\n"
+    "ldr q1, [x25, #0x0]\n"
+    "ldr q0, [x28, #0x0]\n"
+    "beq 4f\n"
+    "3:"  // Output channel loop: Kernel loop
+    "fmla v5.8h, v4.8h, v3.h[0]\n"
+    "ldp x25, x28, [x19], #0x10\n"
+    "subs x20, x20, #0x1\n"
+    "fmla v31.8h, v4.8h, v3.h[1]\n"
+    "fmla v30.8h, v4.8h, v3.h[2]\n"
+    "fmla v29.8h, v4.8h, v3.h[3]\n"
+    "fmla v28.8h, v4.8h, v3.h[4]\n"
+    "fmla v27.8h, v4.8h, v3.h[5]\n"
+    "fmla v26.8h, v4.8h, v3.h[6]\n"
+    "fmla v25.8h, v4.8h, v3.h[7]\n"
+    "ldr q3, [x25, #0x0]\n"
+    "fmla v24.8h, v4.8h, v2.h[0]\n"
+    "fmla v23.8h, v4.8h, v2.h[1]\n"
+    "fmla v22.8h, v4.8h, v2.h[2]\n"
+    "fmla v21.8h, v4.8h, v2.h[3]\n"
+    "fmla v20.8h, v4.8h, v2.h[4]\n"
+    "fmla v19.8h, v4.8h, v2.h[5]\n"
+    "fmla v18.8h, v4.8h, v2.h[6]\n"
+    "fmla v17.8h, v4.8h, v2.h[7]\n"
+    "ldr q2, [x28, #0x0]\n"
+    "fmla v5.8h, v16.8h, v1.h[0]\n"
+    "ldr q4, [%x[weights], #0x0]\n"
+    "fmla v31.8h, v16.8h, v1.h[1]\n"
+    "ldp x25, x28, [x19], #0x10\n"
+    "fmla v30.8h, v16.8h, v1.h[2]\n"
+    "fmla v29.8h, v16.8h, v1.h[3]\n"
+    "fmla v28.8h, v16.8h, v1.h[4]\n"
+    "fmla v27.8h, v16.8h, v1.h[5]\n"
+    "fmla v26.8h, v16.8h, v1.h[6]\n"
+    "fmla v25.8h, v16.8h, v1.h[7]\n"
+    "ldr q1, [x25, #0x0]\n"
+    "fmla v24.8h, v16.8h, v0.h[0]\n"
+    "fmla v23.8h, v16.8h, v0.h[1]\n"
+    "fmla v22.8h, v16.8h, v0.h[2]\n"
+    "fmla v21.8h, v16.8h, v0.h[3]\n"
+    "fmla v20.8h, v16.8h, v0.h[4]\n"
+    "fmla v19.8h, v16.8h, v0.h[5]\n"
+    "fmla v18.8h, v16.8h, v0.h[6]\n"
+    "fmla v17.8h, v16.8h, v0.h[7]\n"
+    "ldr q0, [x28, #0x0]\n"
+    "ldr q16, [%x[weights], #0x10]\n"
+    "add %x[weights], %x[weights], #0x20\n"
+    "bgt 3b\n"
+    "4:"  // Output channel loop: Kernel loop tail
+    "tbnz %x[kernel_points], #0, 5f\n"
+    "fmla v5.8h, v4.8h, v3.h[0]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "fmla v31.8h, v4.8h, v3.h[1]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "lsl x27, x10, #0x1\n"
+    "fmla v30.8h, v4.8h, v3.h[2]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "fmla v29.8h, v4.8h, v3.h[3]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "fmla v28.8h, v4.8h, v3.h[4]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "fmla v27.8h, v4.8h, v3.h[5]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "fmla v26.8h, v4.8h, v3.h[6]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "fmla v25.8h, v4.8h, v3.h[7]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "fmla v24.8h, v4.8h, v2.h[0]\n"
+    "fmla v23.8h, v4.8h, v2.h[1]\n"
+    "fmla v22.8h, v4.8h, v2.h[2]\n"
+    "fmla v21.8h, v4.8h, v2.h[3]\n"
+    "fmla v20.8h, v4.8h, v2.h[4]\n"
+    "fmla v19.8h, v4.8h, v2.h[5]\n"
+    "fmla v18.8h, v4.8h, v2.h[6]\n"
+    "fmla v17.8h, v4.8h, v2.h[7]\n"
+    "fmla v5.8h, v16.8h, v1.h[0]\n"
+    "fmla v31.8h, v16.8h, v1.h[1]\n"
+    "fmla v30.8h, v16.8h, v1.h[2]\n"
+    "fmla v29.8h, v16.8h, v1.h[3]\n"
+    "fmla v28.8h, v16.8h, v1.h[4]\n"
+    "fmla v27.8h, v16.8h, v1.h[5]\n"
+    "fmla v26.8h, v16.8h, v1.h[6]\n"
+    "fmla v25.8h, v16.8h, v1.h[7]\n"
+    "fmla v24.8h, v16.8h, v0.h[0]\n"
+    "fmla v23.8h, v16.8h, v0.h[1]\n"
+    "fmla v22.8h, v16.8h, v0.h[2]\n"
+    "fmla v21.8h, v16.8h, v0.h[3]\n"
+    "fmla v20.8h, v16.8h, v0.h[4]\n"
+    "fmla v19.8h, v16.8h, v0.h[5]\n"
+    "fmla v18.8h, v16.8h, v0.h[6]\n"
+    "fmla v17.8h, v16.8h, v0.h[7]\n"
+    "fmin v5.8h, v5.8h, v6.8h\n"
+    "fmin v31.8h, v31.8h, v6.8h\n"
+    "fmin v30.8h, v30.8h, v6.8h\n"
+    "fmax v5.8h, v5.8h, v7.8h\n"
+    "str q5, [x19, x27]\n"
+    "fmax v31.8h, v31.8h, v7.8h\n"
+    "fmax v30.8h, v30.8h, v7.8h\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "fmin v29.8h, v29.8h, v6.8h\n"
+    "str q31, [x20, x27]\n"
+    "fmin v28.8h, v28.8h, v6.8h\n"
+    "fmin v27.8h, v27.8h, v6.8h\n"
+    "str q30, [x21, x27]\n"
+    "fmax v29.8h, v29.8h, v7.8h\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "fmin v26.8h, v26.8h, v6.8h\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "fmax v28.8h, v28.8h, v7.8h\n"
+    "str q29, [x22, x27]\n"
+    "fmax v27.8h, v27.8h, v7.8h\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "fmax v26.8h, v26.8h, v7.8h\n"
+    "str q28, [x23, x27]\n"
+    "fmin v25.8h, v25.8h, v6.8h\n"
+    "str q27, [x24, x27]\n"
+    "fmin v24.8h, v24.8h, v6.8h\n"
+    "str q26, [x25, x27]\n"
+    "fmin v23.8h, v23.8h, v6.8h\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "fmax v25.8h, v25.8h, v7.8h\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "fmax v24.8h, v24.8h, v7.8h\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "fmax v23.8h, v23.8h, v7.8h\n"
+    "str q25, [x26, x27]\n"
+    "fmin v22.8h, v22.8h, v6.8h\n"
+    "str q24, [x19, x27]\n"
+    "fmin v21.8h, v21.8h, v6.8h\n"
+    "str q23, [x20, x27]\n"
+    "fmin v20.8h, v20.8h, v6.8h\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "fmax v22.8h, v22.8h, v7.8h\n"
+    "str q22, [x21, x27]\n"
+    "fmax v21.8h, v21.8h, v7.8h\n"
+    "fmax v20.8h, v20.8h, v7.8h\n"
+    "str q21, [x22, x27]\n"
+    "fmin v19.8h, v19.8h, v6.8h\n"
+    "fmin v18.8h, v18.8h, v6.8h\n"
+    "str q20, [x23, x27]\n"
+    "fmin v17.8h, v17.8h, v6.8h\n"
+    "fmax v19.8h, v19.8h, v7.8h\n"
+    "str q19, [x24, x27]\n"
+    "fmax v18.8h, v18.8h, v7.8h\n"
+    "fmax v17.8h, v17.8h, v7.8h\n"
+    "str q18, [x25, x27]\n"
+    "str q17, [x26, x27]\n"
+    "b 7f\n"
+    "5:"  // Output channel loop: Odd tail
+    "fmla v5.8h, v4.8h, v3.h[0]\n"
+    "ldp x25, x28, [x19], #0x10\n"
+    "lsl x27, x10, #0x1\n"
+    "fmla v31.8h, v4.8h, v3.h[1]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "fmla v30.8h, v4.8h, v3.h[2]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "fmla v29.8h, v4.8h, v3.h[3]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "fmla v28.8h, v4.8h, v3.h[4]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "fmla v27.8h, v4.8h, v3.h[5]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "fmla v26.8h, v4.8h, v3.h[6]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "fmla v25.8h, v4.8h, v3.h[7]\n"
+    "ldr q3, [x25, #0x0]\n"
+    "fmla v24.8h, v4.8h, v2.h[0]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "fmla v23.8h, v4.8h, v2.h[1]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "fmla v22.8h, v4.8h, v2.h[2]\n"
+    "fmla v21.8h, v4.8h, v2.h[3]\n"
+    "fmla v20.8h, v4.8h, v2.h[4]\n"
+    "fmla v19.8h, v4.8h, v2.h[5]\n"
+    "fmla v18.8h, v4.8h, v2.h[6]\n"
+    "fmla v17.8h, v4.8h, v2.h[7]\n"
+    "ldr q2, [x28, #0x0]\n"
+    "fmla v5.8h, v16.8h, v1.h[0]\n"
+    "ldr q4, [%x[weights], #0x0]\n"
+    "add %x[weights], %x[weights], #0x10\n"
+    "fmla v31.8h, v16.8h, v1.h[1]\n"
+    "fmla v30.8h, v16.8h, v1.h[2]\n"
+    "fmla v29.8h, v16.8h, v1.h[3]\n"
+    "fmla v28.8h, v16.8h, v1.h[4]\n"
+    "fmla v27.8h, v16.8h, v1.h[5]\n"
+    "fmla v26.8h, v16.8h, v1.h[6]\n"
+    "fmla v25.8h, v16.8h, v1.h[7]\n"
+    "fmla v24.8h, v16.8h, v0.h[0]\n"
+    "fmla v23.8h, v16.8h, v0.h[1]\n"
+    "fmla v22.8h, v16.8h, v0.h[2]\n"
+    "fmla v21.8h, v16.8h, v0.h[3]\n"
+    "fmla v20.8h, v16.8h, v0.h[4]\n"
+    "fmla v19.8h, v16.8h, v0.h[5]\n"
+    "fmla v18.8h, v16.8h, v0.h[6]\n"
+    "fmla v17.8h, v16.8h, v0.h[7]\n"
+    "fmla v5.8h, v4.8h, v3.h[0]\n"
+    "fmla v31.8h, v4.8h, v3.h[1]\n"
+    "fmla v30.8h, v4.8h, v3.h[2]\n"
+    "fmla v29.8h, v4.8h, v3.h[3]\n"
+    "fmla v28.8h, v4.8h, v3.h[4]\n"
+    "fmla v27.8h, v4.8h, v3.h[5]\n"
+    "fmla v26.8h, v4.8h, v3.h[6]\n"
+    "fmla v25.8h, v4.8h, v3.h[7]\n"
+    "fmla v24.8h, v4.8h, v2.h[0]\n"
+    "fmla v23.8h, v4.8h, v2.h[1]\n"
+    "fmla v22.8h, v4.8h, v2.h[2]\n"
+    "fmla v21.8h, v4.8h, v2.h[3]\n"
+    "fmla v20.8h, v4.8h, v2.h[4]\n"
+    "fmla v19.8h, v4.8h, v2.h[5]\n"
+    "fmla v18.8h, v4.8h, v2.h[6]\n"
+    "fmla v17.8h, v4.8h, v2.h[7]\n"
+    "fmin v5.8h, v5.8h, v6.8h\n"
+    "fmin v31.8h, v31.8h, v6.8h\n"
+    "fmin v30.8h, v30.8h, v6.8h\n"
+    "fmax v5.8h, v5.8h, v7.8h\n"
+    "str q5, [x19, x27]\n"
+    "fmax v31.8h, v31.8h, v7.8h\n"
+    "fmax v30.8h, v30.8h, v7.8h\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "fmin v29.8h, v29.8h, v6.8h\n"
+    "str q31, [x20, x27]\n"
+    "fmin v28.8h, v28.8h, v6.8h\n"
+    "fmin v27.8h, v27.8h, v6.8h\n"
+    "str q30, [x21, x27]\n"
+    "fmax v29.8h, v29.8h, v7.8h\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "fmin v26.8h, v26.8h, v6.8h\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "fmax v28.8h, v28.8h, v7.8h\n"
+    "str q29, [x22, x27]\n"
+    "fmax v27.8h, v27.8h, v7.8h\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "fmax v26.8h, v26.8h, v7.8h\n"
+    "str q28, [x23, x27]\n"
+    "fmin v25.8h, v25.8h, v6.8h\n"
+    "str q27, [x24, x27]\n"
+    "fmin v24.8h, v24.8h, v6.8h\n"
+    "str q26, [x25, x27]\n"
+    "fmin v23.8h, v23.8h, v6.8h\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "fmax v25.8h, v25.8h, v7.8h\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "fmax v24.8h, v24.8h, v7.8h\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "fmax v23.8h, v23.8h, v7.8h\n"
+    "str q25, [x26, x27]\n"
+    "fmin v22.8h, v22.8h, v6.8h\n"
+    "str q24, [x19, x27]\n"
+    "fmin v21.8h, v21.8h, v6.8h\n"
+    "str q23, [x20, x27]\n"
+    "fmin v20.8h, v20.8h, v6.8h\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "fmax v22.8h, v22.8h, v7.8h\n"
+    "str q22, [x21, x27]\n"
+    "fmax v21.8h, v21.8h, v7.8h\n"
+    "fmax v20.8h, v20.8h, v7.8h\n"
+    "str q21, [x22, x27]\n"
+    "fmin v19.8h, v19.8h, v6.8h\n"
+    "fmin v18.8h, v18.8h, v6.8h\n"
+    "str q20, [x23, x27]\n"
+    "fmin v17.8h, v17.8h, v6.8h\n"
+    "fmax v19.8h, v19.8h, v7.8h\n"
+    "str q19, [x24, x27]\n"
+    "fmax v18.8h, v18.8h, v7.8h\n"
+    "fmax v17.8h, v17.8h, v7.8h\n"
+    "str q18, [x25, x27]\n"
+    "str q17, [x26, x27]\n"
+    "b 7f\n"
+    "6:"  // Output channel loop: Single kernel point
+    "fmla v5.8h, v4.8h, v3.h[0]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "lsl x27, x10, #0x1\n"
+    "fmla v31.8h, v4.8h, v3.h[1]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "fmla v30.8h, v4.8h, v3.h[2]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "fmla v29.8h, v4.8h, v3.h[3]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "fmla v28.8h, v4.8h, v3.h[4]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "fmla v27.8h, v4.8h, v3.h[5]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "fmla v26.8h, v4.8h, v3.h[6]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "fmla v25.8h, v4.8h, v3.h[7]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "fmla v24.8h, v4.8h, v2.h[0]\n"
+    "fmla v23.8h, v4.8h, v2.h[1]\n"
+    "fmla v22.8h, v4.8h, v2.h[2]\n"
+    "fmla v21.8h, v4.8h, v2.h[3]\n"
+    "fmla v20.8h, v4.8h, v2.h[4]\n"
+    "fmla v19.8h, v4.8h, v2.h[5]\n"
+    "fmla v18.8h, v4.8h, v2.h[6]\n"
+    "fmla v17.8h, v4.8h, v2.h[7]\n"
+    "fmin v5.8h, v5.8h, v6.8h\n"
+    "fmin v31.8h, v31.8h, v6.8h\n"
+    "fmin v30.8h, v30.8h, v6.8h\n"
+    "fmax v5.8h, v5.8h, v7.8h\n"
+    "str q5, [x19, x27]\n"
+    "fmax v31.8h, v31.8h, v7.8h\n"
+    "fmax v30.8h, v30.8h, v7.8h\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "fmin v29.8h, v29.8h, v6.8h\n"
+    "str q31, [x20, x27]\n"
+    "fmin v28.8h, v28.8h, v6.8h\n"
+    "fmin v27.8h, v27.8h, v6.8h\n"
+    "str q30, [x21, x27]\n"
+    "fmax v29.8h, v29.8h, v7.8h\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "fmin v26.8h, v26.8h, v6.8h\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "fmax v28.8h, v28.8h, v7.8h\n"
+    "str q29, [x22, x27]\n"
+    "fmax v27.8h, v27.8h, v7.8h\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "fmax v26.8h, v26.8h, v7.8h\n"
+    "str q28, [x23, x27]\n"
+    "fmin v25.8h, v25.8h, v6.8h\n"
+    "str q27, [x24, x27]\n"
+    "fmin v24.8h, v24.8h, v6.8h\n"
+    "str q26, [x25, x27]\n"
+    "fmin v23.8h, v23.8h, v6.8h\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "fmax v25.8h, v25.8h, v7.8h\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "fmax v24.8h, v24.8h, v7.8h\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "fmax v23.8h, v23.8h, v7.8h\n"
+    "str q25, [x26, x27]\n"
+    "fmin v22.8h, v22.8h, v6.8h\n"
+    "str q24, [x19, x27]\n"
+    "fmin v21.8h, v21.8h, v6.8h\n"
+    "str q23, [x20, x27]\n"
+    "fmin v20.8h, v20.8h, v6.8h\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "fmax v22.8h, v22.8h, v7.8h\n"
+    "str q22, [x21, x27]\n"
+    "fmax v21.8h, v21.8h, v7.8h\n"
+    "fmax v20.8h, v20.8h, v7.8h\n"
+    "str q21, [x22, x27]\n"
+    "fmin v19.8h, v19.8h, v6.8h\n"
+    "fmin v18.8h, v18.8h, v6.8h\n"
+    "str q20, [x23, x27]\n"
+    "fmin v17.8h, v17.8h, v6.8h\n"
+    "fmax v19.8h, v19.8h, v7.8h\n"
+    "str q19, [x24, x27]\n"
+    "fmax v18.8h, v18.8h, v7.8h\n"
+    "fmax v17.8h, v17.8h, v7.8h\n"
+    "str q18, [x25, x27]\n"
+    "str q17, [x26, x27]\n"
+    "7:"  // Output channel loop: Done
+    "add x10, x10, #0x8\n"
+    "cmp x10, x9, LSL #3\n"
+    "blt 1b\n"
+    "tst %x[n_output_channels], #0x7\n"
+    "beq 23f\n"
+    "8:"  // Output channel oddments
+    "movi v16.16b, #0x0\n"
+    "cbz %x[bias], 13f\n"
+    "add x19, %x[bias], x10, LSL #1\n"
+    "tbz %x[n_output_channels], #2, 10f\n"
+    "ld1 { v16.d }[0], [x19], #0x8\n"
+    "tbz %x[n_output_channels], #1, 9f\n"
+    "ld1 { v16.s }[2], [x19], #0x4\n"
+    "tbz %x[n_output_channels], #0, 12f\n"
+    "ld1 { v16.h }[6], [x19]\n"
+    "b 12f\n"
+    "9:"  // Output channel oddments: Load bias: Bit 2: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 12f\n"
+    "ld1 { v16.h }[4], [x19]\n"
+    "b 12f\n"
+    "10:"  // Output channel oddments: Load bias: Bit 2: Unset
+    "tbz %x[n_output_channels], #1, 11f\n"
+    "ld1 { v16.s }[0], [x19], #0x4\n"
+    "tbz %x[n_output_channels], #0, 12f\n"
+    "ld1 { v16.h }[2], [x19]\n"
+    "b 12f\n"
+    "11:"  // Output channel oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 12f\n"
+    "ld1 { v16.h }[0], [x19]\n"
+    "12:"  // Output channel oddments: Load bias: Bit 2: End
+
+    "13:"  // Output channel oddments: Load bias: Done
+    "mov v5.16b, v16.16b\n"
+    "ldr q4, [%x[weights], #0x0]\n"
+    "mov x19, %x[inptrs]\n"
+    "mov v31.16b, v16.16b\n"
+    "ldp x25, x28, [x19], #0x10\n"
+    "lsr x20, %x[kernel_points], #0x1\n"
+    "mov v30.16b, v16.16b\n"
+    "ldr q3, [x25, #0x0]\n"
+    "mov v29.16b, v16.16b\n"
+    "add %x[weights], %x[weights], #0x10\n"
+    "mov v28.16b, v16.16b\n"
+    "ldr q2, [x28, #0x0]\n"
+    "mov v27.16b, v16.16b\n"
+    "mov v26.16b, v16.16b\n"
+    "mov v25.16b, v16.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "mov v23.16b, v16.16b\n"
+    "mov v22.16b, v16.16b\n"
+    "mov v21.16b, v16.16b\n"
+    "mov v20.16b, v16.16b\n"
+    "mov v19.16b, v16.16b\n"
+    "mov v18.16b, v16.16b\n"
+    "mov v17.16b, v16.16b\n"
+    "cbz x20, 17f\n"
+    "ldp x25, x28, [x19], #0x10\n"
+    "ldr q16, [%x[weights], #0x0]\n"
+    "subs x20, x20, #0x1\n"
+    "add %x[weights], %x[weights], #0x10\n"
+    "ldr q1, [x25, #0x0]\n"
+    "ldr q0, [x28, #0x0]\n"
+    "beq 15f\n"
+    "14:"  // Output channel oddments: Kernel loop
+    "fmla v5.8h, v4.8h, v3.h[0]\n"
+    "ldp x25, x28, [x19], #0x10\n"
+    "subs x20, x20, #0x1\n"
+    "fmla v31.8h, v4.8h, v3.h[1]\n"
+    "fmla v30.8h, v4.8h, v3.h[2]\n"
+    "fmla v29.8h, v4.8h, v3.h[3]\n"
+    "fmla v28.8h, v4.8h, v3.h[4]\n"
+    "fmla v27.8h, v4.8h, v3.h[5]\n"
+    "fmla v26.8h, v4.8h, v3.h[6]\n"
+    "fmla v25.8h, v4.8h, v3.h[7]\n"
+    "ldr q3, [x25, #0x0]\n"
+    "fmla v24.8h, v4.8h, v2.h[0]\n"
+    "fmla v23.8h, v4.8h, v2.h[1]\n"
+    "fmla v22.8h, v4.8h, v2.h[2]\n"
+    "fmla v21.8h, v4.8h, v2.h[3]\n"
+    "fmla v20.8h, v4.8h, v2.h[4]\n"
+    "fmla v19.8h, v4.8h, v2.h[5]\n"
+    "fmla v18.8h, v4.8h, v2.h[6]\n"
+    "fmla v17.8h, v4.8h, v2.h[7]\n"
+    "ldr q2, [x28, #0x0]\n"
+    "fmla v5.8h, v16.8h, v1.h[0]\n"
+    "ldr q4, [%x[weights], #0x0]\n"
+    "fmla v31.8h, v16.8h, v1.h[1]\n"
+    "ldp x25, x28, [x19], #0x10\n"
+    "fmla v30.8h, v16.8h, v1.h[2]\n"
+    "fmla v29.8h, v16.8h, v1.h[3]\n"
+    "fmla v28.8h, v16.8h, v1.h[4]\n"
+    "fmla v27.8h, v16.8h, v1.h[5]\n"
+    "fmla v26.8h, v16.8h, v1.h[6]\n"
+    "fmla v25.8h, v16.8h, v1.h[7]\n"
+    "ldr q1, [x25, #0x0]\n"
+    "fmla v24.8h, v16.8h, v0.h[0]\n"
+    "fmla v23.8h, v16.8h, v0.h[1]\n"
+    "fmla v22.8h, v16.8h, v0.h[2]\n"
+    "fmla v21.8h, v16.8h, v0.h[3]\n"
+    "fmla v20.8h, v16.8h, v0.h[4]\n"
+    "fmla v19.8h, v16.8h, v0.h[5]\n"
+    "fmla v18.8h, v16.8h, v0.h[6]\n"
+    "fmla v17.8h, v16.8h, v0.h[7]\n"
+    "ldr q0, [x28, #0x0]\n"
+    "ldr q16, [%x[weights], #0x10]\n"
+    "add %x[weights], %x[weights], #0x20\n"
+    "bgt 14b\n"
+    "15:"  // Output channel oddments: Kernel loop tail
+    "tbnz %x[kernel_points], #0, 16f\n"
+    "fmla v5.8h, v4.8h, v3.h[0]\n"
+    "fmla v31.8h, v4.8h, v3.h[1]\n"
+    "fmla v30.8h, v4.8h, v3.h[2]\n"
+    "fmla v29.8h, v4.8h, v3.h[3]\n"
+    "fmla v28.8h, v4.8h, v3.h[4]\n"
+    "fmla v27.8h, v4.8h, v3.h[5]\n"
+    "fmla v26.8h, v4.8h, v3.h[6]\n"
+    "fmla v25.8h, v4.8h, v3.h[7]\n"
+    "fmla v24.8h, v4.8h, v2.h[0]\n"
+    "fmla v23.8h, v4.8h, v2.h[1]\n"
+    "fmla v22.8h, v4.8h, v2.h[2]\n"
+    "fmla v21.8h, v4.8h, v2.h[3]\n"
+    "fmla v20.8h, v4.8h, v2.h[4]\n"
+    "fmla v19.8h, v4.8h, v2.h[5]\n"
+    "fmla v18.8h, v4.8h, v2.h[6]\n"
+    "fmla v17.8h, v4.8h, v2.h[7]\n"
+    "fmla v5.8h, v16.8h, v1.h[0]\n"
+    "fmla v31.8h, v16.8h, v1.h[1]\n"
+    "fmla v30.8h, v16.8h, v1.h[2]\n"
+    "fmla v29.8h, v16.8h, v1.h[3]\n"
+    "fmla v28.8h, v16.8h, v1.h[4]\n"
+    "fmla v27.8h, v16.8h, v1.h[5]\n"
+    "fmla v26.8h, v16.8h, v1.h[6]\n"
+    "fmla v25.8h, v16.8h, v1.h[7]\n"
+    "fmla v24.8h, v16.8h, v0.h[0]\n"
+    "fmla v23.8h, v16.8h, v0.h[1]\n"
+    "fmla v22.8h, v16.8h, v0.h[2]\n"
+    "fmla v21.8h, v16.8h, v0.h[3]\n"
+    "fmla v20.8h, v16.8h, v0.h[4]\n"
+    "fmla v19.8h, v16.8h, v0.h[5]\n"
+    "fmla v18.8h, v16.8h, v0.h[6]\n"
+    "fmla v17.8h, v16.8h, v0.h[7]\n"
+    "b 18f\n"
+    "16:"  // Output channel oddments: Odd tail
+    "fmla v5.8h, v4.8h, v3.h[0]\n"
+    "ldp x25, x28, [x19], #0x10\n"
+    "fmla v31.8h, v4.8h, v3.h[1]\n"
+    "fmla v30.8h, v4.8h, v3.h[2]\n"
+    "fmla v29.8h, v4.8h, v3.h[3]\n"
+    "fmla v28.8h, v4.8h, v3.h[4]\n"
+    "fmla v27.8h, v4.8h, v3.h[5]\n"
+    "fmla v26.8h, v4.8h, v3.h[6]\n"
+    "fmla v25.8h, v4.8h, v3.h[7]\n"
+    "ldr q3, [x25, #0x0]\n"
+    "fmla v24.8h, v4.8h, v2.h[0]\n"
+    "fmla v23.8h, v4.8h, v2.h[1]\n"
+    "fmla v22.8h, v4.8h, v2.h[2]\n"
+    "fmla v21.8h, v4.8h, v2.h[3]\n"
+    "fmla v20.8h, v4.8h, v2.h[4]\n"
+    "fmla v19.8h, v4.8h, v2.h[5]\n"
+    "fmla v18.8h, v4.8h, v2.h[6]\n"
+    "fmla v17.8h, v4.8h, v2.h[7]\n"
+    "ldr q2, [x28, #0x0]\n"
+    "fmla v5.8h, v16.8h, v1.h[0]\n"
+    "ldr q4, [%x[weights], #0x0]\n"
+    "add %x[weights], %x[weights], #0x10\n"
+    "fmla v31.8h, v16.8h, v1.h[1]\n"
+    "fmla v30.8h, v16.8h, v1.h[2]\n"
+    "fmla v29.8h, v16.8h, v1.h[3]\n"
+    "fmla v28.8h, v16.8h, v1.h[4]\n"
+    "fmla v27.8h, v16.8h, v1.h[5]\n"
+    "fmla v26.8h, v16.8h, v1.h[6]\n"
+    "fmla v25.8h, v16.8h, v1.h[7]\n"
+    "fmla v24.8h, v16.8h, v0.h[0]\n"
+    "fmla v23.8h, v16.8h, v0.h[1]\n"
+    "fmla v22.8h, v16.8h, v0.h[2]\n"
+    "fmla v21.8h, v16.8h, v0.h[3]\n"
+    "fmla v20.8h, v16.8h, v0.h[4]\n"
+    "fmla v19.8h, v16.8h, v0.h[5]\n"
+    "fmla v18.8h, v16.8h, v0.h[6]\n"
+    "fmla v17.8h, v16.8h, v0.h[7]\n"
+    "fmla v5.8h, v4.8h, v3.h[0]\n"
+    "fmla v31.8h, v4.8h, v3.h[1]\n"
+    "fmla v30.8h, v4.8h, v3.h[2]\n"
+    "fmla v29.8h, v4.8h, v3.h[3]\n"
+    "fmla v28.8h, v4.8h, v3.h[4]\n"
+    "fmla v27.8h, v4.8h, v3.h[5]\n"
+    "fmla v26.8h, v4.8h, v3.h[6]\n"
+    "fmla v25.8h, v4.8h, v3.h[7]\n"
+    "fmla v24.8h, v4.8h, v2.h[0]\n"
+    "fmla v23.8h, v4.8h, v2.h[1]\n"
+    "fmla v22.8h, v4.8h, v2.h[2]\n"
+    "fmla v21.8h, v4.8h, v2.h[3]\n"
+    "fmla v20.8h, v4.8h, v2.h[4]\n"
+    "fmla v19.8h, v4.8h, v2.h[5]\n"
+    "fmla v18.8h, v4.8h, v2.h[6]\n"
+    "fmla v17.8h, v4.8h, v2.h[7]\n"
+    "b 18f\n"
+    "17:"  // Output channel oddments: Single kernel point
+    "fmla v5.8h, v4.8h, v3.h[0]\n"
+    "fmla v31.8h, v4.8h, v3.h[1]\n"
+    "fmla v30.8h, v4.8h, v3.h[2]\n"
+    "fmla v29.8h, v4.8h, v3.h[3]\n"
+    "fmla v28.8h, v4.8h, v3.h[4]\n"
+    "fmla v27.8h, v4.8h, v3.h[5]\n"
+    "fmla v26.8h, v4.8h, v3.h[6]\n"
+    "fmla v25.8h, v4.8h, v3.h[7]\n"
+    "fmla v24.8h, v4.8h, v2.h[0]\n"
+    "fmla v23.8h, v4.8h, v2.h[1]\n"
+    "fmla v22.8h, v4.8h, v2.h[2]\n"
+    "fmla v21.8h, v4.8h, v2.h[3]\n"
+    "fmla v20.8h, v4.8h, v2.h[4]\n"
+    "fmla v19.8h, v4.8h, v2.h[5]\n"
+    "fmla v18.8h, v4.8h, v2.h[6]\n"
+    "fmla v17.8h, v4.8h, v2.h[7]\n"
+    "18:"  // Output channel oddments: Done
+    "fmin v5.8h, v5.8h, v6.8h\n"
+    "fmin v31.8h, v31.8h, v6.8h\n"
+    "fmin v30.8h, v30.8h, v6.8h\n"
+    "fmin v29.8h, v29.8h, v6.8h\n"
+    "fmax v5.8h, v5.8h, v7.8h\n"
+    "fmax v31.8h, v31.8h, v7.8h\n"
+    "fmax v30.8h, v30.8h, v7.8h\n"
+    "fmax v29.8h, v29.8h, v7.8h\n"
+    "fmin v28.8h, v28.8h, v6.8h\n"
+    "fmin v27.8h, v27.8h, v6.8h\n"
+    "fmin v26.8h, v26.8h, v6.8h\n"
+    "fmax v28.8h, v28.8h, v7.8h\n"
+    "fmax v27.8h, v27.8h, v7.8h\n"
+    "fmax v26.8h, v26.8h, v7.8h\n"
+    "fmin v25.8h, v25.8h, v6.8h\n"
+    "fmin v24.8h, v24.8h, v6.8h\n"
+    "fmin v23.8h, v23.8h, v6.8h\n"
+    "fmax v25.8h, v25.8h, v7.8h\n"
+    "fmax v24.8h, v24.8h, v7.8h\n"
+    "fmax v23.8h, v23.8h, v7.8h\n"
+    "fmin v22.8h, v22.8h, v6.8h\n"
+    "fmin v21.8h, v21.8h, v6.8h\n"
+    "fmin v20.8h, v20.8h, v6.8h\n"
+    "fmax v22.8h, v22.8h, v7.8h\n"
+    "fmax v21.8h, v21.8h, v7.8h\n"
+    "fmax v20.8h, v20.8h, v7.8h\n"
+    "fmin v19.8h, v19.8h, v6.8h\n"
+    "fmin v18.8h, v18.8h, v6.8h\n"
+    "fmin v17.8h, v17.8h, v6.8h\n"
+    "fmax v19.8h, v19.8h, v7.8h\n"
+    "fmax v18.8h, v18.8h, v7.8h\n"
+    "fmax v17.8h, v17.8h, v7.8h\n"
+    "tbz %x[n_output_channels], #2, 20f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v5.d }[0], [x19]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v31.d }[0], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v30.d }[0], [x21]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v29.d }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v28.d }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v27.d }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v26.d }[0], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "st1 { v25.d }[0], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v24.d }[0], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v23.d }[0], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v22.d }[0], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v21.d }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v20.d }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v19.d }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v18.d }[0], [x25]\n"
+    "add x10, x10, #0x4\n"
+    "st1 { v17.d }[0], [x26]\n"
+    "tbz %x[n_output_channels], #1, 19f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v5.s }[2], [x19]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v31.s }[2], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v30.s }[2], [x21]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v29.s }[2], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v28.s }[2], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v27.s }[2], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v26.s }[2], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "st1 { v25.s }[2], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v24.s }[2], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v23.s }[2], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v22.s }[2], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v21.s }[2], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v20.s }[2], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v19.s }[2], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v18.s }[2], [x25]\n"
+    "add x10, x10, #0x2\n"
+    "st1 { v17.s }[2], [x26]\n"
+    "tbz %x[n_output_channels], #0, 22f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v5.h }[6], [x19]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v31.h }[6], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v30.h }[6], [x21]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v29.h }[6], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v28.h }[6], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v27.h }[6], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v26.h }[6], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "st1 { v25.h }[6], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v24.h }[6], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v23.h }[6], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v22.h }[6], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v21.h }[6], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v20.h }[6], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v19.h }[6], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v18.h }[6], [x25]\n"
+    "st1 { v17.h }[6], [x26]\n"
+    "b 22f\n"
+    "19:"  // Output channel oddments: Done: Store: Bit 2: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 22f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v5.h }[4], [x19]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v31.h }[4], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v30.h }[4], [x21]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v29.h }[4], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v28.h }[4], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v27.h }[4], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v26.h }[4], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "st1 { v25.h }[4], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v24.h }[4], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v23.h }[4], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v22.h }[4], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v21.h }[4], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v20.h }[4], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v19.h }[4], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v18.h }[4], [x25]\n"
+    "st1 { v17.h }[4], [x26]\n"
+    "b 22f\n"
+    "20:"  // Output channel oddments: Done: Store: Bit 2: Unset
+    "tbz %x[n_output_channels], #1, 21f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v5.s }[0], [x19]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v31.s }[0], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v30.s }[0], [x21]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v29.s }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v28.s }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v27.s }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v26.s }[0], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "st1 { v25.s }[0], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v24.s }[0], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v23.s }[0], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v22.s }[0], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v21.s }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v20.s }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v19.s }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v18.s }[0], [x25]\n"
+    "add x10, x10, #0x2\n"
+    "st1 { v17.s }[0], [x26]\n"
+    "tbz %x[n_output_channels], #0, 22f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v5.h }[2], [x19]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v31.h }[2], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v30.h }[2], [x21]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v29.h }[2], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v28.h }[2], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v27.h }[2], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v26.h }[2], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "st1 { v25.h }[2], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v24.h }[2], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v23.h }[2], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v22.h }[2], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v21.h }[2], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v20.h }[2], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v19.h }[2], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v18.h }[2], [x25]\n"
+    "st1 { v17.h }[2], [x26]\n"
+    "b 22f\n"
+    "21:"  // Output channel oddments: Done: Store: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 22f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v5.h }[0], [x19]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v31.h }[0], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v30.h }[0], [x21]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v29.h }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v28.h }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v27.h }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v26.h }[0], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x10, LSL #1\n"
+    "st1 { v25.h }[0], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x10, LSL #1\n"
+    "st1 { v24.h }[0], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x10, LSL #1\n"
+    "st1 { v23.h }[0], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x10, LSL #1\n"
+    "st1 { v22.h }[0], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x10, LSL #1\n"
+    "st1 { v21.h }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x10, LSL #1\n"
+    "st1 { v20.h }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x10, LSL #1\n"
+    "st1 { v19.h }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x10, LSL #1\n"
+    "st1 { v18.h }[0], [x25]\n"
+    "st1 { v17.h }[0], [x26]\n"
+    "22:"  // Output channel oddments: Done: Store: Bit 2: End
+
+    "23:"  // Done
+
+    : [weights] "+&r" (weights)
+    : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
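
Functionally, the assembly above computes a per-channel multiply-accumulate over all kernel points for a 2x8 block of output points, with the kernel-point loop unrolled by two and sixteen fp16 accumulator registers held live. A scalar reference of the same computation, under the layout inferred from the pointer arithmetic (weights packed per 8-channel block in kernel-point order, the final partial block padded to 8 lanes) — a readable model, not a drop-in replacement:

    #include <algorithm>

    void packed_multiplier_reference(
      const __fp16 *const *inptrs,   // 2 pointers per kernel point, 8 values each
      __fp16 *const *outptrs,        // 16 output pointers (2x8 block of points)
      const __fp16 *weights,         // [ceil(C/8)][kernel_points][8], zero-padded
      const __fp16 *bias,            // may be null
      unsigned int kernel_points, unsigned int n_output_channels,
      __fp16 act_min, __fp16 act_max)
    {
      for (unsigned int c = 0; c < n_output_channels; c++)
      {
        __fp16 acc[16];
        for (int p = 0; p < 16; p++) acc[p] = bias ? bias[c] : (__fp16)0.0f;
        for (unsigned int k = 0; k < kernel_points; k++)
        {
          // Weight for channel c at kernel point k, given the packed layout.
          const __fp16 w = weights[((c / 8) * kernel_points + k) * 8 + (c % 8)];
          for (int p = 0; p < 16; p++)
            acc[p] += w * inptrs[k * 2 + p / 8][p % 8];
        }
        // Clamp to the activation range, as the fmin/fmax pairs do above.
        for (int p = 0; p < 16; p++)
          outptrs[p][c] = std::min(std::max(acc[p], act_min), act_max);
      }
    }
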
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..88f20bb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  indirect_kern_type indirect_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+  a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
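
The struct exposes two entry points: an indirect kernel that takes precomputed input/output pointer arrays, and a direct kernel (defined in the next file) that iterates the tile grid itself. In the direct variant, the per-tile input origin follows the offset arithmetic annotated in its prologue; as a sketch, with a 2x2 output tile and stride 1 (names illustrative):

    #include <cstdint>

    // offset = tile_i * ld_input_row + tile_j * ld_input_col, then scaled by
    // the output tile size (2) and by sizeof(float) when added to the base.
    const float *input_tile_origin(const float *inptr,
                                   int64_t ld_input_row, int64_t ld_input_col,
                                   uint64_t tile_i, uint64_t tile_j)
    {
      const int64_t offset = (int64_t)tile_i * ld_input_row
                           + (int64_t)tile_j * ld_input_col;
      return inptr + offset * 2;   // pointer arithmetic is in floats
    }
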
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..fae208f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,524 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
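+  // The assembly below walks the (tile_i, tile_j) grid; each tile runs a
+  // vectorised channel loop over groups of four floats, then an "oddments"
+  // tail for n_channels % 4, clamping results to [activation_min, activation_max].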
+  __asm__ __volatile__(
+    "mov x17, #0x0\n"
+    "mov x16, #0x0\n"
+    "1:"  // Tile loop
+    "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x25, #0x2\n"
+    "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x15, #0x2\n"
+    "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x24, %x[params_struct], %[offsetof_args_min]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "add x21, %x[params_struct], %[offsetof_args_max]\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mov x22, #0x0\n"
+    "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x17, x23\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x16, x13, x19\n" // offset += tile_j * ld_input_col
+    "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
+    "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x12, x12, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1r { v18.4s }, [x24]\n"
+    "add x9, x12, x23, LSL #2\n"
+    "ld1r { v17.4s }, [x21]\n"
+    "add x28, x9, x23, LSL #2\n"
+    "lsl x13, x13, #0x2\n"
+    "add x27, x28, x23, LSL #2\n"
+    "add x26, x13, x13\n"
+    "add x25, x26, x13\n"
+    "mul x19, x17, x20\n" // offset = tile_i * ld_output_row
+    "madd x19, x16, x11, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x15\n" // offset *= output_tile_size
+    "add x10, x10, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "add x24, x10, x20, LSL #2\n"
+    "lsl x11, x11, #0x2\n"
+    "mov x21, #0x10\n" // cntb _, ALL, #1
+    "sub x20, XZR, x21\n"
+    "lsr x19, %x[n_channels], #0x2\n"
+    "cbz x19, 4f\n"
+    "ldr q16, [x14, #0x0]\n"
+    "ldr q0, [x14, #0x10]\n"
+    "cmp x21, x19, LSL #4\n"
+    "ldr q1, [x14, #0x20]\n"
+    "ldr q2, [x14, #0x30]\n"
+    "ldr q3, [x14, #0x40]\n"
+    "ldr q4, [x14, #0x50]\n"
+    "ldr q5, [x14, #0x60]\n"
+    "ldr q6, [x14, #0x70]\n"
+    "ldr q7, [x14, #0x80]\n"
+    "ldr q8, [x14, #0x90]\n"
+    "add x14, x14, #0xa0\n"
+    "ldr q9, [x9, x13]\n"
+    "ld1 { v10.4s }, [x12]\n"
+    "ldr q11, [x12, x25]\n"
+    "ldr q12, [x9, x26]\n"
+    "ldr q13, [x28, x13]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+    "add x20, x20, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
+    "add x22, x22, #0x10\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+    "add x21, x21, #0x10\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+    "ld1 { v9.4s }, [x27]\n"
+    "cmp x21, x19, LSL #4\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x28, x26]\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x27, x25]\n"
+    "fmla v29.4s, v2.4s, v12.4s\n"
+    "ldr q16, [x14, #0x0]\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v31.4s, v5.4s, v12.4s\n"
+    "fmla v30.4s, v4.4s, v12.4s\n"
+    "ldr q12, [x12, x13]\n"
+    "fmla v29.4s, v6.4s, v9.4s\n"
+    "ldr q9, [x12, x26]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v28.4s, v3.4s, v13.4s\n"
+    "fmla v31.4s, v7.4s, v13.4s\n"
+    "fmla v30.4s, v6.4s, v13.4s\n"
+    "fmla v29.4s, v4.4s, v13.4s\n"
+    "fmla v28.4s, v8.4s, v11.4s\n"
+    "ld1 { v11.4s }, [x9]\n"
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "fmla v30.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x9, x25]\n"
+    "add x9, x9, #0x10\n"
+    "fmla v29.4s, v5.4s, v10.4s\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "ldr q4, [x14, #0x50]\n"
+    "fmla v31.4s, v2.4s, v9.4s\n"
+    "fmla v30.4s, v1.4s, v9.4s\n"
+    "ld1 { v9.4s }, [x28]\n"
+    "ldr q1, [x14, #0x20]\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "ldr q0, [x14, #0x10]\n"
+    "fmla v28.4s, v2.4s, v12.4s\n"
+    "ldr q2, [x14, #0x30]\n"
+    "fmla v31.4s, v8.4s, v10.4s\n"
+    "fmla v30.4s, v7.4s, v10.4s\n"
+    "ldr q10, [x28, x25]\n"
+    "add x28, x28, #0x10\n"
+    "fmla v29.4s, v3.4s, v9.4s\n"
+    "ldr q13, [x28, x13]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x27, x13]\n"
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "ldr q12, [x27, x26]\n"
+    "add x27, x27, #0x10\n"
+    "fmla v28.4s, v5.4s, v10.4s\n"
+    "ldr q3, [x14, #0x40]\n"
+    "ldr q5, [x14, #0x60]\n"
+    "fmla v31.4s, v6.4s, v9.4s\n"
+    "ldr q9, [x9, x13]\n"
+    "fmla v30.4s, v8.4s, v10.4s\n"
+    "ld1 { v10.4s }, [x12]\n"
+    "fmla v29.4s, v7.4s, v11.4s\n"
+    "fmla v28.4s, v6.4s, v11.4s\n"
+    "ldr q11, [x12, x25]\n"
+    "ldr q6, [x14, #0x70]\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmla v29.4s, v8.4s, v12.4s\n"
+    "ldr q8, [x14, #0x90]\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "ldr q12, [x9, x26]\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "ldr q7, [x14, #0x80]\n"
+    "add x14, x14, #0xa0\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "st1 { v31.4s }, [x10]\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "str q30, [x10, x11]\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "st1 { v29.4s }, [x24]\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "add x10, x10, #0x10\n"
+    "str q28, [x24, x11]\n"
+    "add x24, x24, #0x10\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+    "ld1 { v9.4s }, [x27]\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x28, x26]\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x27, x25]\n"
+    "fmla v29.4s, v2.4s, v12.4s\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v31.4s, v5.4s, v12.4s\n"
+    "fmla v30.4s, v4.4s, v12.4s\n"
+    "ldr q12, [x12, x13]\n"
+    "fmla v29.4s, v6.4s, v9.4s\n"
+    "ldr q9, [x12, x26]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v28.4s, v3.4s, v13.4s\n"
+    "fmla v31.4s, v7.4s, v13.4s\n"
+    "fmla v30.4s, v6.4s, v13.4s\n"
+    "fmla v29.4s, v4.4s, v13.4s\n"
+    "fmla v28.4s, v8.4s, v11.4s\n"
+    "ld1 { v11.4s }, [x9]\n"
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "fmla v30.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x9, x25]\n"
+    "add x9, x9, #0x10\n"
+    "fmla v29.4s, v5.4s, v10.4s\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "fmla v31.4s, v2.4s, v9.4s\n"
+    "fmla v30.4s, v1.4s, v9.4s\n"
+    "ld1 { v9.4s }, [x28]\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "fmla v28.4s, v2.4s, v12.4s\n"
+    "fmla v31.4s, v8.4s, v10.4s\n"
+    "fmla v30.4s, v7.4s, v10.4s\n"
+    "ldr q10, [x28, x25]\n"
+    "add x28, x28, #0x10\n"
+    "fmla v29.4s, v3.4s, v9.4s\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x27, x13]\n"
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "ldr q12, [x27, x26]\n"
+    "add x27, x27, #0x10\n"
+    "fmla v28.4s, v5.4s, v10.4s\n"
+    "fmla v31.4s, v6.4s, v9.4s\n"
+    "fmla v30.4s, v8.4s, v10.4s\n"
+    "fmla v29.4s, v7.4s, v11.4s\n"
+    "fmla v28.4s, v6.4s, v11.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmla v29.4s, v8.4s, v12.4s\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "st1 { v31.4s }, [x10]\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "str q30, [x10, x11]\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "add x10, x10, #0x10\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "st1 { v29.4s }, [x24]\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "str q28, [x24, x11]\n"
+    "add x24, x24, #0x10\n"
+    "4:"  // Tile loop: Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 31f\n"
+    "ldr q16, [x14, #0x0]\n"
+    "ldr q0, [x14, #0x10]\n"
+    "add x23, x9, x13\n"
+    "ldr q1, [x14, #0x20]\n"
+    "add x22, x12, XZR\n"
+    "ldr q2, [x14, #0x30]\n"
+    "add x21, x12, x25\n"
+    "ldr q3, [x14, #0x40]\n"
+    "add x20, x9, x26\n"
+    "ldr q4, [x14, #0x50]\n"
+    "add x19, x28, x13\n"
+    "ldr q5, [x14, #0x60]\n"
+    "ldr q6, [x14, #0x70]\n"
+    "ldr q7, [x14, #0x80]\n"
+    "ldr q8, [x14, #0x90]\n"
+    "tbz %x[n_channels], #1, 5f\n"
+    "ldr d9, [x23], #0x8\n"
+    "ldr d10, [x22], #0x8\n"
+    "ldr d11, [x21], #0x8\n"
+    "ldr d12, [x20], #0x8\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v9.s }[2], [x23]\n"
+    "ld1 { v10.s }[2], [x22]\n"
+    "ld1 { v11.s }[2], [x21]\n"
+    "ld1 { v12.s }[2], [x20]\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 6f\n"
+    "5:"  // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+    "ldr s9, [x23, #0x0]\n"
+    "ldr s10, [x22, #0x0]\n"
+    "ldr s11, [x21, #0x0]\n"
+    "ldr s12, [x20, #0x0]\n"
+    "ldr s13, [x19, #0x0]\n"
+    "6:"  // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+    "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+    "add x19, x27, XZR\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v12.4s\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v31.4s, v5.4s, v12.4s\n"
+    "fmla v30.4s, v4.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ldr d9, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v9.s }[2], [x19]\n"
+    "b 8f\n"
+    "7:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+    "ldr s9, [x19, #0x0]\n"
+    "8:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+    "fmla v29.4s, v6.4s, v9.4s\n"
+    "add x19, x27, x25\n"
+    "fmla v31.4s, v7.4s, v13.4s\n"
+    "fmla v30.4s, v6.4s, v13.4s\n"
+    "fmla v28.4s, v3.4s, v13.4s\n"
+    "fmla v29.4s, v4.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 10f\n"
+    "9:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "10:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+    "fmla v28.4s, v8.4s, v11.4s\n"
+    "add x19, x12, x13\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 12f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 12f\n"
+    "11:"  // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "12:"  // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "add x19, x12, x26\n"
+    "fmla v30.4s, v0.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr d9, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v9.s }[2], [x19]\n"
+    "b 14f\n"
+    "13:"  // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+    "ldr s9, [x19, #0x0]\n"
+    "14:"  // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+    "fmla v31.4s, v2.4s, v9.4s\n"
+    "add x19, x28, x26\n"
+    "fmla v30.4s, v1.4s, v9.4s\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 16f\n"
+    "15:"  // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "16:"  // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+    "fmla v31.4s, v8.4s, v10.4s\n"
+    "add x19, x9, XZR\n"
+    "fmla v30.4s, v7.4s, v10.4s\n"
+    "fmla v29.4s, v5.4s, v10.4s\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 17f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 18f\n"
+    "17:"  // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "18:"  // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "add x19, x9, x25\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 19f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 20f\n"
+    "19:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "20:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "add x19, x28, XZR\n"
+    "fmla v28.4s, v2.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 21f\n"
+    "ldr d9, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 22f\n"
+    "ld1 { v9.s }[2], [x19]\n"
+    "b 22f\n"
+    "21:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+    "ldr s9, [x19, #0x0]\n"
+    "22:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+    "fmla v31.4s, v6.4s, v9.4s\n"
+    "add x19, x28, x25\n"
+    "fmla v29.4s, v3.4s, v9.4s\n"
+    "tbz %x[n_channels], #1, 23f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 24f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 24f\n"
+    "23:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "24:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+    "fmla v30.4s, v8.4s, v10.4s\n"
+    "add x19, x27, x13\n"
+    "fmla v28.4s, v5.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 25f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 26f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 26f\n"
+    "25:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "26:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+    "fmla v29.4s, v7.4s, v11.4s\n"
+    "add x19, x27, x26\n"
+    "fmla v28.4s, v6.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 27f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 28f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 28f\n"
+    "27:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "28:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+    "fmla v29.4s, v8.4s, v12.4s\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "tbz %x[n_channels], #1, 29f\n"
+    "mov x19, x10\n"
+    "st1 { v31.d }[0], [x19], x11\n"
+    "add x10, x10, #0x8\n"
+    "st1 { v30.d }[0], [x19]\n"
+    "mov x19, x24\n"
+    "st1 { v29.d }[0], [x19], x11\n"
+    "add x24, x24, #0x8\n"
+    "st1 { v28.d }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 30f\n"
+    "mov x20, x10\n"
+    "st1 { v31.s }[2], [x20], x11\n"
+    "mov x19, x24\n"
+    "st1 { v30.s }[2], [x20]\n"
+    "st1 { v29.s }[2], [x19], x11\n"
+    "st1 { v28.s }[2], [x19]\n"
+    "b 30f\n"
+    "29:"  // Tile loop: Oddments: Store: Bit 1: Unset
+    "mov x20, x10\n"
+    "st1 { v31.s }[0], [x20], x11\n"
+    "mov x19, x24\n"
+    "st1 { v30.s }[0], [x20]\n"
+    "st1 { v29.s }[0], [x19], x11\n"
+    "st1 { v28.s }[0], [x19]\n"
+    "30:"  // Tile loop: Oddments: Store: Bit 1: End
+
+    "31:"  // Tile loop: End
+    "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x17, #0x1\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x16, x16, #0x1\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x16, x19\n"
+    "csel x16, x16, XZR, LT\n"
+    "csel x17, x17, x21, LT\n"
+    "cmp x17, x20\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..2f93a68
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[16];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
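+      // Permute the caller's 16 input-patch pointers (a 4x4 window) into the
+      // order in which the assembly consumes them.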
+      inptrs[0] = input_ptrs[5];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[3];
+      inptrs[3] = input_ptrs[6];
+      inptrs[4] = input_ptrs[9];
+      inptrs[5] = input_ptrs[12];
+      inptrs[6] = input_ptrs[15];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[2];
+      inptrs[9] = input_ptrs[10];
+      inptrs[10] = input_ptrs[4];
+      inptrs[11] = input_ptrs[7];
+      inptrs[12] = input_ptrs[8];
+      inptrs[13] = input_ptrs[11];
+      inptrs[14] = input_ptrs[13];
+      inptrs[15] = input_ptrs[14];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
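+  // As in the direct variant: a four-channel main loop, a channel tail, and
+  // an element-wise oddments path for n_channels % 4.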
+  __asm__ __volatile__(
+    "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "add x19, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v18.4s }, [x20]\n"
+    "ld1r { v17.4s }, [x19]\n"
+    "mov x14, #0x0\n"
+    "ldp x13, x12, [x21, #0x0]\n"
+    "mov x11, #0x10\n" // cntb _, ALL, #1
+    "ldp x10, x9, [x21, #0x10]\n"
+    "sub x28, XZR, x11\n"
+    "lsr x27, %x[n_channels], #0x2\n"
+    "cbz x27, 3f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "cmp x11, x27, LSL #4\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    "ldr x22, [x16, #0x20]\n"
+    "ldr q9, [x26, x14]\n"
+    "ldr q10, [x25, x14]\n"
+    "ldr q11, [x24, x14]\n"
+    "ldr q12, [x23, x14]\n"
+    "ldr q13, [x22, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+    "ldr x21, [x16, #0x28]\n"
+    "add x28, x28, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
+    "ldr x20, [x16, #0x30]\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+    "ldr x19, [x16, #0x38]\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+    "ldr q9, [x21, x14]\n"
+    "ldr x26, [x16, #0x40]\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ldr x25, [x16, #0x48]\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v12.4s\n"
+    "ldr q11, [x20, x14]\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "ldr q10, [x25, x14]\n"
+    "ldr x24, [x16, #0x50]\n"
+    "fmla v31.4s, v5.4s, v12.4s\n"
+    "ldr x23, [x16, #0x58]\n"
+    "fmla v30.4s, v4.4s, v12.4s\n"
+    "fmla v29.4s, v6.4s, v9.4s\n"
+    "ldr q12, [x19, x14]\n"
+    "fmla v28.4s, v3.4s, v13.4s\n"
+    "ldr q9, [x26, x14]\n"
+    "ldr x22, [x16, #0x60]\n"
+    "fmla v31.4s, v7.4s, v13.4s\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v30.4s, v6.4s, v13.4s\n"
+    "fmla v29.4s, v4.4s, v13.4s\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v28.4s, v8.4s, v11.4s\n"
+    "ldr q11, [x24, x14]\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    "fmla v30.4s, v0.4s, v12.4s\n"
+    "fmla v29.4s, v5.4s, v10.4s\n"
+    "ldr q12, [x23, x14]\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    "ldr q16, [x15, #0x0]\n"
+    "fmla v31.4s, v2.4s, v9.4s\n"
+    "ldr q4, [x15, #0x50]\n"
+    "fmla v30.4s, v1.4s, v9.4s\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "ldr q9, [x22, x14]\n"
+    "fmla v28.4s, v2.4s, v12.4s\n"
+    "ldr x22, [x16, #0x20]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "fmla v31.4s, v8.4s, v10.4s\n"
+    "ldr q1, [x15, #0x20]\n"
+    "fmla v30.4s, v7.4s, v10.4s\n"
+    "ldr q10, [x21, x14]\n"
+    "fmla v29.4s, v3.4s, v9.4s\n"
+    "ldr q13, [x22, x11]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x20, x14]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "fmla v28.4s, v5.4s, v10.4s\n"
+    "ldr q12, [x19, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.4s, v6.4s, v9.4s\n"
+    "ldr q9, [x26, x11]\n"
+    "fmla v29.4s, v7.4s, v11.4s\n"
+    "ldr q3, [x15, #0x40]\n"
+    "fmla v30.4s, v8.4s, v10.4s\n"
+    "ldr q10, [x25, x11]\n"
+    "fmla v28.4s, v6.4s, v11.4s\n"
+    "ldr q11, [x24, x11]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "fmla v29.4s, v8.4s, v12.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "ldr q6, [x15, #0x70]\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "ldr q8, [x15, #0x90]\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "ldr q12, [x23, x11]\n"
+    "add x11, x11, #0x10\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "ldr q7, [x15, #0x80]\n"
+    "cmp x11, x27, LSL #4\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "str q31, [x13, x28]\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "add x15, x15, #0xa0\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "str q30, [x12, x28]\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "str q29, [x10, x28]\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "str q28, [x9, x28]\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+    "ldr x21, [x16, #0x28]\n"
+    "add x28, x28, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
+    "ldr x20, [x16, #0x30]\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+    "ldr x19, [x16, #0x38]\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+    "ldr q9, [x21, x14]\n"
+    "ldr x26, [x16, #0x40]\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ldr x25, [x16, #0x48]\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v12.4s\n"
+    "ldr q11, [x20, x14]\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "ldr q10, [x25, x14]\n"
+    "ldr x24, [x16, #0x50]\n"
+    "fmla v31.4s, v5.4s, v12.4s\n"
+    "ldr x23, [x16, #0x58]\n"
+    "fmla v30.4s, v4.4s, v12.4s\n"
+    "fmla v29.4s, v6.4s, v9.4s\n"
+    "ldr q12, [x19, x14]\n"
+    "fmla v28.4s, v3.4s, v13.4s\n"
+    "ldr q9, [x26, x14]\n"
+    "ldr x22, [x16, #0x60]\n"
+    "fmla v31.4s, v7.4s, v13.4s\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v30.4s, v6.4s, v13.4s\n"
+    "fmla v29.4s, v4.4s, v13.4s\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v28.4s, v8.4s, v11.4s\n"
+    "ldr q11, [x24, x14]\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "fmla v30.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x23, x14]\n"
+    "fmla v29.4s, v5.4s, v10.4s\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "fmla v31.4s, v2.4s, v9.4s\n"
+    "fmla v30.4s, v1.4s, v9.4s\n"
+    "ldr q9, [x22, x14]\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "fmla v28.4s, v2.4s, v12.4s\n"
+    "fmla v31.4s, v8.4s, v10.4s\n"
+    "fmla v30.4s, v7.4s, v10.4s\n"
+    "ldr q10, [x21, x14]\n"
+    "fmla v29.4s, v3.4s, v9.4s\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x20, x14]\n"
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "ldr q12, [x19, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v28.4s, v5.4s, v10.4s\n"
+    "fmla v31.4s, v6.4s, v9.4s\n"
+    "fmla v29.4s, v7.4s, v11.4s\n"
+    "fmla v30.4s, v8.4s, v10.4s\n"
+    "fmla v28.4s, v6.4s, v11.4s\n"
+    "fmla v29.4s, v8.4s, v12.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "str q31, [x13, x28]\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "str q30, [x12, x28]\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "str q29, [x10, x28]\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "str q28, [x9, x28]\n"
+    "3:"  // Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 30f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "mov x28, x14\n"
+    "ldr q1, [x15, #0x20]\n"
+    "add x13, x13, x28\n"
+    "ldr q2, [x15, #0x30]\n"
+    "add x12, x12, x28\n"
+    "ldr q3, [x15, #0x40]\n"
+    "add x10, x10, x28\n"
+    "ldr q4, [x15, #0x50]\n"
+    "add x9, x9, x28\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "ldr x26, [x16, #0x0]\n"
+    "ldr x25, [x16, #0x8]\n"
+    "add x26, x26, x14\n"
+    "ldr x24, [x16, #0x10]\n"
+    "ldr x23, [x16, #0x18]\n"
+    "add x25, x25, x14\n"
+    "ldr x22, [x16, #0x20]\n"
+    "add x24, x24, x14\n"
+    "add x23, x23, x14\n"
+    "add x22, x22, x14\n"
+    "tbz %x[n_channels], #1, 4f\n"
+    "ld1 { v9.d }[0], [x26], #0x8\n"
+    "ld1 { v10.d }[0], [x25], #0x8\n"
+    "ld1 { v11.d }[0], [x24], #0x8\n"
+    "ld1 { v12.d }[0], [x23], #0x8\n"
+    "ld1 { v13.d }[0], [x22], #0x8\n"
+    "tbz %x[n_channels], #0, 5f\n"
+    "ld1 { v9.s }[2], [x26], #0x4\n"
+    "ld1 { v10.s }[2], [x25], #0x4\n"
+    "ld1 { v11.s }[2], [x24], #0x4\n"
+    "ld1 { v12.s }[2], [x23], #0x4\n"
+    "ld1 { v13.s }[2], [x22], #0x4\n"
+    "b 5f\n"
+    "4:"  // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+    "ld1 { v9.s }[0], [x26], #0x4\n"
+    "ld1 { v10.s }[0], [x25], #0x4\n"
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "ld1 { v12.s }[0], [x23], #0x4\n"
+    "ld1 { v13.s }[0], [x22], #0x4\n"
+    "5:"  // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+    "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+    "ldr x21, [x16, #0x28]\n"
+    "add x21, x21, x14\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v12.4s\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v31.4s, v5.4s, v12.4s\n"
+    "fmla v30.4s, v4.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v9.d }[0], [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v9.s }[2], [x21], #0x4\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load input (3, 0): Bit 1: Unset
+    "ld1 { v9.s }[0], [x21], #0x4\n"
+    "7:"  // Oddments: Load input (3, 0): Bit 1: End
+    "fmla v29.4s, v6.4s, v9.4s\n"
+    "ldr x20, [x16, #0x30]\n"
+    "fmla v31.4s, v7.4s, v13.4s\n"
+    "add x20, x20, x14\n"
+    "fmla v30.4s, v6.4s, v13.4s\n"
+    "fmla v28.4s, v3.4s, v13.4s\n"
+    "fmla v29.4s, v4.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v11.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v11.s }[2], [x20], #0x4\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load input (3, 3): Bit 1: Unset
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "9:"  // Oddments: Load input (3, 3): Bit 1: End
+    "fmla v28.4s, v8.4s, v11.4s\n"
+    "ldr x19, [x16, #0x38]\n"
+    "add x19, x19, x14\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v12.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v12.s }[2], [x19], #0x4\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load input (0, 1): Bit 1: Unset
+    "ld1 { v12.s }[0], [x19], #0x4\n"
+    "11:"  // Oddments: Load input (0, 1): Bit 1: End
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "ldr x26, [x16, #0x40]\n"
+    "fmla v30.4s, v0.4s, v12.4s\n"
+    "add x26, x26, x14\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ld1 { v9.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v9.s }[2], [x26], #0x4\n"
+    "b 13f\n"
+    "12:"  // Oddments: Load input (0, 2): Bit 1: Unset
+    "ld1 { v9.s }[0], [x26], #0x4\n"
+    "13:"  // Oddments: Load input (0, 2): Bit 1: End
+    "fmla v31.4s, v2.4s, v9.4s\n"
+    "ldr x25, [x16, #0x48]\n"
+    "fmla v30.4s, v1.4s, v9.4s\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 14f\n"
+    "ld1 { v10.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 15f\n"
+    "ld1 { v10.s }[2], [x25], #0x4\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load input (2, 2): Bit 1: Unset
+    "ld1 { v10.s }[0], [x25], #0x4\n"
+    "15:"  // Oddments: Load input (2, 2): Bit 1: End
+    "fmla v31.4s, v8.4s, v10.4s\n"
+    "ldr x24, [x16, #0x50]\n"
+    "fmla v30.4s, v7.4s, v10.4s\n"
+    "add x24, x24, x14\n"
+    "fmla v29.4s, v5.4s, v10.4s\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v11.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 17f\n"
+    "ld1 { v11.s }[2], [x24], #0x4\n"
+    "b 17f\n"
+    "16:"  // Oddments: Load input (1, 0): Bit 1: Unset
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "17:"  // Oddments: Load input (1, 0): Bit 1: End
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr x23, [x16, #0x58]\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "add x23, x23, x14\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v12.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v12.s }[2], [x23], #0x4\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load input (1, 3): Bit 1: Unset
+    "ld1 { v12.s }[0], [x23], #0x4\n"
+    "19:"  // Oddments: Load input (1, 3): Bit 1: End
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "ldr x22, [x16, #0x60]\n"
+    "fmla v28.4s, v2.4s, v12.4s\n"
+    "add x22, x22, x14\n"
+    "tbz %x[n_channels], #1, 20f\n"
+    "ld1 { v9.d }[0], [x22], #0x8\n"
+    "tbz %x[n_channels], #0, 21f\n"
+    "ld1 { v9.s }[2], [x22], #0x4\n"
+    "b 21f\n"
+    "20:"  // Oddments: Load input (2, 0): Bit 1: Unset
+    "ld1 { v9.s }[0], [x22], #0x4\n"
+    "21:"  // Oddments: Load input (2, 0): Bit 1: End
+    "fmla v31.4s, v6.4s, v9.4s\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v29.4s, v3.4s, v9.4s\n"
+    "add x21, x21, x14\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "ld1 { v10.d }[0], [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "ld1 { v10.s }[2], [x21], #0x4\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load input (2, 3): Bit 1: Unset
+    "ld1 { v10.s }[0], [x21], #0x4\n"
+    "23:"  // Oddments: Load input (2, 3): Bit 1: End
+    "fmla v30.4s, v8.4s, v10.4s\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v28.4s, v5.4s, v10.4s\n"
+    "add x20, x20, x14\n"
+    "tbz %x[n_channels], #1, 24f\n"
+    "ld1 { v11.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 25f\n"
+    "ld1 { v11.s }[2], [x20], #0x4\n"
+    "b 25f\n"
+    "24:"  // Oddments: Load input (3, 1): Bit 1: Unset
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "25:"  // Oddments: Load input (3, 1): Bit 1: End
+    "fmla v29.4s, v7.4s, v11.4s\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v28.4s, v6.4s, v11.4s\n"
+    "add x19, x19, x14\n"
+    "tbz %x[n_channels], #1, 26f\n"
+    "ld1 { v12.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 27f\n"
+    "ld1 { v12.s }[2], [x19], #0x4\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load input (3, 2): Bit 1: Unset
+    "ld1 { v12.s }[0], [x19], #0x4\n"
+    "27:"  // Oddments: Load input (3, 2): Bit 1: End
+    "fmla v29.4s, v8.4s, v12.4s\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "tbz %x[n_channels], #1, 28f\n"
+    "st1 { v31.d }[0], [x13], #0x8\n"
+    "st1 { v30.d }[0], [x12], #0x8\n"
+    "st1 { v29.d }[0], [x10], #0x8\n"
+    "st1 { v28.d }[0], [x9], #0x8\n"
+    "tbz %x[n_channels], #0, 29f\n"
+    "st1 { v31.s }[2], [x13], #0x4\n"
+    "st1 { v30.s }[2], [x12], #0x4\n"
+    "st1 { v29.s }[2], [x10], #0x4\n"
+    "st1 { v28.s }[2], [x9], #0x4\n"
+    "b 29f\n"
+    "28:"  // Oddments: Store: Bit 1: Unset
+    "st1 { v31.s }[0], [x13], #0x4\n"
+    "st1 { v30.s }[0], [x12], #0x4\n"
+    "st1 { v29.s }[0], [x10], #0x4\n"
+    "st1 { v28.s }[0], [x9], #0x4\n"
+    "29:"  // Oddments: Store: Bit 1: End
+
+    "30:"  // End
+
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000..6a882ec
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 3;
+  constexpr static unsigned int output_cols = 3;
+
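+  // A 3x3 output tile of a 3x3 stride-1 kernel reads a 5x5 input patch:
+  // input = (output - 1) * stride + kernel.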
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
+  indirect_kern_type indirect_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+  a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..401528a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,825 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
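+  // Same structure as the 2x2 kernel, but with nine accumulators (v23-v31)
+  // covering the 3x3 output tile.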
+  __asm__ __volatile__(
+    "mov x7, #0x0\n"
+    "mov x8, #0x0\n"
+    "1:"  // Tile loop
+    "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x26, #0x3\n"
+    "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x25, #0x3\n"
+    "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x24, %x[params_struct], %[offsetof_args_min]\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "add x21, %x[params_struct], %[offsetof_args_max]\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mov x23, #0x0\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x7, x22\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x8, x16, x19\n" // offset += tile_j * ld_input_col
+    "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+    "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x15, x15, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1r { v18.4s }, [x24]\n"
+    "add x12, x15, x22, LSL #2\n"
+    "ld1r { v17.4s }, [x21]\n"
+    "add x11, x12, x22, LSL #2\n"
+    "lsl x16, x16, #0x2\n"
+    "add x10, x11, x22, LSL #2\n"
+    "add x9, x10, x22, LSL #2\n"
+    "add x28, x16, x16\n"
+    "add x27, x28, x16\n"
+    "add x26, x27, x16\n"
+    "mul x19, x7, x20\n" // offset = tile_i * ld_output_row
+    "madd x19, x8, x14, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x25\n" // offset *= output_tile_size
+    "add x13, x13, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "add x25, x13, x20, LSL #2\n"
+    "add x24, x25, x20, LSL #2\n"
+    "lsl x14, x14, #0x2\n"
+    "add x22, x14, x14\n"
+    "mov x21, #0x10\n" // cntb _, ALL, #1
+    "sub x20, XZR, x21\n"
+    "lsr x19, %x[n_channels], #0x2\n"
+    "cbz x19, 4f\n"
+    "ldr q16, [x17, #0x0]\n"
+    "ldr q0, [x17, #0x10]\n"
+    "cmp x21, x19, LSL #4\n"
+    "ldr q1, [x17, #0x20]\n"
+    "ldr q2, [x17, #0x30]\n"
+    "ldr q3, [x17, #0x40]\n"
+    "ldr q4, [x17, #0x50]\n"
+    "ldr q5, [x17, #0x60]\n"
+    "ldr q6, [x17, #0x70]\n"
+    "ldr q7, [x17, #0x80]\n"
+    "ldr q8, [x17, #0x90]\n"
+    "add x17, x17, #0xa0\n"
+    "ldr q9, [x11, x28]\n"
+    "ld1 { v10.4s }, [x15]\n"
+    "ldr q11, [x15, x26]\n"
+    "ld1 { v12.4s }, [x9]\n"
+    "ldr q13, [x12, x28]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "add x20, x20, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "add x23, x23, #0x10\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "add x21, x21, #0x10\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+    "cmp x21, x19, LSL #4\n"
+    "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+    "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
+    "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+    "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
+    "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+    "ldr q16, [x17, #0x0]\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x11, x27]\n"
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x11, x16]\n"
+    "fmla v25.4s, v6.4s, v12.4s\n"
+    "ldr q12, [x9, x26]\n"
+    "fmla v30.4s, v4.4s, v13.4s\n"
+    "fmla v31.4s, v5.4s, v13.4s\n"
+    "fmla v29.4s, v3.4s, v13.4s\n"
+    "fmla v28.4s, v2.4s, v13.4s\n"
+    "fmla v27.4s, v1.4s, v13.4s\n"
+    "fmla v26.4s, v0.4s, v13.4s\n"
+    "ldr q13, [x15, x16]\n"
+    "fmla v23.4s, v8.4s, v12.4s\n"
+    "ldr q12, [x15, x27]\n"
+    "fmla v31.4s, v7.4s, v11.4s\n"
+    "fmla v30.4s, v6.4s, v11.4s\n"
+    "fmla v28.4s, v4.4s, v11.4s\n"
+    "fmla v27.4s, v3.4s, v11.4s\n"
+    "fmla v25.4s, v1.4s, v11.4s\n"
+    "fmla v24.4s, v0.4s, v11.4s\n"
+    "ld1 { v11.4s }, [x12]\n"
+    "fmla v31.4s, v1.4s, v13.4s\n"
+    "fmla v30.4s, v0.4s, v13.4s\n"
+    "ldr q13, [x12, x26]\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "fmla v27.4s, v5.4s, v10.4s\n"
+    "fmla v26.4s, v4.4s, v10.4s\n"
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "ld1 { v12.4s }, [x10]\n"
+    "fmla v29.4s, v7.4s, v10.4s\n"
+    "fmla v24.4s, v2.4s, v10.4s\n"
+    "fmla v23.4s, v1.4s, v10.4s\n"
+    "fmla v30.4s, v8.4s, v10.4s\n"
+    "ldr q10, [x10, x28]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "fmla v28.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x10, x26]\n"
+    "fmla v29.4s, v5.4s, v13.4s\n"
+    "fmla v26.4s, v2.4s, v13.4s\n"
+    "ldr q13, [x9, x16]\n"
+    "fmla v25.4s, v3.4s, v12.4s\n"
+    "fmla v28.4s, v6.4s, v12.4s\n"
+    "ldr q12, [x12, x16]\n"
+    "fmla v27.4s, v7.4s, v10.4s\n"
+    "fmla v26.4s, v6.4s, v10.4s\n"
+    "fmla v25.4s, v5.4s, v10.4s\n"
+    "fmla v28.4s, v8.4s, v10.4s\n"
+    "fmla v24.4s, v4.4s, v10.4s\n"
+    "fmla v23.4s, v3.4s, v10.4s\n"
+    "fmla v26.4s, v8.4s, v11.4s\n"
+    "fmla v25.4s, v7.4s, v13.4s\n"
+    "fmla v24.4s, v6.4s, v13.4s\n"
+    "ldr q13, [x9, x27]\n"
+    "fmla v23.4s, v5.4s, v11.4s\n"
+    "ldr q11, [x12, x27]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v27.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x10, x16]\n"
+    "fmla v29.4s, v4.4s, v11.4s\n"
+    "fmla v30.4s, v5.4s, v11.4s\n"
+    "fmla v26.4s, v1.4s, v11.4s\n"
+    "fmla v27.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x15, x28]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v24.4s, v8.4s, v13.4s\n"
+    "ld1 { v10.4s }, [x15]\n"
+    "fmla v23.4s, v7.4s, v13.4s\n"
+    "ldr q13, [x10, x27]\n"
+    "add x10, x10, #0x10\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmla v27.4s, v6.4s, v12.4s\n"
+    "fmla v25.4s, v4.4s, v12.4s\n"
+    "fmla v24.4s, v3.4s, v12.4s\n"
+    "ld1 { v12.4s }, [x11]\n"
+    "fmla v31.4s, v2.4s, v11.4s\n"
+    "fmla v30.4s, v1.4s, v11.4s\n"
+    "ldr q1, [x17, #0x20]\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x11, x26]\n"
+    "add x11, x11, #0x10\n"
+    "fmla v27.4s, v8.4s, v13.4s\n"
+    "ldr q9, [x11, x28]\n"
+    "fmla v26.4s, v7.4s, v13.4s\n"
+    "fmla v24.4s, v5.4s, v13.4s\n"
+    "fmla v23.4s, v4.4s, v13.4s\n"
+    "ldr q13, [x9, x28]\n"
+    "add x9, x9, #0x10\n"
+    "fmla v31.4s, v6.4s, v12.4s\n"
+    "ldr q4, [x17, #0x50]\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "ldr q3, [x17, #0x40]\n"
+    "fmla v25.4s, v0.4s, v12.4s\n"
+    "ld1 { v12.4s }, [x9]\n"
+    "fmla v29.4s, v8.4s, v11.4s\n"
+    "ldr q0, [x17, #0x10]\n"
+    "fmla v26.4s, v5.4s, v11.4s\n"
+    "ldr q5, [x17, #0x60]\n"
+    "fmla v23.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x15, x26]\n"
+    "fmla v25.4s, v8.4s, v13.4s\n"
+    "ldr q2, [x17, #0x30]\n"
+    "fmla v24.4s, v7.4s, v13.4s\n"
+    "ldr q7, [x17, #0x80]\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "ldr q8, [x17, #0x90]\n"
+    "fmla v23.4s, v6.4s, v13.4s\n"
+    "ldr q13, [x12, x28]\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "ldr q6, [x17, #0x70]\n"
+    "add x17, x17, #0xa0\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "st1 { v31.4s }, [x13]\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "str q30, [x13, x14]\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "str q29, [x13, x22]\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "add x13, x13, #0x10\n"
+    "fmax v27.4s, v27.4s, v18.4s\n"
+    "st1 { v28.4s }, [x25]\n"
+    "fmax v26.4s, v26.4s, v18.4s\n"
+    "fmax v25.4s, v25.4s, v18.4s\n"
+    "fmin v27.4s, v27.4s, v17.4s\n"
+    "str q27, [x25, x14]\n"
+    "fmin v26.4s, v26.4s, v17.4s\n"
+    "fmin v25.4s, v25.4s, v17.4s\n"
+    "str q26, [x25, x22]\n"
+    "fmax v24.4s, v24.4s, v18.4s\n"
+    "add x25, x25, #0x10\n"
+    "fmax v23.4s, v23.4s, v18.4s\n"
+    "st1 { v25.4s }, [x24]\n"
+    "fmin v24.4s, v24.4s, v17.4s\n"
+    "str q24, [x24, x14]\n"
+    "fmin v23.4s, v23.4s, v17.4s\n"
+    "str q23, [x24, x22]\n"
+    "add x24, x24, #0x10\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+    "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+    "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
+    "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+    "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
+    "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x11, x27]\n"
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x11, x16]\n"
+    "fmla v25.4s, v6.4s, v12.4s\n"
+    "ldr q12, [x9, x26]\n"
+    "fmla v30.4s, v4.4s, v13.4s\n"
+    "fmla v31.4s, v5.4s, v13.4s\n"
+    "fmla v29.4s, v3.4s, v13.4s\n"
+    "fmla v28.4s, v2.4s, v13.4s\n"
+    "fmla v27.4s, v1.4s, v13.4s\n"
+    "fmla v26.4s, v0.4s, v13.4s\n"
+    "ldr q13, [x15, x16]\n"
+    "fmla v23.4s, v8.4s, v12.4s\n"
+    "ldr q12, [x15, x27]\n"
+    "fmla v31.4s, v7.4s, v11.4s\n"
+    "fmla v30.4s, v6.4s, v11.4s\n"
+    "fmla v28.4s, v4.4s, v11.4s\n"
+    "fmla v27.4s, v3.4s, v11.4s\n"
+    "fmla v25.4s, v1.4s, v11.4s\n"
+    "fmla v24.4s, v0.4s, v11.4s\n"
+    "ld1 { v11.4s }, [x12]\n"
+    "fmla v31.4s, v1.4s, v13.4s\n"
+    "fmla v30.4s, v0.4s, v13.4s\n"
+    "ldr q13, [x12, x26]\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "fmla v27.4s, v5.4s, v10.4s\n"
+    "fmla v26.4s, v4.4s, v10.4s\n"
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "ld1 { v12.4s }, [x10]\n"
+    "fmla v29.4s, v7.4s, v10.4s\n"
+    "fmla v24.4s, v2.4s, v10.4s\n"
+    "fmla v23.4s, v1.4s, v10.4s\n"
+    "fmla v30.4s, v8.4s, v10.4s\n"
+    "ldr q10, [x10, x28]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "fmla v28.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x10, x26]\n"
+    "fmla v29.4s, v5.4s, v13.4s\n"
+    "fmla v26.4s, v2.4s, v13.4s\n"
+    "ldr q13, [x9, x16]\n"
+    "fmla v25.4s, v3.4s, v12.4s\n"
+    "fmla v28.4s, v6.4s, v12.4s\n"
+    "ldr q12, [x12, x16]\n"
+    "fmla v27.4s, v7.4s, v10.4s\n"
+    "fmla v26.4s, v6.4s, v10.4s\n"
+    "fmla v25.4s, v5.4s, v10.4s\n"
+    "fmla v28.4s, v8.4s, v10.4s\n"
+    "fmla v24.4s, v4.4s, v10.4s\n"
+    "fmla v23.4s, v3.4s, v10.4s\n"
+    "fmla v26.4s, v8.4s, v11.4s\n"
+    "fmla v25.4s, v7.4s, v13.4s\n"
+    "fmla v24.4s, v6.4s, v13.4s\n"
+    "ldr q13, [x9, x27]\n"
+    "fmla v23.4s, v5.4s, v11.4s\n"
+    "ldr q11, [x12, x27]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v27.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x10, x16]\n"
+    "fmla v29.4s, v4.4s, v11.4s\n"
+    "fmla v30.4s, v5.4s, v11.4s\n"
+    "fmla v26.4s, v1.4s, v11.4s\n"
+    "fmla v27.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x15, x28]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v24.4s, v8.4s, v13.4s\n"
+    "fmla v23.4s, v7.4s, v13.4s\n"
+    "ldr q13, [x10, x27]\n"
+    "add x10, x10, #0x10\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmla v27.4s, v6.4s, v12.4s\n"
+    "fmla v25.4s, v4.4s, v12.4s\n"
+    "fmla v24.4s, v3.4s, v12.4s\n"
+    "ld1 { v12.4s }, [x11]\n"
+    "fmla v31.4s, v2.4s, v11.4s\n"
+    "fmla v30.4s, v1.4s, v11.4s\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x11, x26]\n"
+    "add x11, x11, #0x10\n"
+    "fmla v27.4s, v8.4s, v13.4s\n"
+    "fmla v26.4s, v7.4s, v13.4s\n"
+    "fmla v24.4s, v5.4s, v13.4s\n"
+    "fmla v23.4s, v4.4s, v13.4s\n"
+    "ldr q13, [x9, x28]\n"
+    "add x9, x9, #0x10\n"
+    "fmla v31.4s, v6.4s, v12.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "fmla v25.4s, v0.4s, v12.4s\n"
+    "fmla v29.4s, v8.4s, v11.4s\n"
+    "fmla v26.4s, v5.4s, v11.4s\n"
+    "fmla v23.4s, v2.4s, v11.4s\n"
+    "fmla v25.4s, v8.4s, v13.4s\n"
+    "fmla v24.4s, v7.4s, v13.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmla v23.4s, v6.4s, v13.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "st1 { v31.4s }, [x13]\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "str q30, [x13, x14]\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "str q29, [x13, x22]\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "add x13, x13, #0x10\n"
+    "fmax v27.4s, v27.4s, v18.4s\n"
+    "st1 { v28.4s }, [x25]\n"
+    "fmax v26.4s, v26.4s, v18.4s\n"
+    "fmax v25.4s, v25.4s, v18.4s\n"
+    "fmin v27.4s, v27.4s, v17.4s\n"
+    "str q27, [x25, x14]\n"
+    "fmin v26.4s, v26.4s, v17.4s\n"
+    "fmin v25.4s, v25.4s, v17.4s\n"
+    "str q26, [x25, x22]\n"
+    "fmax v24.4s, v24.4s, v18.4s\n"
+    "add x25, x25, #0x10\n"
+    "fmax v23.4s, v23.4s, v18.4s\n"
+    "st1 { v25.4s }, [x24]\n"
+    "fmin v24.4s, v24.4s, v17.4s\n"
+    "str q24, [x24, x14]\n"
+    "fmin v23.4s, v23.4s, v17.4s\n"
+    "str q23, [x24, x22]\n"
+    "add x24, x24, #0x10\n"
+    "4:"  // Tile loop: Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 49f\n"
+    "ldr q16, [x17, #0x0]\n"
+    "ldr q0, [x17, #0x10]\n"
+    "add x23, x11, x28\n"
+    "ldr q1, [x17, #0x20]\n"
+    "add x22, x15, XZR\n"
+    "ldr q2, [x17, #0x30]\n"
+    "add x21, x15, x26\n"
+    "ldr q3, [x17, #0x40]\n"
+    "add x20, x9, XZR\n"
+    "ldr q4, [x17, #0x50]\n"
+    "add x19, x12, x28\n"
+    "ldr q5, [x17, #0x60]\n"
+    "ldr q6, [x17, #0x70]\n"
+    "ldr q7, [x17, #0x80]\n"
+    "ldr q8, [x17, #0x90]\n"
+    "tbz %x[n_channels], #1, 5f\n"
+    "ldr d9, [x23], #0x8\n"
+    "ldr d10, [x22], #0x8\n"
+    "ldr d11, [x21], #0x8\n"
+    "ldr d12, [x20], #0x8\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v9.s }[2], [x23]\n"
+    "ld1 { v10.s }[2], [x22]\n"
+    "ld1 { v11.s }[2], [x21]\n"
+    "ld1 { v12.s }[2], [x20]\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 6f\n"
+    "5:"  // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+    "ldr s9, [x23, #0x0]\n"
+    "ldr s10, [x22, #0x0]\n"
+    "ldr s11, [x21, #0x0]\n"
+    "ldr s12, [x20, #0x0]\n"
+    "ldr s13, [x19, #0x0]\n"
+    "6:"  // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+    "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "add x19, x9, x26\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+    "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+    "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
+    "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+    "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
+    "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "fmla v25.4s, v6.4s, v12.4s\n"
+    "fmla v30.4s, v4.4s, v13.4s\n"
+    "fmla v31.4s, v5.4s, v13.4s\n"
+    "fmla v29.4s, v3.4s, v13.4s\n"
+    "fmla v28.4s, v2.4s, v13.4s\n"
+    "fmla v27.4s, v1.4s, v13.4s\n"
+    "fmla v26.4s, v0.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 8f\n"
+    "7:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "8:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+    "fmla v23.4s, v8.4s, v12.4s\n"
+    "add x19, x11, x16\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 10f\n"
+    "9:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "10:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+    "fmla v31.4s, v7.4s, v11.4s\n"
+    "add x19, x15, x16\n"
+    "fmla v30.4s, v6.4s, v11.4s\n"
+    "fmla v28.4s, v4.4s, v11.4s\n"
+    "fmla v27.4s, v3.4s, v11.4s\n"
+    "fmla v25.4s, v1.4s, v11.4s\n"
+    "fmla v24.4s, v0.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 12f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 12f\n"
+    "11:"  // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+    "ldr s13, [x19, #0x0]\n"
+    "12:"  // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+    "fmla v31.4s, v1.4s, v13.4s\n"
+    "add x19, x15, x27\n"
+    "fmla v30.4s, v0.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 14f\n"
+    "13:"  // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "14:"  // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "add x19, x11, x27\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 16f\n"
+    "15:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "16:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+    "fmla v30.4s, v8.4s, v10.4s\n"
+    "add x19, x12, XZR\n"
+    "fmla v29.4s, v7.4s, v10.4s\n"
+    "fmla v27.4s, v5.4s, v10.4s\n"
+    "fmla v26.4s, v4.4s, v10.4s\n"
+    "fmla v24.4s, v2.4s, v10.4s\n"
+    "fmla v23.4s, v1.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 17f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 18f\n"
+    "17:"  // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "18:"  // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "add x19, x12, x26\n"
+    "fmla v28.4s, v0.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 19f\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 20f\n"
+    "19:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+    "ldr s13, [x19, #0x0]\n"
+    "20:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+    "fmla v29.4s, v5.4s, v13.4s\n"
+    "add x19, x10, XZR\n"
+    "fmla v26.4s, v2.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 21f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 22f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 22f\n"
+    "21:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "22:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+    "fmla v28.4s, v6.4s, v12.4s\n"
+    "add x19, x10, x28\n"
+    "fmla v25.4s, v3.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 23f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 24f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 24f\n"
+    "23:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "24:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+    "fmla v28.4s, v8.4s, v10.4s\n"
+    "add x19, x10, x26\n"
+    "fmla v27.4s, v7.4s, v10.4s\n"
+    "fmla v26.4s, v6.4s, v10.4s\n"
+    "fmla v25.4s, v5.4s, v10.4s\n"
+    "fmla v24.4s, v4.4s, v10.4s\n"
+    "fmla v23.4s, v3.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 25f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 26f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 26f\n"
+    "25:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "26:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+    "fmla v26.4s, v8.4s, v11.4s\n"
+    "add x19, x9, x16\n"
+    "fmla v23.4s, v5.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 27f\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 28f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 28f\n"
+    "27:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+    "ldr s13, [x19, #0x0]\n"
+    "28:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+    "fmla v25.4s, v7.4s, v13.4s\n"
+    "add x19, x12, x16\n"
+    "fmla v24.4s, v6.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 29f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 30f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 30f\n"
+    "29:"  // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "30:"  // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "add x19, x12, x27\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v27.4s, v0.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 31f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 32f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 32f\n"
+    "31:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "32:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+    "fmla v30.4s, v5.4s, v11.4s\n"
+    "add x19, x9, x27\n"
+    "fmla v29.4s, v4.4s, v11.4s\n"
+    "fmla v27.4s, v2.4s, v11.4s\n"
+    "fmla v26.4s, v1.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 33f\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 34f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 34f\n"
+    "33:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+    "ldr s13, [x19, #0x0]\n"
+    "34:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+    "fmla v24.4s, v8.4s, v13.4s\n"
+    "add x19, x10, x16\n"
+    "fmla v23.4s, v7.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 35f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 36f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 36f\n"
+    "35:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "36:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "add x19, x15, x28\n"
+    "fmla v27.4s, v6.4s, v12.4s\n"
+    "fmla v25.4s, v4.4s, v12.4s\n"
+    "fmla v24.4s, v3.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 37f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 38f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 38f\n"
+    "37:"  // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "38:"  // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+    "fmla v31.4s, v2.4s, v11.4s\n"
+    "add x19, x10, x27\n"
+    "fmla v30.4s, v1.4s, v11.4s\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 39f\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 40f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 40f\n"
+    "39:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+    "ldr s13, [x19, #0x0]\n"
+    "40:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+    "fmla v27.4s, v8.4s, v13.4s\n"
+    "add x19, x11, XZR\n"
+    "fmla v26.4s, v7.4s, v13.4s\n"
+    "fmla v24.4s, v5.4s, v13.4s\n"
+    "fmla v23.4s, v4.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 41f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 42f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 42f\n"
+    "41:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "42:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+    "fmla v31.4s, v6.4s, v12.4s\n"
+    "add x19, x11, x26\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "fmla v25.4s, v0.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 43f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 44f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 44f\n"
+    "43:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "44:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+    "fmla v29.4s, v8.4s, v11.4s\n"
+    "add x19, x9, x28\n"
+    "fmla v26.4s, v5.4s, v11.4s\n"
+    "fmla v23.4s, v2.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 45f\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 46f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 46f\n"
+    "45:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+    "ldr s13, [x19, #0x0]\n"
+    "46:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+    "fmla v25.4s, v8.4s, v13.4s\n"
+    "fmla v24.4s, v7.4s, v13.4s\n"
+    "fmla v23.4s, v6.4s, v13.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "fmax v27.4s, v27.4s, v18.4s\n"
+    "fmax v26.4s, v26.4s, v18.4s\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "fmin v27.4s, v27.4s, v17.4s\n"
+    "fmin v26.4s, v26.4s, v17.4s\n"
+    "fmax v25.4s, v25.4s, v18.4s\n"
+    "fmax v24.4s, v24.4s, v18.4s\n"
+    "fmax v23.4s, v23.4s, v18.4s\n"
+    "fmin v25.4s, v25.4s, v17.4s\n"
+    "fmin v24.4s, v24.4s, v17.4s\n"
+    "fmin v23.4s, v23.4s, v17.4s\n"
+    "tbz %x[n_channels], #1, 47f\n"
+    "mov x19, x13\n"
+    "st1 { v31.d }[0], [x19], x14\n"
+    "add x13, x13, #0x8\n"
+    "st1 { v30.d }[0], [x19], x14\n"
+    "mov x20, x25\n"
+    "st1 { v29.d }[0], [x19]\n"
+    "st1 { v28.d }[0], [x20], x14\n"
+    "add x25, x25, #0x8\n"
+    "st1 { v27.d }[0], [x20], x14\n"
+    "mov x19, x24\n"
+    "st1 { v26.d }[0], [x20]\n"
+    "add x24, x24, #0x8\n"
+    "st1 { v25.d }[0], [x19], x14\n"
+    "st1 { v24.d }[0], [x19], x14\n"
+    "st1 { v23.d }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 48f\n"
+    "mov x21, x13\n"
+    "st1 { v31.s }[2], [x21], x14\n"
+    "mov x20, x25\n"
+    "st1 { v30.s }[2], [x21], x14\n"
+    "st1 { v28.s }[2], [x20], x14\n"
+    "mov x19, x24\n"
+    "st1 { v29.s }[2], [x21]\n"
+    "st1 { v27.s }[2], [x20], x14\n"
+    "st1 { v26.s }[2], [x20]\n"
+    "st1 { v25.s }[2], [x19], x14\n"
+    "st1 { v24.s }[2], [x19], x14\n"
+    "st1 { v23.s }[2], [x19]\n"
+    "b 48f\n"
+    "47:"  // Tile loop: Oddments: Store: Bit 1: Unset
+    "mov x21, x13\n"
+    "st1 { v31.s }[0], [x21], x14\n"
+    "mov x20, x25\n"
+    "mov x19, x24\n"
+    "st1 { v30.s }[0], [x21], x14\n"
+    "st1 { v28.s }[0], [x20], x14\n"
+    "st1 { v29.s }[0], [x21]\n"
+    "st1 { v27.s }[0], [x20], x14\n"
+    "st1 { v26.s }[0], [x20]\n"
+    "st1 { v25.s }[0], [x19], x14\n"
+    "st1 { v24.s }[0], [x19], x14\n"
+    "st1 { v23.s }[0], [x19]\n"
+    "48:"  // Tile loop: Oddments: Store: Bit 1: End
+
+    "49:"  // Tile loop: End
+    "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x7, #0x1\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x8, x8, #0x1\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x8, x19\n"
+    "csel x8, x8, XZR, LT\n"
+    "csel x7, x7, x21, LT\n"
+    "cmp x7, x20\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..39ec001
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,903 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[25];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
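+      // Permute the 25 pointers of the 5x5 input patch (row-major) into the
+      // order the kernel consumes them; the first five -- (2, 2), (0, 0),
+      // (0, 4), (4, 0) and (1, 2) -- are preloaded before the channel loop.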
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[4];
+      inptrs[3] = input_ptrs[20];
+      inptrs[4] = input_ptrs[7];
+      inptrs[5] = input_ptrs[24];
+      inptrs[6] = input_ptrs[11];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[3];
+      inptrs[9] = input_ptrs[13];
+      inptrs[10] = input_ptrs[5];
+      inptrs[11] = input_ptrs[9];
+      inptrs[12] = input_ptrs[15];
+      inptrs[13] = input_ptrs[17];
+      inptrs[14] = input_ptrs[19];
+      inptrs[15] = input_ptrs[21];
+      inptrs[16] = input_ptrs[6];
+      inptrs[17] = input_ptrs[8];
+      inptrs[18] = input_ptrs[23];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[18];
+      inptrs[22] = input_ptrs[10];
+      inptrs[23] = input_ptrs[14];
+      inptrs[24] = input_ptrs[22];
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "add x19, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v18.4s }, [x20]\n"
+    "ld1r { v17.4s }, [x19]\n"
+    "mov x14, #0x0\n"
+    "mov x13, #0x10\n" // cntb _, ALL, #1
+    "sub x12, XZR, x13\n"
+    "lsr x11, %x[n_channels], #0x2\n"
+    "cbz x11, 3f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "cmp x13, x11, LSL #4\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "ldp x10, x9, [x16, #0x0]\n"
+    "ldp x28, x27, [x16, #0x10]\n"
+    "ldr x26, [x16, #0x20]\n"
+    "ldr q9, [x10, x14]\n"
+    "ldr q10, [x9, x14]\n"
+    "ldr q11, [x28, x14]\n"
+    "ldr q12, [x27, x14]\n"
+    "ldr q13, [x26, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "ldr x25, [x16, #0x28]\n"
+    "add x12, x12, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "ldr x24, [x16, #0x30]\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "ldr x23, [x16, #0x38]\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+    "ldr x10, [x16, #0x40]\n"
+    "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+    "ldr x9, [x16, #0x48]\n"
+    "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
+    "ldr x28, [x16, #0x50]\n"
+    "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+    "ldr x27, [x16, #0x58]\n"
+    "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
+    "ldr x26, [x16, #0x60]\n"
+    "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+    "ldr x22, [x17, #0x0]\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x9, x14]\n"
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v25.4s, v6.4s, v12.4s\n"
+    "ldr q12, [x25, x14]\n"
+    "fmla v30.4s, v4.4s, v13.4s\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla v31.4s, v5.4s, v13.4s\n"
+    "ldr x24, [x16, #0x70]\n"
+    "fmla v29.4s, v3.4s, v13.4s\n"
+    "ldr x9, [x16, #0x88]\n"
+    "fmla v28.4s, v2.4s, v13.4s\n"
+    "ldr x21, [x17, #0x8]\n"
+    "fmla v27.4s, v1.4s, v13.4s\n"
+    "ldr x20, [x17, #0x10]\n"
+    "fmla v26.4s, v0.4s, v13.4s\n"
+    "ldr q13, [x23, x14]\n"
+    "fmla v23.4s, v8.4s, v12.4s\n"
+    "ldr q12, [x10, x14]\n"
+    "fmla v31.4s, v7.4s, v11.4s\n"
+    "ldr x23, [x16, #0x78]\n"
+    "fmla v30.4s, v6.4s, v11.4s\n"
+    "ldr x10, [x16, #0x80]\n"
+    "fmla v28.4s, v4.4s, v11.4s\n"
+    "ldr x19, [x17, #0x18]\n"
+    "fmla v27.4s, v3.4s, v11.4s\n"
+    "ldr q16, [x15, #0x0]\n"
+    "fmla v25.4s, v1.4s, v11.4s\n"
+    "fmla v24.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x28, x14]\n"
+    "fmla v31.4s, v1.4s, v13.4s\n"
+    "ldr x28, [x16, #0x90]\n"
+    "fmla v30.4s, v0.4s, v13.4s\n"
+    "ldr q13, [x27, x14]\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "ldr x27, [x16, #0x98]\n"
+    "fmla v27.4s, v5.4s, v10.4s\n"
+    "fmla v26.4s, v4.4s, v10.4s\n"
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x26, x14]\n"
+    "fmla v29.4s, v7.4s, v10.4s\n"
+    "ldr x26, [x16, #0xa0]\n"
+    "fmla v24.4s, v2.4s, v10.4s\n"
+    "fmla v23.4s, v1.4s, v10.4s\n"
+    "fmla v30.4s, v8.4s, v10.4s\n"
+    "ldr q10, [x25, x14]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla v28.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v29.4s, v5.4s, v13.4s\n"
+    "ldr x24, [x16, #0xb0]\n"
+    "fmla v26.4s, v2.4s, v13.4s\n"
+    "ldr q13, [x23, x14]\n"
+    "fmla v25.4s, v3.4s, v12.4s\n"
+    "ldr x23, [x16, #0xb8]\n"
+    "fmla v28.4s, v6.4s, v12.4s\n"
+    "ldr q12, [x10, x14]\n"
+    "fmla v27.4s, v7.4s, v10.4s\n"
+    "ldr x10, [x16, #0xc0]\n"
+    "fmla v26.4s, v6.4s, v10.4s\n"
+    "fmla v25.4s, v5.4s, v10.4s\n"
+    "fmla v28.4s, v8.4s, v10.4s\n"
+    "fmla v24.4s, v4.4s, v10.4s\n"
+    "fmla v23.4s, v3.4s, v10.4s\n"
+    "fmla v26.4s, v8.4s, v11.4s\n"
+    "fmla v25.4s, v7.4s, v13.4s\n"
+    "fmla v24.4s, v6.4s, v13.4s\n"
+    "ldr q13, [x28, x14]\n"
+    "fmla v23.4s, v5.4s, v11.4s\n"
+    "ldr q11, [x9, x14]\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v27.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x27, x14]\n"
+    "fmla v29.4s, v4.4s, v11.4s\n"
+    "fmla v30.4s, v5.4s, v11.4s\n"
+    "fmla v26.4s, v1.4s, v11.4s\n"
+    "fmla v27.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x26, x14]\n"
+    "fmla v24.4s, v8.4s, v13.4s\n"
+    "ldr x26, [x16, #0x20]\n"
+    "fmla v23.4s, v7.4s, v13.4s\n"
+    "ldr q13, [x25, x14]\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmla v27.4s, v6.4s, v12.4s\n"
+    "fmla v25.4s, v4.4s, v12.4s\n"
+    "fmla v24.4s, v3.4s, v12.4s\n"
+    "ldr q12, [x24, x14]\n"
+    "fmla v31.4s, v2.4s, v11.4s\n"
+    "fmla v30.4s, v1.4s, v11.4s\n"
+    "ldr q1, [x15, #0x20]\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x23, x14]\n"
+    "fmla v27.4s, v8.4s, v13.4s\n"
+    "fmla v26.4s, v7.4s, v13.4s\n"
+    "fmla v24.4s, v5.4s, v13.4s\n"
+    "fmla v23.4s, v4.4s, v13.4s\n"
+    "ldr q13, [x10, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.4s, v6.4s, v12.4s\n"
+    "ldp x10, x9, [x16, #0x0]\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "ldp x28, x27, [x16, #0x10]\n"
+    "fmla v25.4s, v0.4s, v12.4s\n"
+    "ldr q0, [x15, #0x10]\n"
+    "fmla v29.4s, v8.4s, v11.4s\n"
+    "ldr q9, [x10, x13]\n"
+    "fmla v26.4s, v5.4s, v11.4s\n"
+    "ldr q10, [x9, x13]\n"
+    "fmla v23.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x28, x13]\n"
+    "fmla v25.4s, v8.4s, v13.4s\n"
+    "ldr q12, [x27, x13]\n"
+    "fmla v24.4s, v7.4s, v13.4s\n"
+    "ldr q2, [x15, #0x30]\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "ldr q3, [x15, #0x40]\n"
+    "fmla v23.4s, v6.4s, v13.4s\n"
+    "ldr q13, [x26, x13]\n"
+    "add x13, x13, #0x10\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "ldr q4, [x15, #0x50]\n"
+    "cmp x13, x11, LSL #4\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "ldr q5, [x15, #0x60]\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "ldr q6, [x15, #0x70]\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "str q31, [x22, x12]\n"
+    "fmax v27.4s, v27.4s, v18.4s\n"
+    "ldr x22, [x17, #0x20]\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "ldr q7, [x15, #0x80]\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "str q30, [x21, x12]\n"
+    "fmin v27.4s, v27.4s, v17.4s\n"
+    "str q29, [x20, x12]\n"
+    "fmax v26.4s, v26.4s, v18.4s\n"
+    "ldr x21, [x17, #0x28]\n"
+    "fmax v25.4s, v25.4s, v18.4s\n"
+    "str q28, [x19, x12]\n"
+    "fmax v24.4s, v24.4s, v18.4s\n"
+    "str q27, [x22, x12]\n"
+    "fmin v26.4s, v26.4s, v17.4s\n"
+    "ldr x20, [x17, #0x30]\n"
+    "fmin v25.4s, v25.4s, v17.4s\n"
+    "ldr x19, [x17, #0x38]\n"
+    "fmin v24.4s, v24.4s, v17.4s\n"
+    "str q26, [x21, x12]\n"
+    "fmax v23.4s, v23.4s, v18.4s\n"
+    "str q25, [x20, x12]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "fmin v23.4s, v23.4s, v17.4s\n"
+    "str q24, [x19, x12]\n"
+    "str q23, [x22, x12]\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "ldr x25, [x16, #0x28]\n"
+    "add x12, x12, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "ldr x24, [x16, #0x30]\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "ldr x23, [x16, #0x38]\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+    "ldr x10, [x16, #0x40]\n"
+    "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+    "ldr x9, [x16, #0x48]\n"
+    "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
+    "ldr x28, [x16, #0x50]\n"
+    "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+    "ldr x27, [x16, #0x58]\n"
+    "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
+    "ldr x26, [x16, #0x60]\n"
+    "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+    "ldr x22, [x17, #0x0]\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x9, x14]\n"
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v25.4s, v6.4s, v12.4s\n"
+    "ldr q12, [x25, x14]\n"
+    "fmla v30.4s, v4.4s, v13.4s\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla v31.4s, v5.4s, v13.4s\n"
+    "ldr x24, [x16, #0x70]\n"
+    "fmla v29.4s, v3.4s, v13.4s\n"
+    "ldr x9, [x16, #0x88]\n"
+    "fmla v28.4s, v2.4s, v13.4s\n"
+    "ldr x21, [x17, #0x8]\n"
+    "fmla v27.4s, v1.4s, v13.4s\n"
+    "ldr x20, [x17, #0x10]\n"
+    "fmla v26.4s, v0.4s, v13.4s\n"
+    "ldr q13, [x23, x14]\n"
+    "fmla v23.4s, v8.4s, v12.4s\n"
+    "ldr q12, [x10, x14]\n"
+    "fmla v31.4s, v7.4s, v11.4s\n"
+    "ldr x23, [x16, #0x78]\n"
+    "fmla v30.4s, v6.4s, v11.4s\n"
+    "ldr x10, [x16, #0x80]\n"
+    "fmla v28.4s, v4.4s, v11.4s\n"
+    "ldr x19, [x17, #0x18]\n"
+    "fmla v27.4s, v3.4s, v11.4s\n"
+    "fmla v25.4s, v1.4s, v11.4s\n"
+    "fmla v24.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x28, x14]\n"
+    "fmla v31.4s, v1.4s, v13.4s\n"
+    "ldr x28, [x16, #0x90]\n"
+    "fmla v30.4s, v0.4s, v13.4s\n"
+    "ldr q13, [x27, x14]\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "ldr x27, [x16, #0x98]\n"
+    "fmla v27.4s, v5.4s, v10.4s\n"
+    "fmla v26.4s, v4.4s, v10.4s\n"
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x26, x14]\n"
+    "fmla v29.4s, v7.4s, v10.4s\n"
+    "ldr x26, [x16, #0xa0]\n"
+    "fmla v24.4s, v2.4s, v10.4s\n"
+    "fmla v23.4s, v1.4s, v10.4s\n"
+    "fmla v30.4s, v8.4s, v10.4s\n"
+    "ldr q10, [x25, x14]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla v28.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v29.4s, v5.4s, v13.4s\n"
+    "ldr x24, [x16, #0xb0]\n"
+    "fmla v26.4s, v2.4s, v13.4s\n"
+    "ldr q13, [x23, x14]\n"
+    "fmla v25.4s, v3.4s, v12.4s\n"
+    "ldr x23, [x16, #0xb8]\n"
+    "fmla v28.4s, v6.4s, v12.4s\n"
+    "ldr q12, [x10, x14]\n"
+    "fmla v27.4s, v7.4s, v10.4s\n"
+    "ldr x10, [x16, #0xc0]\n"
+    "fmla v26.4s, v6.4s, v10.4s\n"
+    "fmla v25.4s, v5.4s, v10.4s\n"
+    "fmla v28.4s, v8.4s, v10.4s\n"
+    "fmla v24.4s, v4.4s, v10.4s\n"
+    "fmla v23.4s, v3.4s, v10.4s\n"
+    "fmla v26.4s, v8.4s, v11.4s\n"
+    "fmla v25.4s, v7.4s, v13.4s\n"
+    "fmla v24.4s, v6.4s, v13.4s\n"
+    "ldr q13, [x28, x14]\n"
+    "fmla v23.4s, v5.4s, v11.4s\n"
+    "ldr q11, [x9, x14]\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v27.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x27, x14]\n"
+    "fmla v29.4s, v4.4s, v11.4s\n"
+    "fmla v30.4s, v5.4s, v11.4s\n"
+    "fmla v26.4s, v1.4s, v11.4s\n"
+    "fmla v27.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x26, x14]\n"
+    "fmla v24.4s, v8.4s, v13.4s\n"
+    "fmla v23.4s, v7.4s, v13.4s\n"
+    "ldr q13, [x25, x14]\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmla v27.4s, v6.4s, v12.4s\n"
+    "fmla v25.4s, v4.4s, v12.4s\n"
+    "fmla v24.4s, v3.4s, v12.4s\n"
+    "ldr q12, [x24, x14]\n"
+    "fmla v31.4s, v2.4s, v11.4s\n"
+    "fmla v30.4s, v1.4s, v11.4s\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x23, x14]\n"
+    "fmla v27.4s, v8.4s, v13.4s\n"
+    "fmla v26.4s, v7.4s, v13.4s\n"
+    "fmla v24.4s, v5.4s, v13.4s\n"
+    "fmla v23.4s, v4.4s, v13.4s\n"
+    "ldr q13, [x10, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.4s, v6.4s, v12.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "fmla v25.4s, v0.4s, v12.4s\n"
+    "fmla v29.4s, v8.4s, v11.4s\n"
+    "fmla v26.4s, v5.4s, v11.4s\n"
+    "fmla v23.4s, v2.4s, v11.4s\n"
+    "fmla v25.4s, v8.4s, v13.4s\n"
+    "fmla v24.4s, v7.4s, v13.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmla v23.4s, v6.4s, v13.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "str q31, [x22, x12]\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "ldr x22, [x17, #0x20]\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "str q30, [x21, x12]\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "fmax v27.4s, v27.4s, v18.4s\n"
+    "ldr x21, [x17, #0x28]\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "str q29, [x20, x12]\n"
+    "fmin v27.4s, v27.4s, v17.4s\n"
+    "fmax v26.4s, v26.4s, v18.4s\n"
+    "str q28, [x19, x12]\n"
+    "fmax v25.4s, v25.4s, v18.4s\n"
+    "ldr x20, [x17, #0x30]\n"
+    "fmax v24.4s, v24.4s, v18.4s\n"
+    "str q27, [x22, x12]\n"
+    "fmin v26.4s, v26.4s, v17.4s\n"
+    "ldr x19, [x17, #0x38]\n"
+    "fmin v25.4s, v25.4s, v17.4s\n"
+    "ldr x22, [x17, #0x40]\n"
+    "fmin v24.4s, v24.4s, v17.4s\n"
+    "str q26, [x21, x12]\n"
+    "fmax v23.4s, v23.4s, v18.4s\n"
+    "str q25, [x20, x12]\n"
+    "str q24, [x19, x12]\n"
+    "fmin v23.4s, v23.4s, v17.4s\n"
+    "str q23, [x22, x12]\n"
+    "3:"  // Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 48f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "mov x12, x14\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "ldr x10, [x16, #0x0]\n"
+    "add x10, x10, x14\n"
+    "ldr x9, [x16, #0x8]\n"
+    "ldr x28, [x16, #0x10]\n"
+    "add x9, x9, x14\n"
+    "ldr x27, [x16, #0x18]\n"
+    "ldr x26, [x16, #0x20]\n"
+    "add x28, x28, x14\n"
+    "add x27, x27, x14\n"
+    "add x26, x26, x14\n"
+    "tbz %x[n_channels], #1, 4f\n"
+    "ld1 { v9.d }[0], [x10], #0x8\n"
+    "ld1 { v10.d }[0], [x9], #0x8\n"
+    "ld1 { v11.d }[0], [x28], #0x8\n"
+    "ld1 { v12.d }[0], [x27], #0x8\n"
+    "ld1 { v13.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 5f\n"
+    "ld1 { v9.s }[2], [x10], #0x4\n"
+    "ld1 { v10.s }[2], [x9], #0x4\n"
+    "ld1 { v11.s }[2], [x28], #0x4\n"
+    "ld1 { v12.s }[2], [x27], #0x4\n"
+    "ld1 { v13.s }[2], [x26], #0x4\n"
+    "b 5f\n"
+    "4:"  // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+    "ld1 { v9.s }[0], [x10], #0x4\n"
+    "ld1 { v10.s }[0], [x9], #0x4\n"
+    "ld1 { v11.s }[0], [x28], #0x4\n"
+    "ld1 { v12.s }[0], [x27], #0x4\n"
+    "ld1 { v13.s }[0], [x26], #0x4\n"
+    "5:"  // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+    "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "ldr x25, [x16, #0x28]\n"
+    "add x25, x25, x14\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+    "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+    "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n"
+    "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+    "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n"
+    "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "fmla v25.4s, v6.4s, v12.4s\n"
+    "fmla v30.4s, v4.4s, v13.4s\n"
+    "fmla v31.4s, v5.4s, v13.4s\n"
+    "fmla v29.4s, v3.4s, v13.4s\n"
+    "fmla v28.4s, v2.4s, v13.4s\n"
+    "fmla v27.4s, v1.4s, v13.4s\n"
+    "fmla v26.4s, v0.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v12.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v12.s }[2], [x25], #0x4\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load input (4, 4): Bit 1: Unset
+    "ld1 { v12.s }[0], [x25], #0x4\n"
+    "7:"  // Oddments: Load input (4, 4): Bit 1: End
+    "fmla v23.4s, v8.4s, v12.4s\n"
+    "ldr x24, [x16, #0x30]\n"
+    "add x24, x24, x14\n"
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v11.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v11.s }[2], [x24], #0x4\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load input (2, 1): Bit 1: Unset
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "9:"  // Oddments: Load input (2, 1): Bit 1: End
+    "fmla v31.4s, v7.4s, v11.4s\n"
+    "ldr x23, [x16, #0x38]\n"
+    "fmla v30.4s, v6.4s, v11.4s\n"
+    "add x23, x23, x14\n"
+    "fmla v28.4s, v4.4s, v11.4s\n"
+    "fmla v27.4s, v3.4s, v11.4s\n"
+    "fmla v25.4s, v1.4s, v11.4s\n"
+    "fmla v24.4s, v0.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v13.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v13.s }[2], [x23], #0x4\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load input (0, 1): Bit 1: Unset
+    "ld1 { v13.s }[0], [x23], #0x4\n"
+    "11:"  // Oddments: Load input (0, 1): Bit 1: End
+    "fmla v31.4s, v1.4s, v13.4s\n"
+    "ldr x10, [x16, #0x40]\n"
+    "fmla v30.4s, v0.4s, v13.4s\n"
+    "add x10, x10, x14\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ld1 { v12.d }[0], [x10], #0x8\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v12.s }[2], [x10], #0x4\n"
+    "b 13f\n"
+    "12:"  // Oddments: Load input (0, 3): Bit 1: Unset
+    "ld1 { v12.s }[0], [x10], #0x4\n"
+    "13:"  // Oddments: Load input (0, 3): Bit 1: End
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "ldr x9, [x16, #0x48]\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "add x9, x9, x14\n"
+    "tbz %x[n_channels], #1, 14f\n"
+    "ld1 { v10.d }[0], [x9], #0x8\n"
+    "tbz %x[n_channels], #0, 15f\n"
+    "ld1 { v10.s }[2], [x9], #0x4\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load input (2, 3): Bit 1: Unset
+    "ld1 { v10.s }[0], [x9], #0x4\n"
+    "15:"  // Oddments: Load input (2, 3): Bit 1: End
+    "fmla v30.4s, v8.4s, v10.4s\n"
+    "ldr x28, [x16, #0x50]\n"
+    "fmla v29.4s, v7.4s, v10.4s\n"
+    "add x28, x28, x14\n"
+    "fmla v27.4s, v5.4s, v10.4s\n"
+    "fmla v26.4s, v4.4s, v10.4s\n"
+    "fmla v24.4s, v2.4s, v10.4s\n"
+    "fmla v23.4s, v1.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v11.d }[0], [x28], #0x8\n"
+    "tbz %x[n_channels], #0, 17f\n"
+    "ld1 { v11.s }[2], [x28], #0x4\n"
+    "b 17f\n"
+    "16:"  // Oddments: Load input (1, 0): Bit 1: Unset
+    "ld1 { v11.s }[0], [x28], #0x4\n"
+    "17:"  // Oddments: Load input (1, 0): Bit 1: End
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr x27, [x16, #0x58]\n"
+    "fmla v28.4s, v0.4s, v11.4s\n"
+    "add x27, x27, x14\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v13.d }[0], [x27], #0x8\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v13.s }[2], [x27], #0x4\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load input (1, 4): Bit 1: Unset
+    "ld1 { v13.s }[0], [x27], #0x4\n"
+    "19:"  // Oddments: Load input (1, 4): Bit 1: End
+    "fmla v29.4s, v5.4s, v13.4s\n"
+    "ldr x26, [x16, #0x60]\n"
+    "fmla v26.4s, v2.4s, v13.4s\n"
+    "add x26, x26, x14\n"
+    "tbz %x[n_channels], #1, 20f\n"
+    "ld1 { v12.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 21f\n"
+    "ld1 { v12.s }[2], [x26], #0x4\n"
+    "b 21f\n"
+    "20:"  // Oddments: Load input (3, 0): Bit 1: Unset
+    "ld1 { v12.s }[0], [x26], #0x4\n"
+    "21:"  // Oddments: Load input (3, 0): Bit 1: End
+    "fmla v28.4s, v6.4s, v12.4s\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla v25.4s, v3.4s, v12.4s\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "ld1 { v10.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "ld1 { v10.s }[2], [x25], #0x4\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load input (3, 2): Bit 1: Unset
+    "ld1 { v10.s }[0], [x25], #0x4\n"
+    "23:"  // Oddments: Load input (3, 2): Bit 1: End
+    "fmla v28.4s, v8.4s, v10.4s\n"
+    "ldr x24, [x16, #0x70]\n"
+    "fmla v27.4s, v7.4s, v10.4s\n"
+    "add x24, x24, x14\n"
+    "fmla v26.4s, v6.4s, v10.4s\n"
+    "fmla v25.4s, v5.4s, v10.4s\n"
+    "fmla v24.4s, v4.4s, v10.4s\n"
+    "fmla v23.4s, v3.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 24f\n"
+    "ld1 { v11.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 25f\n"
+    "ld1 { v11.s }[2], [x24], #0x4\n"
+    "b 25f\n"
+    "24:"  // Oddments: Load input (3, 4): Bit 1: Unset
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "25:"  // Oddments: Load input (3, 4): Bit 1: End
+    "fmla v26.4s, v8.4s, v11.4s\n"
+    "ldr x23, [x16, #0x78]\n"
+    "fmla v23.4s, v5.4s, v11.4s\n"
+    "add x23, x23, x14\n"
+    "tbz %x[n_channels], #1, 26f\n"
+    "ld1 { v13.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 27f\n"
+    "ld1 { v13.s }[2], [x23], #0x4\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load input (4, 1): Bit 1: Unset
+    "ld1 { v13.s }[0], [x23], #0x4\n"
+    "27:"  // Oddments: Load input (4, 1): Bit 1: End
+    "fmla v25.4s, v7.4s, v13.4s\n"
+    "ldr x10, [x16, #0x80]\n"
+    "fmla v24.4s, v6.4s, v13.4s\n"
+    "add x10, x10, x14\n"
+    "tbz %x[n_channels], #1, 28f\n"
+    "ld1 { v12.d }[0], [x10], #0x8\n"
+    "tbz %x[n_channels], #0, 29f\n"
+    "ld1 { v12.s }[2], [x10], #0x4\n"
+    "b 29f\n"
+    "28:"  // Oddments: Load input (1, 1): Bit 1: Unset
+    "ld1 { v12.s }[0], [x10], #0x4\n"
+    "29:"  // Oddments: Load input (1, 1): Bit 1: End
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "ldr x9, [x16, #0x88]\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "add x9, x9, x14\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v27.4s, v0.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 30f\n"
+    "ld1 { v11.d }[0], [x9], #0x8\n"
+    "tbz %x[n_channels], #0, 31f\n"
+    "ld1 { v11.s }[2], [x9], #0x4\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load input (1, 3): Bit 1: Unset
+    "ld1 { v11.s }[0], [x9], #0x4\n"
+    "31:"  // Oddments: Load input (1, 3): Bit 1: End
+    "fmla v30.4s, v5.4s, v11.4s\n"
+    "ldr x28, [x16, #0x90]\n"
+    "fmla v29.4s, v4.4s, v11.4s\n"
+    "add x28, x28, x14\n"
+    "fmla v27.4s, v2.4s, v11.4s\n"
+    "fmla v26.4s, v1.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 32f\n"
+    "ld1 { v13.d }[0], [x28], #0x8\n"
+    "tbz %x[n_channels], #0, 33f\n"
+    "ld1 { v13.s }[2], [x28], #0x4\n"
+    "b 33f\n"
+    "32:"  // Oddments: Load input (4, 3): Bit 1: Unset
+    "ld1 { v13.s }[0], [x28], #0x4\n"
+    "33:"  // Oddments: Load input (4, 3): Bit 1: End
+    "fmla v24.4s, v8.4s, v13.4s\n"
+    "ldr x27, [x16, #0x98]\n"
+    "fmla v23.4s, v7.4s, v13.4s\n"
+    "add x27, x27, x14\n"
+    "tbz %x[n_channels], #1, 34f\n"
+    "ld1 { v12.d }[0], [x27], #0x8\n"
+    "tbz %x[n_channels], #0, 35f\n"
+    "ld1 { v12.s }[2], [x27], #0x4\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load input (3, 1): Bit 1: Unset
+    "ld1 { v12.s }[0], [x27], #0x4\n"
+    "35:"  // Oddments: Load input (3, 1): Bit 1: End
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "ldr x26, [x16, #0xa0]\n"
+    "fmla v27.4s, v6.4s, v12.4s\n"
+    "add x26, x26, x14\n"
+    "fmla v25.4s, v4.4s, v12.4s\n"
+    "fmla v24.4s, v3.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 36f\n"
+    "ld1 { v11.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 37f\n"
+    "ld1 { v11.s }[2], [x26], #0x4\n"
+    "b 37f\n"
+    "36:"  // Oddments: Load input (0, 2): Bit 1: Unset
+    "ld1 { v11.s }[0], [x26], #0x4\n"
+    "37:"  // Oddments: Load input (0, 2): Bit 1: End
+    "fmla v31.4s, v2.4s, v11.4s\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla v30.4s, v1.4s, v11.4s\n"
+    "add x25, x25, x14\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 38f\n"
+    "ld1 { v13.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 39f\n"
+    "ld1 { v13.s }[2], [x25], #0x4\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load input (3, 3): Bit 1: Unset
+    "ld1 { v13.s }[0], [x25], #0x4\n"
+    "39:"  // Oddments: Load input (3, 3): Bit 1: End
+    "fmla v27.4s, v8.4s, v13.4s\n"
+    "ldr x24, [x16, #0xb0]\n"
+    "fmla v26.4s, v7.4s, v13.4s\n"
+    "add x24, x24, x14\n"
+    "fmla v24.4s, v5.4s, v13.4s\n"
+    "fmla v23.4s, v4.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 40f\n"
+    "ld1 { v12.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 41f\n"
+    "ld1 { v12.s }[2], [x24], #0x4\n"
+    "b 41f\n"
+    "40:"  // Oddments: Load input (2, 0): Bit 1: Unset
+    "ld1 { v12.s }[0], [x24], #0x4\n"
+    "41:"  // Oddments: Load input (2, 0): Bit 1: End
+    "fmla v31.4s, v6.4s, v12.4s\n"
+    "ldr x23, [x16, #0xb8]\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "add x23, x23, x14\n"
+    "fmla v25.4s, v0.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 42f\n"
+    "ld1 { v11.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 43f\n"
+    "ld1 { v11.s }[2], [x23], #0x4\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load input (2, 4): Bit 1: Unset
+    "ld1 { v11.s }[0], [x23], #0x4\n"
+    "43:"  // Oddments: Load input (2, 4): Bit 1: End
+    "fmla v29.4s, v8.4s, v11.4s\n"
+    "ldr x10, [x16, #0xc0]\n"
+    "fmla v26.4s, v5.4s, v11.4s\n"
+    "add x10, x10, x14\n"
+    "fmla v23.4s, v2.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 44f\n"
+    "ld1 { v13.d }[0], [x10], #0x8\n"
+    "tbz %x[n_channels], #0, 45f\n"
+    "ld1 { v13.s }[2], [x10], #0x4\n"
+    "b 45f\n"
+    "44:"  // Oddments: Load input (4, 2): Bit 1: Unset
+    "ld1 { v13.s }[0], [x10], #0x4\n"
+    "45:"  // Oddments: Load input (4, 2): Bit 1: End
+    "fmla v25.4s, v8.4s, v13.4s\n"
+    "fmla v24.4s, v7.4s, v13.4s\n"
+    "fmla v23.4s, v6.4s, v13.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "fmax v27.4s, v27.4s, v18.4s\n"
+    "fmax v26.4s, v26.4s, v18.4s\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "fmin v27.4s, v27.4s, v17.4s\n"
+    "fmin v26.4s, v26.4s, v17.4s\n"
+    "fmax v25.4s, v25.4s, v18.4s\n"
+    "fmax v24.4s, v24.4s, v18.4s\n"
+    "fmax v23.4s, v23.4s, v18.4s\n"
+    "fmin v25.4s, v25.4s, v17.4s\n"
+    "fmin v24.4s, v24.4s, v17.4s\n"
+    "fmin v23.4s, v23.4s, v17.4s\n"
+    "tbz %x[n_channels], #1, 46f\n"
+    "ldr x22, [x17, #0x0]\n"
+    "ldr x21, [x17, #0x8]\n"
+    "add x22, x22, x12\n"
+    "ldr x20, [x17, #0x10]\n"
+    "ldr x19, [x17, #0x18]\n"
+    "add x21, x21, x12\n"
+    "st1 { v31.d }[0], [x22]\n"
+    "add x20, x20, x12\n"
+    "st1 { v30.d }[0], [x21]\n"
+    "ldr x22, [x17, #0x20]\n"
+    "add x19, x19, x12\n"
+    "st1 { v29.d }[0], [x20]\n"
+    "add x22, x22, x12\n"
+    "st1 { v28.d }[0], [x19]\n"
+    "ldr x21, [x17, #0x28]\n"
+    "add x21, x21, x12\n"
+    "st1 { v27.d }[0], [x22]\n"
+    "ldr x20, [x17, #0x30]\n"
+    "add x20, x20, x12\n"
+    "st1 { v26.d }[0], [x21]\n"
+    "ldr x19, [x17, #0x38]\n"
+    "add x19, x19, x12\n"
+    "st1 { v25.d }[0], [x20]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "add x22, x22, x12\n"
+    "st1 { v24.d }[0], [x19]\n"
+    "add x12, x12, #0x8\n"
+    "st1 { v23.d }[0], [x22]\n"
+    "tbz %x[n_channels], #0, 47f\n"
+    "ldr x22, [x17, #0x0]\n"
+    "ldr x21, [x17, #0x8]\n"
+    "add x22, x22, x12\n"
+    "ldr x20, [x17, #0x10]\n"
+    "ldr x19, [x17, #0x18]\n"
+    "add x21, x21, x12\n"
+    "st1 { v31.s }[2], [x22]\n"
+    "add x20, x20, x12\n"
+    "st1 { v30.s }[2], [x21]\n"
+    "ldr x22, [x17, #0x20]\n"
+    "add x19, x19, x12\n"
+    "st1 { v29.s }[2], [x20]\n"
+    "add x22, x22, x12\n"
+    "st1 { v28.s }[2], [x19]\n"
+    "ldr x21, [x17, #0x28]\n"
+    "add x21, x21, x12\n"
+    "st1 { v27.s }[2], [x22]\n"
+    "ldr x20, [x17, #0x30]\n"
+    "add x20, x20, x12\n"
+    "st1 { v26.s }[2], [x21]\n"
+    "ldr x19, [x17, #0x38]\n"
+    "add x19, x19, x12\n"
+    "st1 { v25.s }[2], [x20]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "add x22, x22, x12\n"
+    "st1 { v24.s }[2], [x19]\n"
+    "st1 { v23.s }[2], [x22]\n"
+    "b 47f\n"
+    "46:"  // Oddments: Store: Bit 1: Unset
+    "ldr x22, [x17, #0x0]\n"
+    "add x22, x22, x12\n"
+    "ldr x21, [x17, #0x8]\n"
+    "ldr x20, [x17, #0x10]\n"
+    "add x21, x21, x12\n"
+    "st1 { v31.s }[0], [x22]\n"
+    "ldr x19, [x17, #0x18]\n"
+    "add x20, x20, x12\n"
+    "st1 { v30.s }[0], [x21]\n"
+    "add x19, x19, x12\n"
+    "st1 { v29.s }[0], [x20]\n"
+    "ldr x22, [x17, #0x20]\n"
+    "add x22, x22, x12\n"
+    "st1 { v28.s }[0], [x19]\n"
+    "ldr x21, [x17, #0x28]\n"
+    "add x21, x21, x12\n"
+    "st1 { v27.s }[0], [x22]\n"
+    "ldr x20, [x17, #0x30]\n"
+    "add x20, x20, x12\n"
+    "st1 { v26.s }[0], [x21]\n"
+    "ldr x19, [x17, #0x38]\n"
+    "add x19, x19, x12\n"
+    "st1 { v25.s }[0], [x20]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "add x22, x22, x12\n"
+    "st1 { v24.s }[0], [x19]\n"
+    "st1 { v23.s }[0], [x22]\n"
+    "47:"  // Oddments: Store: Bit 1: End
+
+    "48:"  // End
+
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000..84bac12
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
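+  // A 4x4 output tile under a 3x3 stride-1 kernel covers a 6x6 input patch
+  // (output + kernel - 1 in each dimension).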
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  indirect_kern_type indirect_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..616fd0d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1229 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
+    "mov x4, #0x0\n"
+    "mov x26, #0x0\n"
+    "1:"  // Tile loop
+    "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x25, #0x4\n"
+    "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x24, #0x4\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x23, %x[params_struct], %[offsetof_args_min]\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "add x21, %x[params_struct], %[offsetof_args_max]\n"
+    "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mov x7, #0x0\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x4, x22\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x26, x6, x19\n" // offset += tile_j * ld_input_col
+    "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
+    "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x8, x8, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1r { v15.4s }, [x23]\n"
+    "add x15, x8, x22, LSL #2\n"
+    "ld1r { v14.4s }, [x21]\n"
+    "add x14, x15, x22, LSL #2\n"
+    "lsl x6, x6, #0x2\n"
+    "add x13, x14, x22, LSL #2\n"
+    "add x12, x13, x22, LSL #2\n"
+    "add x11, x12, x22, LSL #2\n"
+    "add x10, x6, x6\n"
+    "add x9, x10, x6\n"
+    "add x28, x9, x6\n"
+    "add x27, x28, x6\n"
+    "mul x19, x4, x20\n" // offset = tile_i * ld_output_row
+    "madd x19, x26, x17, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x24\n" // offset *= output_tile_size
+    "add x16, x16, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "add x26, x16, x20, LSL #2\n"
+    "add x25, x26, x20, LSL #2\n"
+    "add x24, x25, x20, LSL #2\n"
+    "lsl x17, x17, #0x2\n"
+    "add x23, x17, x17\n"
+    "add x22, x23, x17\n"
+    "mov x21, #0x10\n" // cntb _, ALL, #1
+    "sub x20, XZR, x21\n"
+    "lsr x19, %x[n_channels], #0x2\n"
+    "cbz x19, 4f\n"
+    "ldr q13, [x5, #0x0]\n"
+    "ldr q0, [x5, #0x10]\n"
+    "cmp x21, x19, LSL #4\n"
+    "ldr q1, [x5, #0x20]\n"
+    "ldr q2, [x5, #0x30]\n"
+    "ldr q3, [x5, #0x40]\n"
+    "ldr q4, [x5, #0x50]\n"
+    "ldr q5, [x5, #0x60]\n"
+    "ldr q6, [x5, #0x70]\n"
+    "ldr q7, [x5, #0x80]\n"
+    "ldr q8, [x5, #0x90]\n"
+    "add x5, x5, #0xa0\n"
+    "ldr q9, [x14, x10]\n"
+    "ld1 { v10.4s }, [x8]\n"
+    "ldr q11, [x8, x27]\n"
+    "ldr q12, [x14, x9]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "add x20, x20, #0x10\n"
+    "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "add x7, x7, #0x10\n"
+    "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "add x21, x21, #0x10\n"
+    "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+    "cmp x21, x19, LSL #4\n"
+    "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+    "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+    "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+    "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+    "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+    "ldr q9, [x13, x10]\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ld1 { v10.4s }, [x11]\n"
+    "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x11, x27]\n"
+    "fmla v30.4s, v8.4s, v12.4s\n"
+    "fmla v29.4s, v7.4s, v12.4s\n"
+    "fmla v26.4s, v5.4s, v12.4s\n"
+    "fmla v28.4s, v6.4s, v12.4s\n"
+    "fmla v25.4s, v4.4s, v12.4s\n"
+    "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+    "fmla v22.4s, v2.4s, v12.4s\n"
+    "fmla v21.4s, v1.4s, v12.4s\n"
+    "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x8, x6]\n"
+    "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
+    "ldr q10, [x13, x9]\n"
+    "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
+    "ldr q11, [x8, x28]\n"
+    "fmla v27.4s, v8.4s, v9.4s\n"
+    "fmla v26.4s, v7.4s, v9.4s\n"
+    "fmla v25.4s, v6.4s, v9.4s\n"
+    "fmla v23.4s, v5.4s, v9.4s\n"
+    "fmla v22.4s, v4.4s, v9.4s\n"
+    "fmla v21.4s, v3.4s, v9.4s\n"
+    "fmla v19.4s, v2.4s, v9.4s\n"
+    "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
+    "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+    "ld1 { v9.4s }, [x15]\n"
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "ldr q13, [x5, #0x0]\n"
+    "fmla v30.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x15, x27]\n"
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "fmla v28.4s, v1.4s, v11.4s\n"
+    "ld1 { v11.4s }, [x12]\n"
+    "fmla v26.4s, v8.4s, v10.4s\n"
+    "fmla v25.4s, v7.4s, v10.4s\n"
+    "fmla v24.4s, v6.4s, v10.4s\n"
+    "fmla v22.4s, v5.4s, v10.4s\n"
+    "fmla v21.4s, v4.4s, v10.4s\n"
+    "fmla v20.4s, v3.4s, v10.4s\n"
+    "fmla v18.4s, v2.4s, v10.4s\n"
+    "fmla v17.4s, v1.4s, v10.4s\n"
+    "fmla v16.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x15, x10]\n"
+    "fmla v31.4s, v3.4s, v9.4s\n"
+    "fmla v27.4s, v0.4s, v9.4s\n"
+    "fmla v28.4s, v5.4s, v12.4s\n"
+    "fmla v24.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x15, x9]\n"
+    "fmla v23.4s, v6.4s, v11.4s\n"
+    "fmla v19.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x12, x27]\n"
+    "fmla v31.4s, v5.4s, v10.4s\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v3.4s, v10.4s\n"
+    "fmla v27.4s, v2.4s, v10.4s\n"
+    "fmla v26.4s, v1.4s, v10.4s\n"
+    "fmla v25.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x14, x6]\n"
+    "fmla v20.4s, v8.4s, v11.4s\n"
+    "fmla v16.4s, v5.4s, v11.4s\n"
+    "ldr q11, [x11, x6]\n"
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "fmla v26.4s, v2.4s, v12.4s\n"
+    "fmla v25.4s, v1.4s, v12.4s\n"
+    "fmla v24.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x14, x28]\n"
+    "fmla v19.4s, v7.4s, v11.4s\n"
+    "fmla v18.4s, v6.4s, v11.4s\n"
+    "ldr q11, [x11, x28]\n"
+    "fmla v31.4s, v7.4s, v10.4s\n"
+    "fmla v30.4s, v6.4s, v10.4s\n"
+    "fmla v27.4s, v4.4s, v10.4s\n"
+    "fmla v26.4s, v3.4s, v10.4s\n"
+    "fmla v23.4s, v1.4s, v10.4s\n"
+    "fmla v22.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x8, x10]\n"
+    "fmla v17.4s, v8.4s, v11.4s\n"
+    "fmla v16.4s, v7.4s, v11.4s\n"
+    "ldr q11, [x13, x6]\n"
+    "fmla v29.4s, v8.4s, v12.4s\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmla v25.4s, v5.4s, v12.4s\n"
+    "fmla v24.4s, v4.4s, v12.4s\n"
+    "fmla v21.4s, v2.4s, v12.4s\n"
+    "fmla v20.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x8, x9]\n"
+    "add x8, x8, #0x10\n"
+    "fmla v31.4s, v2.4s, v10.4s\n"
+    "fmla v30.4s, v1.4s, v10.4s\n"
+    "fmla v29.4s, v0.4s, v10.4s\n"
+    "ld1 { v10.4s }, [x14]\n"
+    "fmla v27.4s, v7.4s, v11.4s\n"
+    "fmla v26.4s, v6.4s, v11.4s\n"
+    "fmla v23.4s, v4.4s, v11.4s\n"
+    "fmla v22.4s, v3.4s, v11.4s\n"
+    "fmla v19.4s, v1.4s, v11.4s\n"
+    "fmla v18.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x13, x28]\n"
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "fmla v28.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x14, x27]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.4s, v6.4s, v10.4s\n"
+    "ldr q9, [x14, x10]\n"
+    "fmla v27.4s, v3.4s, v10.4s\n"
+    "fmla v23.4s, v0.4s, v10.4s\n"
+    "ld1 { v10.4s }, [x13]\n"
+    "fmla v25.4s, v8.4s, v11.4s\n"
+    "fmla v24.4s, v7.4s, v11.4s\n"
+    "fmla v21.4s, v5.4s, v11.4s\n"
+    "fmla v20.4s, v4.4s, v11.4s\n"
+    "fmla v17.4s, v2.4s, v11.4s\n"
+    "fmla v16.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x12, x10]\n"
+    "fmla v28.4s, v8.4s, v12.4s\n"
+    "fmla v24.4s, v5.4s, v12.4s\n"
+    "fmla v20.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x13, x27]\n"
+    "add x13, x13, #0x10\n"
+    "fmla v27.4s, v6.4s, v10.4s\n"
+    "fmla v23.4s, v3.4s, v10.4s\n"
+    "fmla v19.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x11, x10]\n"
+    "fmla v22.4s, v7.4s, v11.4s\n"
+    "fmla v21.4s, v6.4s, v11.4s\n"
+    "fmla v23.4s, v8.4s, v11.4s\n"
+    "fmla v19.4s, v5.4s, v11.4s\n"
+    "fmla v18.4s, v4.4s, v11.4s\n"
+    "fmla v17.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x12, x9]\n"
+    "fmla v24.4s, v8.4s, v12.4s\n"
+    "fmla v20.4s, v5.4s, v12.4s\n"
+    "fmla v16.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x11, x9]\n"
+    "add x11, x11, #0x10\n"
+    "fmla v19.4s, v8.4s, v10.4s\n"
+    "fmla v18.4s, v7.4s, v10.4s\n"
+    "fmla v17.4s, v6.4s, v10.4s\n"
+    "ldr q10, [x15, x6]\n"
+    "fmla v22.4s, v8.4s, v11.4s\n"
+    "fmla v21.4s, v7.4s, v11.4s\n"
+    "fmla v20.4s, v6.4s, v11.4s\n"
+    "fmla v18.4s, v5.4s, v11.4s\n"
+    "fmla v17.4s, v4.4s, v11.4s\n"
+    "fmla v16.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x15, x28]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v18.4s, v8.4s, v12.4s\n"
+    "fmla v31.4s, v4.4s, v10.4s\n"
+    "fmla v17.4s, v7.4s, v12.4s\n"
+    "fmla v16.4s, v6.4s, v12.4s\n"
+    "ldr q12, [x12, x6]\n"
+    "fmla v30.4s, v3.4s, v10.4s\n"
+    "fmla v27.4s, v1.4s, v10.4s\n"
+    "fmla v26.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x12, x28]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v29.4s, v5.4s, v11.4s\n"
+    "ldr q0, [x5, #0x10]\n"
+    "fmla v28.4s, v4.4s, v11.4s\n"
+    "fmla v25.4s, v2.4s, v11.4s\n"
+    "ldr q2, [x5, #0x30]\n"
+    "fmla v24.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x8, x27]\n"
+    "fmla v23.4s, v7.4s, v12.4s\n"
+    "ldr q1, [x5, #0x20]\n"
+    "fmla v22.4s, v6.4s, v12.4s\n"
+    "ldr q6, [x5, #0x70]\n"
+    "fmla v19.4s, v4.4s, v12.4s\n"
+    "fmla v18.4s, v3.4s, v12.4s\n"
+    "ldr q12, [x14, x9]\n"
+    "fmla v21.4s, v8.4s, v10.4s\n"
+    "ldr q3, [x5, #0x40]\n"
+    "fmla v20.4s, v7.4s, v10.4s\n"
+    "ldr q7, [x5, #0x80]\n"
+    "fmla v17.4s, v5.4s, v10.4s\n"
+    "ldr q5, [x5, #0x60]\n"
+    "fmla v16.4s, v4.4s, v10.4s\n"
+    "ld1 { v10.4s }, [x8]\n"
+    "fmax v31.4s, v31.4s, v15.4s\n"
+    "ldr q4, [x5, #0x50]\n"
+    "fmax v30.4s, v30.4s, v15.4s\n"
+    "ldr q8, [x5, #0x90]\n"
+    "add x5, x5, #0xa0\n"
+    "fmin v31.4s, v31.4s, v14.4s\n"
+    "st1 { v31.4s }, [x16]\n"
+    "fmin v30.4s, v30.4s, v14.4s\n"
+    "fmax v29.4s, v29.4s, v15.4s\n"
+    "str q30, [x16, x17]\n"
+    "fmin v29.4s, v29.4s, v14.4s\n"
+    "fmax v28.4s, v28.4s, v15.4s\n"
+    "str q29, [x16, x23]\n"
+    "fmin v28.4s, v28.4s, v14.4s\n"
+    "fmax v27.4s, v27.4s, v15.4s\n"
+    "str q28, [x16, x22]\n"
+    "fmin v27.4s, v27.4s, v14.4s\n"
+    "add x16, x16, #0x10\n"
+    "fmax v26.4s, v26.4s, v15.4s\n"
+    "st1 { v27.4s }, [x26]\n"
+    "fmax v25.4s, v25.4s, v15.4s\n"
+    "fmax v24.4s, v24.4s, v15.4s\n"
+    "fmin v26.4s, v26.4s, v14.4s\n"
+    "str q26, [x26, x17]\n"
+    "fmin v25.4s, v25.4s, v14.4s\n"
+    "fmin v24.4s, v24.4s, v14.4s\n"
+    "str q25, [x26, x23]\n"
+    "fmax v23.4s, v23.4s, v15.4s\n"
+    "fmax v22.4s, v22.4s, v15.4s\n"
+    "str q24, [x26, x22]\n"
+    "add x26, x26, #0x10\n"
+    "fmax v21.4s, v21.4s, v15.4s\n"
+    "fmax v20.4s, v20.4s, v15.4s\n"
+    "fmin v23.4s, v23.4s, v14.4s\n"
+    "st1 { v23.4s }, [x25]\n"
+    "fmin v22.4s, v22.4s, v14.4s\n"
+    "fmin v21.4s, v21.4s, v14.4s\n"
+    "str q22, [x25, x17]\n"
+    "fmin v20.4s, v20.4s, v14.4s\n"
+    "fmax v19.4s, v19.4s, v15.4s\n"
+    "str q21, [x25, x23]\n"
+    "fmax v18.4s, v18.4s, v15.4s\n"
+    "str q20, [x25, x22]\n"
+    "fmin v19.4s, v19.4s, v14.4s\n"
+    "add x25, x25, #0x10\n"
+    "fmin v18.4s, v18.4s, v14.4s\n"
+    "st1 { v19.4s }, [x24]\n"
+    "fmax v17.4s, v17.4s, v15.4s\n"
+    "fmax v16.4s, v16.4s, v15.4s\n"
+    "str q18, [x24, x17]\n"
+    "fmin v17.4s, v17.4s, v14.4s\n"
+    "str q17, [x24, x23]\n"
+    "fmin v16.4s, v16.4s, v14.4s\n"
+    "str q16, [x24, x22]\n"
+    "add x24, x24, #0x10\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+    "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+    "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+    "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+    "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+    "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+    "ldr q9, [x13, x10]\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ld1 { v10.4s }, [x11]\n"
+    "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x11, x27]\n"
+    "fmla v30.4s, v8.4s, v12.4s\n"
+    "fmla v29.4s, v7.4s, v12.4s\n"
+    "fmla v26.4s, v5.4s, v12.4s\n"
+    "fmla v28.4s, v6.4s, v12.4s\n"
+    "fmla v25.4s, v4.4s, v12.4s\n"
+    "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+    "fmla v22.4s, v2.4s, v12.4s\n"
+    "fmla v21.4s, v1.4s, v12.4s\n"
+    "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x8, x6]\n"
+    "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
+    "ldr q10, [x13, x9]\n"
+    "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
+    "ldr q11, [x8, x28]\n"
+    "fmla v27.4s, v8.4s, v9.4s\n"
+    "fmla v26.4s, v7.4s, v9.4s\n"
+    "fmla v25.4s, v6.4s, v9.4s\n"
+    "fmla v23.4s, v5.4s, v9.4s\n"
+    "fmla v22.4s, v4.4s, v9.4s\n"
+    "fmla v21.4s, v3.4s, v9.4s\n"
+    "fmla v19.4s, v2.4s, v9.4s\n"
+    "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
+    "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+    "ld1 { v9.4s }, [x15]\n"
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "fmla v30.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x15, x27]\n"
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "fmla v28.4s, v1.4s, v11.4s\n"
+    "ld1 { v11.4s }, [x12]\n"
+    "fmla v26.4s, v8.4s, v10.4s\n"
+    "fmla v25.4s, v7.4s, v10.4s\n"
+    "fmla v24.4s, v6.4s, v10.4s\n"
+    "fmla v22.4s, v5.4s, v10.4s\n"
+    "fmla v21.4s, v4.4s, v10.4s\n"
+    "fmla v20.4s, v3.4s, v10.4s\n"
+    "fmla v18.4s, v2.4s, v10.4s\n"
+    "fmla v17.4s, v1.4s, v10.4s\n"
+    "fmla v16.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x15, x10]\n"
+    "fmla v31.4s, v3.4s, v9.4s\n"
+    "fmla v27.4s, v0.4s, v9.4s\n"
+    "fmla v28.4s, v5.4s, v12.4s\n"
+    "fmla v24.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x15, x9]\n"
+    "fmla v23.4s, v6.4s, v11.4s\n"
+    "fmla v19.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x12, x27]\n"
+    "fmla v31.4s, v5.4s, v10.4s\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v3.4s, v10.4s\n"
+    "fmla v27.4s, v2.4s, v10.4s\n"
+    "fmla v26.4s, v1.4s, v10.4s\n"
+    "fmla v25.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x14, x6]\n"
+    "fmla v20.4s, v8.4s, v11.4s\n"
+    "fmla v16.4s, v5.4s, v11.4s\n"
+    "ldr q11, [x11, x6]\n"
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "fmla v26.4s, v2.4s, v12.4s\n"
+    "fmla v25.4s, v1.4s, v12.4s\n"
+    "fmla v24.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x14, x28]\n"
+    "fmla v19.4s, v7.4s, v11.4s\n"
+    "fmla v18.4s, v6.4s, v11.4s\n"
+    "ldr q11, [x11, x28]\n"
+    "fmla v31.4s, v7.4s, v10.4s\n"
+    "fmla v30.4s, v6.4s, v10.4s\n"
+    "fmla v27.4s, v4.4s, v10.4s\n"
+    "fmla v26.4s, v3.4s, v10.4s\n"
+    "fmla v23.4s, v1.4s, v10.4s\n"
+    "fmla v22.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x8, x10]\n"
+    "fmla v17.4s, v8.4s, v11.4s\n"
+    "fmla v16.4s, v7.4s, v11.4s\n"
+    "ldr q11, [x13, x6]\n"
+    "fmla v29.4s, v8.4s, v12.4s\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmla v25.4s, v5.4s, v12.4s\n"
+    "fmla v24.4s, v4.4s, v12.4s\n"
+    "fmla v21.4s, v2.4s, v12.4s\n"
+    "fmla v20.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x8, x9]\n"
+    "add x8, x8, #0x10\n"
+    "fmla v31.4s, v2.4s, v10.4s\n"
+    "fmla v30.4s, v1.4s, v10.4s\n"
+    "fmla v29.4s, v0.4s, v10.4s\n"
+    "ld1 { v10.4s }, [x14]\n"
+    "fmla v27.4s, v7.4s, v11.4s\n"
+    "fmla v26.4s, v6.4s, v11.4s\n"
+    "fmla v23.4s, v4.4s, v11.4s\n"
+    "fmla v22.4s, v3.4s, v11.4s\n"
+    "fmla v19.4s, v1.4s, v11.4s\n"
+    "fmla v18.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x13, x28]\n"
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "fmla v28.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x14, x27]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.4s, v6.4s, v10.4s\n"
+    "fmla v27.4s, v3.4s, v10.4s\n"
+    "fmla v23.4s, v0.4s, v10.4s\n"
+    "ld1 { v10.4s }, [x13]\n"
+    "fmla v25.4s, v8.4s, v11.4s\n"
+    "fmla v24.4s, v7.4s, v11.4s\n"
+    "fmla v21.4s, v5.4s, v11.4s\n"
+    "fmla v20.4s, v4.4s, v11.4s\n"
+    "fmla v17.4s, v2.4s, v11.4s\n"
+    "fmla v16.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x12, x10]\n"
+    "fmla v28.4s, v8.4s, v12.4s\n"
+    "fmla v24.4s, v5.4s, v12.4s\n"
+    "fmla v20.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x13, x27]\n"
+    "add x13, x13, #0x10\n"
+    "fmla v27.4s, v6.4s, v10.4s\n"
+    "fmla v23.4s, v3.4s, v10.4s\n"
+    "fmla v19.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x11, x10]\n"
+    "fmla v22.4s, v7.4s, v11.4s\n"
+    "fmla v21.4s, v6.4s, v11.4s\n"
+    "fmla v23.4s, v8.4s, v11.4s\n"
+    "fmla v19.4s, v5.4s, v11.4s\n"
+    "fmla v18.4s, v4.4s, v11.4s\n"
+    "fmla v17.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x12, x9]\n"
+    "fmla v24.4s, v8.4s, v12.4s\n"
+    "fmla v20.4s, v5.4s, v12.4s\n"
+    "fmla v16.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x11, x9]\n"
+    "add x11, x11, #0x10\n"
+    "fmla v19.4s, v8.4s, v10.4s\n"
+    "fmla v18.4s, v7.4s, v10.4s\n"
+    "fmla v17.4s, v6.4s, v10.4s\n"
+    "ldr q10, [x15, x6]\n"
+    "fmla v22.4s, v8.4s, v11.4s\n"
+    "fmla v21.4s, v7.4s, v11.4s\n"
+    "fmla v20.4s, v6.4s, v11.4s\n"
+    "fmla v18.4s, v5.4s, v11.4s\n"
+    "fmla v17.4s, v4.4s, v11.4s\n"
+    "fmla v16.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x15, x28]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v18.4s, v8.4s, v12.4s\n"
+    "fmla v31.4s, v4.4s, v10.4s\n"
+    "fmla v17.4s, v7.4s, v12.4s\n"
+    "fmla v16.4s, v6.4s, v12.4s\n"
+    "ldr q12, [x12, x6]\n"
+    "fmla v30.4s, v3.4s, v10.4s\n"
+    "fmla v27.4s, v1.4s, v10.4s\n"
+    "fmla v26.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x12, x28]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v29.4s, v5.4s, v11.4s\n"
+    "fmla v28.4s, v4.4s, v11.4s\n"
+    "fmla v25.4s, v2.4s, v11.4s\n"
+    "fmla v24.4s, v1.4s, v11.4s\n"
+    "fmla v23.4s, v7.4s, v12.4s\n"
+    "fmla v22.4s, v6.4s, v12.4s\n"
+    "fmla v19.4s, v4.4s, v12.4s\n"
+    "fmla v18.4s, v3.4s, v12.4s\n"
+    "fmla v21.4s, v8.4s, v10.4s\n"
+    "fmla v20.4s, v7.4s, v10.4s\n"
+    "fmla v17.4s, v5.4s, v10.4s\n"
+    "fmla v16.4s, v4.4s, v10.4s\n"
+    "fmax v31.4s, v31.4s, v15.4s\n"
+    "fmax v30.4s, v30.4s, v15.4s\n"
+    "fmax v29.4s, v29.4s, v15.4s\n"
+    "fmin v31.4s, v31.4s, v14.4s\n"
+    "st1 { v31.4s }, [x16]\n"
+    "fmin v30.4s, v30.4s, v14.4s\n"
+    "fmin v29.4s, v29.4s, v14.4s\n"
+    "str q30, [x16, x17]\n"
+    "fmax v28.4s, v28.4s, v15.4s\n"
+    "fmax v27.4s, v27.4s, v15.4s\n"
+    "str q29, [x16, x23]\n"
+    "fmax v26.4s, v26.4s, v15.4s\n"
+    "fmax v25.4s, v25.4s, v15.4s\n"
+    "fmin v28.4s, v28.4s, v14.4s\n"
+    "str q28, [x16, x22]\n"
+    "fmin v27.4s, v27.4s, v14.4s\n"
+    "add x16, x16, #0x10\n"
+    "fmin v26.4s, v26.4s, v14.4s\n"
+    "st1 { v27.4s }, [x26]\n"
+    "fmin v25.4s, v25.4s, v14.4s\n"
+    "fmax v24.4s, v24.4s, v15.4s\n"
+    "str q26, [x26, x17]\n"
+    "fmax v23.4s, v23.4s, v15.4s\n"
+    "str q25, [x26, x23]\n"
+    "fmin v24.4s, v24.4s, v14.4s\n"
+    "fmax v22.4s, v22.4s, v15.4s\n"
+    "str q24, [x26, x22]\n"
+    "fmin v23.4s, v23.4s, v14.4s\n"
+    "add x26, x26, #0x10\n"
+    "fmin v22.4s, v22.4s, v14.4s\n"
+    "st1 { v23.4s }, [x25]\n"
+    "fmax v21.4s, v21.4s, v15.4s\n"
+    "fmax v20.4s, v20.4s, v15.4s\n"
+    "str q22, [x25, x17]\n"
+    "fmax v19.4s, v19.4s, v15.4s\n"
+    "fmax v18.4s, v18.4s, v15.4s\n"
+    "fmin v21.4s, v21.4s, v14.4s\n"
+    "str q21, [x25, x23]\n"
+    "fmin v20.4s, v20.4s, v14.4s\n"
+    "fmin v19.4s, v19.4s, v14.4s\n"
+    "str q20, [x25, x22]\n"
+    "fmin v18.4s, v18.4s, v14.4s\n"
+    "add x25, x25, #0x10\n"
+    "fmax v17.4s, v17.4s, v15.4s\n"
+    "st1 { v19.4s }, [x24]\n"
+    "fmax v16.4s, v16.4s, v15.4s\n"
+    "str q18, [x24, x17]\n"
+    "fmin v17.4s, v17.4s, v14.4s\n"
+    "str q17, [x24, x23]\n"
+    "fmin v16.4s, v16.4s, v14.4s\n"
+    "str q16, [x24, x22]\n"
+    "add x24, x24, #0x10\n"
+    "4:"  // Tile loop: Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 73f\n"
+    "ldr q13, [x5, #0x0]\n"
+    "ldr q0, [x5, #0x10]\n"
+    "add x22, x14, x10\n"
+    "ldr q1, [x5, #0x20]\n"
+    "add x21, x8, XZR\n"
+    "ldr q2, [x5, #0x30]\n"
+    "add x20, x8, x27\n"
+    "ldr q3, [x5, #0x40]\n"
+    "add x19, x14, x9\n"
+    "ldr q4, [x5, #0x50]\n"
+    "ldr q5, [x5, #0x60]\n"
+    "ldr q6, [x5, #0x70]\n"
+    "ldr q7, [x5, #0x80]\n"
+    "ldr q8, [x5, #0x90]\n"
+    "tbz %x[n_channels], #1, 5f\n"
+    "ldr d9, [x22], #0x8\n"
+    "ldr d10, [x21], #0x8\n"
+    "ldr d11, [x20], #0x8\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v9.s }[2], [x22]\n"
+    "ld1 { v10.s }[2], [x21]\n"
+    "ld1 { v11.s }[2], [x20]\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 6f\n"
+    "5:"  // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+    "ldr s9, [x22, #0x0]\n"
+    "ldr s10, [x21, #0x0]\n"
+    "ldr s11, [x20, #0x0]\n"
+    "ldr s12, [x19, #0x0]\n"
+    "6:"  // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+    "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "add x19, x11, XZR\n"
+    "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+    "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+    "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+    "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+    "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+    "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+    "fmla v30.4s, v8.4s, v12.4s\n"
+    "fmla v29.4s, v7.4s, v12.4s\n"
+    "fmla v26.4s, v5.4s, v12.4s\n"
+    "fmla v28.4s, v6.4s, v12.4s\n"
+    "fmla v25.4s, v4.4s, v12.4s\n"
+    "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+    "fmla v22.4s, v2.4s, v12.4s\n"
+    "fmla v21.4s, v1.4s, v12.4s\n"
+    "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 8f\n"
+    "7:"  // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "8:"  // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+    "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
+    "add x19, x11, x27\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 10f\n"
+    "9:"  // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "10:"  // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+    "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
+    "add x19, x13, x10\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ldr d9, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 12f\n"
+    "ld1 { v9.s }[2], [x19]\n"
+    "b 12f\n"
+    "11:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+    "ldr s9, [x19, #0x0]\n"
+    "12:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+    "fmla v27.4s, v8.4s, v9.4s\n"
+    "add x19, x8, x6\n"
+    "fmla v26.4s, v7.4s, v9.4s\n"
+    "fmla v25.4s, v6.4s, v9.4s\n"
+    "fmla v23.4s, v5.4s, v9.4s\n"
+    "fmla v22.4s, v4.4s, v9.4s\n"
+    "fmla v21.4s, v3.4s, v9.4s\n"
+    "fmla v19.4s, v2.4s, v9.4s\n"
+    "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
+    "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 14f\n"
+    "13:"  // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "14:"  // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "add x19, x8, x28\n"
+    "fmla v30.4s, v0.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 16f\n"
+    "15:"  // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "16:"  // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: End
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "add x19, x13, x9\n"
+    "fmla v28.4s, v1.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 17f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 18f\n"
+    "17:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "18:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+    "fmla v26.4s, v8.4s, v10.4s\n"
+    "add x19, x15, XZR\n"
+    "fmla v25.4s, v7.4s, v10.4s\n"
+    "fmla v24.4s, v6.4s, v10.4s\n"
+    "fmla v22.4s, v5.4s, v10.4s\n"
+    "fmla v21.4s, v4.4s, v10.4s\n"
+    "fmla v20.4s, v3.4s, v10.4s\n"
+    "fmla v18.4s, v2.4s, v10.4s\n"
+    "fmla v17.4s, v1.4s, v10.4s\n"
+    "fmla v16.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 19f\n"
+    "ldr d9, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v9.s }[2], [x19]\n"
+    "b 20f\n"
+    "19:"  // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+    "ldr s9, [x19, #0x0]\n"
+    "20:"  // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+    "fmla v31.4s, v3.4s, v9.4s\n"
+    "add x19, x15, x27\n"
+    "fmla v27.4s, v0.4s, v9.4s\n"
+    "tbz %x[n_channels], #1, 21f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 22f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 22f\n"
+    "21:"  // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "22:"  // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: End
+    "fmla v28.4s, v5.4s, v12.4s\n"
+    "add x19, x12, XZR\n"
+    "fmla v24.4s, v2.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 23f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 24f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 24f\n"
+    "23:"  // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "24:"  // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+    "fmla v23.4s, v6.4s, v11.4s\n"
+    "add x19, x15, x10\n"
+    "fmla v19.4s, v3.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 25f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 26f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 26f\n"
+    "25:"  // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "26:"  // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+    "fmla v31.4s, v5.4s, v10.4s\n"
+    "add x19, x12, x27\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v3.4s, v10.4s\n"
+    "fmla v27.4s, v2.4s, v10.4s\n"
+    "fmla v26.4s, v1.4s, v10.4s\n"
+    "fmla v25.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 27f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 28f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 28f\n"
+    "27:"  // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "28:"  // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+    "fmla v20.4s, v8.4s, v11.4s\n"
+    "add x19, x15, x9\n"
+    "fmla v16.4s, v5.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 29f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 30f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 30f\n"
+    "29:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "30:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "add x19, x11, x6\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "fmla v26.4s, v2.4s, v12.4s\n"
+    "fmla v25.4s, v1.4s, v12.4s\n"
+    "fmla v24.4s, v0.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 31f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 32f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 32f\n"
+    "31:"  // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "32:"  // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+    "fmla v19.4s, v7.4s, v11.4s\n"
+    "add x19, x14, x6\n"
+    "fmla v18.4s, v6.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 33f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 34f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 34f\n"
+    "33:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "34:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+    "fmla v31.4s, v7.4s, v10.4s\n"
+    "add x19, x11, x28\n"
+    "fmla v30.4s, v6.4s, v10.4s\n"
+    "fmla v27.4s, v4.4s, v10.4s\n"
+    "fmla v26.4s, v3.4s, v10.4s\n"
+    "fmla v23.4s, v1.4s, v10.4s\n"
+    "fmla v22.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 35f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 36f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 36f\n"
+    "35:"  // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "36:"  // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+    "fmla v17.4s, v8.4s, v11.4s\n"
+    "add x19, x14, x28\n"
+    "fmla v16.4s, v7.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 37f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 38f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 38f\n"
+    "37:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "38:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+    "fmla v29.4s, v8.4s, v12.4s\n"
+    "add x19, x8, x10\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmla v25.4s, v5.4s, v12.4s\n"
+    "fmla v24.4s, v4.4s, v12.4s\n"
+    "fmla v21.4s, v2.4s, v12.4s\n"
+    "fmla v20.4s, v1.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 39f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 40f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 40f\n"
+    "39:"  // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "40:"  // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+    "fmla v31.4s, v2.4s, v10.4s\n"
+    "add x19, x13, x6\n"
+    "fmla v30.4s, v1.4s, v10.4s\n"
+    "fmla v29.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 41f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 42f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 42f\n"
+    "41:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "42:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+    "fmla v27.4s, v7.4s, v11.4s\n"
+    "add x19, x8, x9\n"
+    "fmla v26.4s, v6.4s, v11.4s\n"
+    "fmla v23.4s, v4.4s, v11.4s\n"
+    "fmla v22.4s, v3.4s, v11.4s\n"
+    "fmla v19.4s, v1.4s, v11.4s\n"
+    "fmla v18.4s, v0.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 43f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 44f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 44f\n"
+    "43:"  // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "44:"  // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "add x19, x14, XZR\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "fmla v28.4s, v0.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 45f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 46f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 46f\n"
+    "45:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "46:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+    "fmla v31.4s, v6.4s, v10.4s\n"
+    "add x19, x13, x28\n"
+    "fmla v27.4s, v3.4s, v10.4s\n"
+    "fmla v23.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 47f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 48f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 48f\n"
+    "47:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "48:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+    "fmla v25.4s, v8.4s, v11.4s\n"
+    "add x19, x14, x27\n"
+    "fmla v24.4s, v7.4s, v11.4s\n"
+    "fmla v21.4s, v5.4s, v11.4s\n"
+    "fmla v20.4s, v4.4s, v11.4s\n"
+    "fmla v17.4s, v2.4s, v11.4s\n"
+    "fmla v16.4s, v1.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 49f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 50f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 50f\n"
+    "49:"  // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "50:"  // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+    "fmla v28.4s, v8.4s, v12.4s\n"
+    "add x19, x13, XZR\n"
+    "fmla v24.4s, v5.4s, v12.4s\n"
+    "fmla v20.4s, v2.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 51f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 52f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 52f\n"
+    "51:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "52:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+    "fmla v27.4s, v6.4s, v10.4s\n"
+    "add x19, x12, x10\n"
+    "fmla v23.4s, v3.4s, v10.4s\n"
+    "fmla v19.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 53f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 54f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 54f\n"
+    "53:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "54:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+    "fmla v23.4s, v8.4s, v11.4s\n"
+    "add x19, x13, x27\n"
+    "fmla v22.4s, v7.4s, v11.4s\n"
+    "fmla v21.4s, v6.4s, v11.4s\n"
+    "fmla v19.4s, v5.4s, v11.4s\n"
+    "fmla v18.4s, v4.4s, v11.4s\n"
+    "fmla v17.4s, v3.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 55f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 56f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 56f\n"
+    "55:"  // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "56:"  // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+    "fmla v24.4s, v8.4s, v12.4s\n"
+    "add x19, x11, x10\n"
+    "fmla v20.4s, v5.4s, v12.4s\n"
+    "fmla v16.4s, v2.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 57f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 58f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 58f\n"
+    "57:"  // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "58:"  // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+    "fmla v19.4s, v8.4s, v10.4s\n"
+    "add x19, x12, x9\n"
+    "fmla v18.4s, v7.4s, v10.4s\n"
+    "fmla v17.4s, v6.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 59f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 60f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 60f\n"
+    "59:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "60:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+    "fmla v22.4s, v8.4s, v11.4s\n"
+    "add x19, x11, x9\n"
+    "fmla v21.4s, v7.4s, v11.4s\n"
+    "fmla v20.4s, v6.4s, v11.4s\n"
+    "fmla v18.4s, v5.4s, v11.4s\n"
+    "fmla v17.4s, v4.4s, v11.4s\n"
+    "fmla v16.4s, v3.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 61f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 62f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 62f\n"
+    "61:"  // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "62:"  // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+    "fmla v18.4s, v8.4s, v12.4s\n"
+    "add x19, x15, x6\n"
+    "fmla v17.4s, v7.4s, v12.4s\n"
+    "fmla v16.4s, v6.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 63f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 64f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 64f\n"
+    "63:"  // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "64:"  // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+    "fmla v31.4s, v4.4s, v10.4s\n"
+    "add x19, x15, x28\n"
+    "fmla v30.4s, v3.4s, v10.4s\n"
+    "fmla v27.4s, v1.4s, v10.4s\n"
+    "fmla v26.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 65f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 66f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 66f\n"
+    "65:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "66:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+    "fmla v29.4s, v5.4s, v11.4s\n"
+    "add x19, x12, x6\n"
+    "fmla v28.4s, v4.4s, v11.4s\n"
+    "fmla v25.4s, v2.4s, v11.4s\n"
+    "fmla v24.4s, v1.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 67f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 68f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 68f\n"
+    "67:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "68:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+    "fmla v23.4s, v7.4s, v12.4s\n"
+    "add x19, x12, x28\n"
+    "fmla v22.4s, v6.4s, v12.4s\n"
+    "fmla v19.4s, v4.4s, v12.4s\n"
+    "fmla v18.4s, v3.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 69f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 70f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 70f\n"
+    "69:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "70:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+    "fmla v21.4s, v8.4s, v10.4s\n"
+    "fmla v20.4s, v7.4s, v10.4s\n"
+    "fmla v17.4s, v5.4s, v10.4s\n"
+    "fmla v16.4s, v4.4s, v10.4s\n"
+    "fmax v31.4s, v31.4s, v15.4s\n"
+    "fmax v30.4s, v30.4s, v15.4s\n"
+    "fmax v29.4s, v29.4s, v15.4s\n"
+    "fmin v31.4s, v31.4s, v14.4s\n"
+    "fmin v30.4s, v30.4s, v14.4s\n"
+    "fmin v29.4s, v29.4s, v14.4s\n"
+    "fmax v28.4s, v28.4s, v15.4s\n"
+    "fmax v27.4s, v27.4s, v15.4s\n"
+    "fmax v26.4s, v26.4s, v15.4s\n"
+    "fmin v28.4s, v28.4s, v14.4s\n"
+    "fmin v27.4s, v27.4s, v14.4s\n"
+    "fmin v26.4s, v26.4s, v14.4s\n"
+    "fmax v25.4s, v25.4s, v15.4s\n"
+    "fmax v24.4s, v24.4s, v15.4s\n"
+    "fmax v23.4s, v23.4s, v15.4s\n"
+    "fmin v25.4s, v25.4s, v14.4s\n"
+    "fmin v24.4s, v24.4s, v14.4s\n"
+    "fmin v23.4s, v23.4s, v14.4s\n"
+    "fmax v22.4s, v22.4s, v15.4s\n"
+    "fmax v21.4s, v21.4s, v15.4s\n"
+    "fmax v20.4s, v20.4s, v15.4s\n"
+    "fmin v22.4s, v22.4s, v14.4s\n"
+    "fmin v21.4s, v21.4s, v14.4s\n"
+    "fmin v20.4s, v20.4s, v14.4s\n"
+    "fmax v19.4s, v19.4s, v15.4s\n"
+    "fmax v18.4s, v18.4s, v15.4s\n"
+    "fmax v17.4s, v17.4s, v15.4s\n"
+    "fmin v19.4s, v19.4s, v14.4s\n"
+    "fmin v18.4s, v18.4s, v14.4s\n"
+    "fmin v17.4s, v17.4s, v14.4s\n"
+    "fmax v16.4s, v16.4s, v15.4s\n"
+    "fmin v16.4s, v16.4s, v14.4s\n"
+    "tbz %x[n_channels], #1, 71f\n"
+    "mov x19, x16\n"
+    "st1 { v31.d }[0], [x19], x17\n"
+    "add x16, x16, #0x8\n"
+    "st1 { v30.d }[0], [x19], x17\n"
+    "mov x21, x26\n"
+    "st1 { v29.d }[0], [x19], x17\n"
+    "st1 { v27.d }[0], [x21], x17\n"
+    "add x26, x26, #0x8\n"
+    "st1 { v28.d }[0], [x19]\n"
+    "mov x20, x25\n"
+    "st1 { v26.d }[0], [x21], x17\n"
+    "add x25, x25, #0x8\n"
+    "st1 { v25.d }[0], [x21], x17\n"
+    "mov x19, x24\n"
+    "st1 { v24.d }[0], [x21]\n"
+    "add x24, x24, #0x8\n"
+    "st1 { v23.d }[0], [x20], x17\n"
+    "st1 { v22.d }[0], [x20], x17\n"
+    "st1 { v21.d }[0], [x20], x17\n"
+    "st1 { v20.d }[0], [x20]\n"
+    "st1 { v19.d }[0], [x19], x17\n"
+    "st1 { v18.d }[0], [x19], x17\n"
+    "st1 { v17.d }[0], [x19], x17\n"
+    "st1 { v16.d }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 72f\n"
+    "mov x22, x16\n"
+    "st1 { v31.s }[2], [x22], x17\n"
+    "mov x21, x26\n"
+    "st1 { v30.s }[2], [x22], x17\n"
+    "st1 { v27.s }[2], [x21], x17\n"
+    "mov x20, x25\n"
+    "st1 { v29.s }[2], [x22], x17\n"
+    "mov x19, x24\n"
+    "st1 { v28.s }[2], [x22]\n"
+    "st1 { v26.s }[2], [x21], x17\n"
+    "st1 { v25.s }[2], [x21], x17\n"
+    "st1 { v24.s }[2], [x21]\n"
+    "st1 { v23.s }[2], [x20], x17\n"
+    "st1 { v22.s }[2], [x20], x17\n"
+    "st1 { v21.s }[2], [x20], x17\n"
+    "st1 { v20.s }[2], [x20]\n"
+    "st1 { v19.s }[2], [x19], x17\n"
+    "st1 { v18.s }[2], [x19], x17\n"
+    "st1 { v17.s }[2], [x19], x17\n"
+    "st1 { v16.s }[2], [x19]\n"
+    "b 72f\n"
+    "71:"  // Tile loop: Oddments: Store: Bit 1: Unset
+    "mov x22, x16\n"
+    "st1 { v31.s }[0], [x22], x17\n"
+    "mov x21, x26\n"
+    "mov x20, x25\n"
+    "st1 { v30.s }[0], [x22], x17\n"
+    "st1 { v27.s }[0], [x21], x17\n"
+    "mov x19, x24\n"
+    "st1 { v29.s }[0], [x22], x17\n"
+    "st1 { v28.s }[0], [x22]\n"
+    "st1 { v26.s }[0], [x21], x17\n"
+    "st1 { v25.s }[0], [x21], x17\n"
+    "st1 { v24.s }[0], [x21]\n"
+    "st1 { v23.s }[0], [x20], x17\n"
+    "st1 { v22.s }[0], [x20], x17\n"
+    "st1 { v21.s }[0], [x20], x17\n"
+    "st1 { v20.s }[0], [x20]\n"
+    "st1 { v19.s }[0], [x19], x17\n"
+    "st1 { v18.s }[0], [x19], x17\n"
+    "st1 { v17.s }[0], [x19], x17\n"
+    "st1 { v16.s }[0], [x19]\n"
+    "72:"  // Tile loop: Oddments: Store: Bit 1: End
+
+    "73:"  // Tile loop: End
+    "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x4, #0x1\n"
+    "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x26, x26, #0x1\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x26, x19\n"
+    "csel x26, x26, XZR, LT\n"
+    "csel x4, x4, x21, LT\n"
+    "cmp x4, x20\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
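+
+// A minimal scalar model of what each vector lane above computes, assuming a
+// 3x3 kernel, unit stride and a 4x4 output tile per channel; `weights`, `bias`
+// and the accessors are illustrative names rather than part of this kernel:
+//
+//   for (unsigned oi = 0; oi < 4; oi++)
+//     for (unsigned oj = 0; oj < 4; oj++)
+//     {
+//       float acc = bias;                                      // v13
+//       for (unsigned ki = 0; ki < 3; ki++)
+//         for (unsigned kj = 0; kj < 3; kj++)
+//           acc += weights[ki][kj] * input(oi + ki, oj + kj);  // fmla with v0-v8
+//       output(oi, oj) = std::min(std::max(acc, activation_min), activation_max);  // v15/v14 clamp
+//     }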
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..51a5679
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1395 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[36];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[14];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[5];
+      inptrs[3] = input_ptrs[15];
+      inptrs[4] = input_ptrs[30];
+      inptrs[5] = input_ptrs[35];
+      inptrs[6] = input_ptrs[20];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[4];
+      inptrs[9] = input_ptrs[21];
+      inptrs[10] = input_ptrs[6];
+      inptrs[11] = input_ptrs[11];
+      inptrs[12] = input_ptrs[24];
+      inptrs[13] = input_ptrs[8];
+      inptrs[14] = input_ptrs[29];
+      inptrs[15] = input_ptrs[9];
+      inptrs[16] = input_ptrs[31];
+      inptrs[17] = input_ptrs[13];
+      inptrs[18] = input_ptrs[34];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[19];
+      inptrs[22] = input_ptrs[3];
+      inptrs[23] = input_ptrs[12];
+      inptrs[24] = input_ptrs[22];
+      inptrs[25] = input_ptrs[17];
+      inptrs[26] = input_ptrs[18];
+      inptrs[27] = input_ptrs[26];
+      inptrs[28] = input_ptrs[23];
+      inptrs[29] = input_ptrs[32];
+      inptrs[30] = input_ptrs[27];
+      inptrs[31] = input_ptrs[33];
+      inptrs[32] = input_ptrs[7];
+      inptrs[33] = input_ptrs[10];
+      inptrs[34] = input_ptrs[25];
+      inptrs[35] = input_ptrs[28];
+    }
+  };
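+
+  // A hedged reading of the constructor above: input_ptrs arrives as the
+  // row-major 6x6 input patch (index = row * 6 + col) and is re-ordered into
+  // the sequence the assembly consumes; e.g. inptrs[0] = input_ptrs[14] is the
+  // (2, 2) tap and inptrs[3] = input_ptrs[15] the (2, 3) tap, matching the
+  // "(2, 2), (0, 0), (0, 5), (2, 3)" first loads of the tiled variant.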
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "add x19, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v15.4s }, [x20]\n"
+    "ld1r { v14.4s }, [x19]\n"
+    "mov x14, #0x0\n"
+    "mov x13, #0x10\n" // cntb _, ALL, #1
+    "sub x12, XZR, x13\n"
+    "lsr x11, %x[n_channels], #0x2\n"
+    "cbz x11, 3f\n"
+    "ldr q13, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "cmp x13, x11, LSL #4\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "ldp x10, x9, [x16, #0x0]\n"
+    "ldp x28, x27, [x16, #0x10]\n"
+    "ldr q9, [x10, x14]\n"
+    "ldr q10, [x9, x14]\n"
+    "ldr q11, [x28, x14]\n"
+    "ldr q12, [x27, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "ldr x26, [x16, #0x20]\n"
+    "add x12, x12, #0x10\n"
+    "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "ldr x25, [x16, #0x28]\n"
+    "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "ldr x24, [x16, #0x30]\n"
+    "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+    "ldr x23, [x16, #0x38]\n"
+    "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+    "ldr x10, [x16, #0x40]\n"
+    "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+    "ldr x9, [x16, #0x48]\n"
+    "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+    "ldr x28, [x16, #0x50]\n"
+    "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+    "ldr x27, [x16, #0x58]\n"
+    "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+    "ldr q9, [x24, x14]\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x26, x14]\n"
+    "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x25, x14]\n"
+    "fmla v30.4s, v8.4s, v12.4s\n"
+    "ldr x26, [x16, #0x60]\n"
+    "fmla v29.4s, v7.4s, v12.4s\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla v26.4s, v5.4s, v12.4s\n"
+    "ldr x24, [x16, #0x70]\n"
+    "fmla v28.4s, v6.4s, v12.4s\n"
+    "ldr x22, [x17, #0x0]\n"
+    "fmla v25.4s, v4.4s, v12.4s\n"
+    "ldr x21, [x17, #0x8]\n"
+    "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+    "ldr x20, [x17, #0x10]\n"
+    "fmla v22.4s, v2.4s, v12.4s\n"
+    "ldr x19, [x17, #0x18]\n"
+    "fmla v21.4s, v1.4s, v12.4s\n"
+    "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x23, x14]\n"
+    "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
+    "ldr q10, [x9, x14]\n"
+    "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
+    "ldr q11, [x10, x14]\n"
+    "fmla v27.4s, v8.4s, v9.4s\n"
+    "ldr x23, [x16, #0x78]\n"
+    "fmla v26.4s, v7.4s, v9.4s\n"
+    "ldr x10, [x16, #0x80]\n"
+    "fmla v25.4s, v6.4s, v9.4s\n"
+    "ldr x9, [x16, #0x88]\n"
+    "fmla v23.4s, v5.4s, v9.4s\n"
+    "fmla v22.4s, v4.4s, v9.4s\n"
+    "fmla v21.4s, v3.4s, v9.4s\n"
+    "fmla v19.4s, v2.4s, v9.4s\n"
+    "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
+    "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+    "ldr q9, [x28, x14]\n"
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "ldr x28, [x16, #0x90]\n"
+    "fmla v30.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x27, x14]\n"
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "ldr x27, [x16, #0x98]\n"
+    "fmla v28.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x26, x14]\n"
+    "fmla v26.4s, v8.4s, v10.4s\n"
+    "ldr x26, [x16, #0xa0]\n"
+    "fmla v25.4s, v7.4s, v10.4s\n"
+    "ldr q13, [x15, #0x0]\n"
+    "fmla v24.4s, v6.4s, v10.4s\n"
+    "fmla v22.4s, v5.4s, v10.4s\n"
+    "fmla v21.4s, v4.4s, v10.4s\n"
+    "fmla v20.4s, v3.4s, v10.4s\n"
+    "fmla v18.4s, v2.4s, v10.4s\n"
+    "fmla v17.4s, v1.4s, v10.4s\n"
+    "fmla v16.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x25, x14]\n"
+    "fmla v31.4s, v3.4s, v9.4s\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla v27.4s, v0.4s, v9.4s\n"
+    "fmla v28.4s, v5.4s, v12.4s\n"
+    "fmla v24.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x23, x14]\n"
+    "fmla v23.4s, v6.4s, v11.4s\n"
+    "ldr x23, [x16, #0xb8]\n"
+    "fmla v19.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v31.4s, v5.4s, v10.4s\n"
+    "ldr x24, [x16, #0xb0]\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v3.4s, v10.4s\n"
+    "fmla v27.4s, v2.4s, v10.4s\n"
+    "fmla v26.4s, v1.4s, v10.4s\n"
+    "fmla v25.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x9, x14]\n"
+    "fmla v20.4s, v8.4s, v11.4s\n"
+    "ldr x9, [x16, #0xc8]\n"
+    "fmla v16.4s, v5.4s, v11.4s\n"
+    "ldr q11, [x10, x14]\n"
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "ldr x10, [x16, #0xc0]\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "fmla v26.4s, v2.4s, v12.4s\n"
+    "fmla v25.4s, v1.4s, v12.4s\n"
+    "fmla v24.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x27, x14]\n"
+    "fmla v19.4s, v7.4s, v11.4s\n"
+    "ldr x27, [x16, #0xd8]\n"
+    "fmla v18.4s, v6.4s, v11.4s\n"
+    "ldr q11, [x28, x14]\n"
+    "fmla v31.4s, v7.4s, v10.4s\n"
+    "ldr x28, [x16, #0xd0]\n"
+    "fmla v30.4s, v6.4s, v10.4s\n"
+    "fmla v27.4s, v4.4s, v10.4s\n"
+    "fmla v26.4s, v3.4s, v10.4s\n"
+    "fmla v23.4s, v1.4s, v10.4s\n"
+    "fmla v22.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x26, x14]\n"
+    "fmla v17.4s, v8.4s, v11.4s\n"
+    "ldr x26, [x16, #0xe0]\n"
+    "fmla v16.4s, v7.4s, v11.4s\n"
+    "ldr q11, [x25, x14]\n"
+    "fmla v29.4s, v8.4s, v12.4s\n"
+    "ldr x25, [x16, #0xe8]\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmla v25.4s, v5.4s, v12.4s\n"
+    "fmla v24.4s, v4.4s, v12.4s\n"
+    "fmla v21.4s, v2.4s, v12.4s\n"
+    "fmla v20.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x24, x14]\n"
+    "fmla v31.4s, v2.4s, v10.4s\n"
+    "ldr x24, [x16, #0xf0]\n"
+    "fmla v30.4s, v1.4s, v10.4s\n"
+    "fmla v29.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x23, x14]\n"
+    "fmla v27.4s, v7.4s, v11.4s\n"
+    "ldr x23, [x16, #0xf8]\n"
+    "fmla v26.4s, v6.4s, v11.4s\n"
+    "fmla v23.4s, v4.4s, v11.4s\n"
+    "fmla v22.4s, v3.4s, v11.4s\n"
+    "fmla v19.4s, v1.4s, v11.4s\n"
+    "fmla v18.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x10, x14]\n"
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "ldr x10, [x16, #0x100]\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "fmla v28.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x9, x14]\n"
+    "fmla v31.4s, v6.4s, v10.4s\n"
+    "ldr x9, [x16, #0x108]\n"
+    "fmla v27.4s, v3.4s, v10.4s\n"
+    "fmla v23.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x28, x14]\n"
+    "fmla v25.4s, v8.4s, v11.4s\n"
+    "ldr x28, [x16, #0x110]\n"
+    "fmla v24.4s, v7.4s, v11.4s\n"
+    "fmla v21.4s, v5.4s, v11.4s\n"
+    "fmla v20.4s, v4.4s, v11.4s\n"
+    "fmla v17.4s, v2.4s, v11.4s\n"
+    "fmla v16.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x27, x14]\n"
+    "fmla v28.4s, v8.4s, v12.4s\n"
+    "ldr x27, [x16, #0x118]\n"
+    "fmla v24.4s, v5.4s, v12.4s\n"
+    "fmla v20.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x26, x14]\n"
+    "fmla v27.4s, v6.4s, v10.4s\n"
+    "fmla v23.4s, v3.4s, v10.4s\n"
+    "fmla v19.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x25, x14]\n"
+    "fmla v22.4s, v7.4s, v11.4s\n"
+    "fmla v21.4s, v6.4s, v11.4s\n"
+    "fmla v23.4s, v8.4s, v11.4s\n"
+    "fmla v19.4s, v5.4s, v11.4s\n"
+    "fmla v18.4s, v4.4s, v11.4s\n"
+    "fmla v17.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v24.4s, v8.4s, v12.4s\n"
+    "fmla v20.4s, v5.4s, v12.4s\n"
+    "fmla v16.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x23, x14]\n"
+    "fmla v19.4s, v8.4s, v10.4s\n"
+    "fmla v18.4s, v7.4s, v10.4s\n"
+    "fmla v17.4s, v6.4s, v10.4s\n"
+    "ldr q10, [x10, x14]\n"
+    "fmla v22.4s, v8.4s, v11.4s\n"
+    "fmla v21.4s, v7.4s, v11.4s\n"
+    "fmla v20.4s, v6.4s, v11.4s\n"
+    "fmla v18.4s, v5.4s, v11.4s\n"
+    "fmla v17.4s, v4.4s, v11.4s\n"
+    "fmla v16.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x9, x14]\n"
+    "fmla v31.4s, v4.4s, v10.4s\n"
+    "ldp x10, x9, [x16, #0x0]\n"
+    "fmla v18.4s, v8.4s, v12.4s\n"
+    "ldr q9, [x10, x13]\n"
+    "fmla v17.4s, v7.4s, v12.4s\n"
+    "fmla v16.4s, v6.4s, v12.4s\n"
+    "ldr q12, [x28, x14]\n"
+    "fmla v30.4s, v3.4s, v10.4s\n"
+    "fmla v27.4s, v1.4s, v10.4s\n"
+    "fmla v26.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x27, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v29.4s, v5.4s, v11.4s\n"
+    "ldp x28, x27, [x16, #0x10]\n"
+    "fmla v28.4s, v4.4s, v11.4s\n"
+    "ldr q0, [x15, #0x10]\n"
+    "fmla v25.4s, v2.4s, v11.4s\n"
+    "ldr q2, [x15, #0x30]\n"
+    "fmla v24.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x28, x13]\n"
+    "fmla v23.4s, v7.4s, v12.4s\n"
+    "ldr q1, [x15, #0x20]\n"
+    "fmla v22.4s, v6.4s, v12.4s\n"
+    "ldr q6, [x15, #0x70]\n"
+    "fmla v19.4s, v4.4s, v12.4s\n"
+    "fmla v18.4s, v3.4s, v12.4s\n"
+    "ldr q12, [x27, x13]\n"
+    "fmla v21.4s, v8.4s, v10.4s\n"
+    "ldr q3, [x15, #0x40]\n"
+    "fmla v20.4s, v7.4s, v10.4s\n"
+    "ldr q7, [x15, #0x80]\n"
+    "fmla v17.4s, v5.4s, v10.4s\n"
+    "ldr q5, [x15, #0x60]\n"
+    "fmla v16.4s, v4.4s, v10.4s\n"
+    "ldr q10, [x9, x13]\n"
+    "add x13, x13, #0x10\n"
+    "fmax v31.4s, v31.4s, v15.4s\n"
+    "ldr q4, [x15, #0x50]\n"
+    "cmp x13, x11, LSL #4\n"
+    "fmax v30.4s, v30.4s, v15.4s\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "fmax v29.4s, v29.4s, v15.4s\n"
+    "fmax v28.4s, v28.4s, v15.4s\n"
+    "fmin v31.4s, v31.4s, v14.4s\n"
+    "str q31, [x22, x12]\n"
+    "fmin v30.4s, v30.4s, v14.4s\n"
+    "fmin v29.4s, v29.4s, v14.4s\n"
+    "ldr x22, [x17, #0x20]\n"
+    "fmin v28.4s, v28.4s, v14.4s\n"
+    "str q30, [x21, x12]\n"
+    "fmax v27.4s, v27.4s, v15.4s\n"
+    "fmax v26.4s, v26.4s, v15.4s\n"
+    "str q29, [x20, x12]\n"
+    "fmax v25.4s, v25.4s, v15.4s\n"
+    "str q28, [x19, x12]\n"
+    "fmax v24.4s, v24.4s, v15.4s\n"
+    "ldr x21, [x17, #0x28]\n"
+    "fmin v27.4s, v27.4s, v14.4s\n"
+    "ldr x20, [x17, #0x30]\n"
+    "fmin v26.4s, v26.4s, v14.4s\n"
+    "ldr x19, [x17, #0x38]\n"
+    "fmin v25.4s, v25.4s, v14.4s\n"
+    "str q27, [x22, x12]\n"
+    "fmin v24.4s, v24.4s, v14.4s\n"
+    "str q26, [x21, x12]\n"
+    "fmax v23.4s, v23.4s, v15.4s\n"
+    "str q25, [x20, x12]\n"
+    "fmax v22.4s, v22.4s, v15.4s\n"
+    "str q24, [x19, x12]\n"
+    "fmax v21.4s, v21.4s, v15.4s\n"
+    "ldr x22, [x17, #0x40]\n"
+    "fmin v23.4s, v23.4s, v14.4s\n"
+    "ldr x21, [x17, #0x48]\n"
+    "fmin v22.4s, v22.4s, v14.4s\n"
+    "ldr x20, [x17, #0x50]\n"
+    "fmin v21.4s, v21.4s, v14.4s\n"
+    "str q23, [x22, x12]\n"
+    "fmax v20.4s, v20.4s, v15.4s\n"
+    "str q22, [x21, x12]\n"
+    "fmax v19.4s, v19.4s, v15.4s\n"
+    "str q21, [x20, x12]\n"
+    "fmax v18.4s, v18.4s, v15.4s\n"
+    "ldr x19, [x17, #0x58]\n"
+    "fmin v20.4s, v20.4s, v14.4s\n"
+    "ldr x22, [x17, #0x60]\n"
+    "fmin v19.4s, v19.4s, v14.4s\n"
+    "ldr x21, [x17, #0x68]\n"
+    "fmin v18.4s, v18.4s, v14.4s\n"
+    "str q20, [x19, x12]\n"
+    "fmax v17.4s, v17.4s, v15.4s\n"
+    "str q19, [x22, x12]\n"
+    "fmax v16.4s, v16.4s, v15.4s\n"
+    "str q18, [x21, x12]\n"
+    "ldr x20, [x17, #0x70]\n"
+    "fmin v17.4s, v17.4s, v14.4s\n"
+    "ldr x19, [x17, #0x78]\n"
+    "fmin v16.4s, v16.4s, v14.4s\n"
+    "str q17, [x20, x12]\n"
+    "str q16, [x19, x12]\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "ldr x26, [x16, #0x20]\n"
+    "add x12, x12, #0x10\n"
+    "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "ldr x25, [x16, #0x28]\n"
+    "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "ldr x24, [x16, #0x30]\n"
+    "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+    "ldr x23, [x16, #0x38]\n"
+    "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+    "ldr x10, [x16, #0x40]\n"
+    "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+    "ldr x9, [x16, #0x48]\n"
+    "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+    "ldr x28, [x16, #0x50]\n"
+    "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+    "ldr x27, [x16, #0x58]\n"
+    "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+    "ldr q9, [x24, x14]\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x26, x14]\n"
+    "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr q11, [x25, x14]\n"
+    "fmla v30.4s, v8.4s, v12.4s\n"
+    "ldr x26, [x16, #0x60]\n"
+    "fmla v29.4s, v7.4s, v12.4s\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla v26.4s, v5.4s, v12.4s\n"
+    "ldr x24, [x16, #0x70]\n"
+    "fmla v28.4s, v6.4s, v12.4s\n"
+    "ldr x22, [x17, #0x0]\n"
+    "fmla v25.4s, v4.4s, v12.4s\n"
+    "ldr x21, [x17, #0x8]\n"
+    "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+    "ldr x20, [x17, #0x10]\n"
+    "fmla v22.4s, v2.4s, v12.4s\n"
+    "ldr x19, [x17, #0x18]\n"
+    "fmla v21.4s, v1.4s, v12.4s\n"
+    "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x23, x14]\n"
+    "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
+    "ldr q10, [x9, x14]\n"
+    "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
+    "ldr q11, [x10, x14]\n"
+    "fmla v27.4s, v8.4s, v9.4s\n"
+    "ldr x23, [x16, #0x78]\n"
+    "fmla v26.4s, v7.4s, v9.4s\n"
+    "ldr x10, [x16, #0x80]\n"
+    "fmla v25.4s, v6.4s, v9.4s\n"
+    "ldr x9, [x16, #0x88]\n"
+    "fmla v23.4s, v5.4s, v9.4s\n"
+    "fmla v22.4s, v4.4s, v9.4s\n"
+    "fmla v21.4s, v3.4s, v9.4s\n"
+    "fmla v19.4s, v2.4s, v9.4s\n"
+    "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
+    "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+    "ldr q9, [x28, x14]\n"
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "ldr x28, [x16, #0x90]\n"
+    "fmla v30.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x27, x14]\n"
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "ldr x27, [x16, #0x98]\n"
+    "fmla v28.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x26, x14]\n"
+    "fmla v26.4s, v8.4s, v10.4s\n"
+    "ldr x26, [x16, #0xa0]\n"
+    "fmla v25.4s, v7.4s, v10.4s\n"
+    "fmla v24.4s, v6.4s, v10.4s\n"
+    "fmla v22.4s, v5.4s, v10.4s\n"
+    "fmla v21.4s, v4.4s, v10.4s\n"
+    "fmla v20.4s, v3.4s, v10.4s\n"
+    "fmla v18.4s, v2.4s, v10.4s\n"
+    "fmla v17.4s, v1.4s, v10.4s\n"
+    "fmla v16.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x25, x14]\n"
+    "fmla v31.4s, v3.4s, v9.4s\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla v27.4s, v0.4s, v9.4s\n"
+    "fmla v28.4s, v5.4s, v12.4s\n"
+    "fmla v24.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x23, x14]\n"
+    "fmla v23.4s, v6.4s, v11.4s\n"
+    "ldr x23, [x16, #0xb8]\n"
+    "fmla v19.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v31.4s, v5.4s, v10.4s\n"
+    "ldr x24, [x16, #0xb0]\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v3.4s, v10.4s\n"
+    "fmla v27.4s, v2.4s, v10.4s\n"
+    "fmla v26.4s, v1.4s, v10.4s\n"
+    "fmla v25.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x9, x14]\n"
+    "fmla v20.4s, v8.4s, v11.4s\n"
+    "ldr x9, [x16, #0xc8]\n"
+    "fmla v16.4s, v5.4s, v11.4s\n"
+    "ldr q11, [x10, x14]\n"
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "ldr x10, [x16, #0xc0]\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "fmla v26.4s, v2.4s, v12.4s\n"
+    "fmla v25.4s, v1.4s, v12.4s\n"
+    "fmla v24.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x27, x14]\n"
+    "fmla v19.4s, v7.4s, v11.4s\n"
+    "ldr x27, [x16, #0xd8]\n"
+    "fmla v18.4s, v6.4s, v11.4s\n"
+    "ldr q11, [x28, x14]\n"
+    "fmla v31.4s, v7.4s, v10.4s\n"
+    "ldr x28, [x16, #0xd0]\n"
+    "fmla v30.4s, v6.4s, v10.4s\n"
+    "fmla v27.4s, v4.4s, v10.4s\n"
+    "fmla v26.4s, v3.4s, v10.4s\n"
+    "fmla v23.4s, v1.4s, v10.4s\n"
+    "fmla v22.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x26, x14]\n"
+    "fmla v17.4s, v8.4s, v11.4s\n"
+    "ldr x26, [x16, #0xe0]\n"
+    "fmla v16.4s, v7.4s, v11.4s\n"
+    "ldr q11, [x25, x14]\n"
+    "fmla v29.4s, v8.4s, v12.4s\n"
+    "ldr x25, [x16, #0xe8]\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "fmla v25.4s, v5.4s, v12.4s\n"
+    "fmla v24.4s, v4.4s, v12.4s\n"
+    "fmla v21.4s, v2.4s, v12.4s\n"
+    "fmla v20.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x24, x14]\n"
+    "fmla v31.4s, v2.4s, v10.4s\n"
+    "ldr x24, [x16, #0xf0]\n"
+    "fmla v30.4s, v1.4s, v10.4s\n"
+    "fmla v29.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x23, x14]\n"
+    "fmla v27.4s, v7.4s, v11.4s\n"
+    "ldr x23, [x16, #0xf8]\n"
+    "fmla v26.4s, v6.4s, v11.4s\n"
+    "fmla v23.4s, v4.4s, v11.4s\n"
+    "fmla v22.4s, v3.4s, v11.4s\n"
+    "fmla v19.4s, v1.4s, v11.4s\n"
+    "fmla v18.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x10, x14]\n"
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "ldr x10, [x16, #0x100]\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "fmla v28.4s, v0.4s, v12.4s\n"
+    "ldr q12, [x9, x14]\n"
+    "fmla v31.4s, v6.4s, v10.4s\n"
+    "ldr x9, [x16, #0x108]\n"
+    "fmla v27.4s, v3.4s, v10.4s\n"
+    "fmla v23.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x28, x14]\n"
+    "fmla v25.4s, v8.4s, v11.4s\n"
+    "ldr x28, [x16, #0x110]\n"
+    "fmla v24.4s, v7.4s, v11.4s\n"
+    "fmla v21.4s, v5.4s, v11.4s\n"
+    "fmla v20.4s, v4.4s, v11.4s\n"
+    "fmla v17.4s, v2.4s, v11.4s\n"
+    "fmla v16.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x27, x14]\n"
+    "fmla v28.4s, v8.4s, v12.4s\n"
+    "ldr x27, [x16, #0x118]\n"
+    "fmla v24.4s, v5.4s, v12.4s\n"
+    "fmla v20.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x26, x14]\n"
+    "fmla v27.4s, v6.4s, v10.4s\n"
+    "fmla v23.4s, v3.4s, v10.4s\n"
+    "fmla v19.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x25, x14]\n"
+    "fmla v22.4s, v7.4s, v11.4s\n"
+    "fmla v21.4s, v6.4s, v11.4s\n"
+    "fmla v23.4s, v8.4s, v11.4s\n"
+    "fmla v19.4s, v5.4s, v11.4s\n"
+    "fmla v18.4s, v4.4s, v11.4s\n"
+    "fmla v17.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x24, x14]\n"
+    "fmla v24.4s, v8.4s, v12.4s\n"
+    "fmla v20.4s, v5.4s, v12.4s\n"
+    "fmla v16.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x23, x14]\n"
+    "fmla v19.4s, v8.4s, v10.4s\n"
+    "fmla v18.4s, v7.4s, v10.4s\n"
+    "fmla v17.4s, v6.4s, v10.4s\n"
+    "ldr q10, [x10, x14]\n"
+    "fmla v22.4s, v8.4s, v11.4s\n"
+    "fmla v21.4s, v7.4s, v11.4s\n"
+    "fmla v20.4s, v6.4s, v11.4s\n"
+    "fmla v18.4s, v5.4s, v11.4s\n"
+    "fmla v17.4s, v4.4s, v11.4s\n"
+    "fmla v16.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x9, x14]\n"
+    "fmla v31.4s, v4.4s, v10.4s\n"
+    "fmla v18.4s, v8.4s, v12.4s\n"
+    "fmla v17.4s, v7.4s, v12.4s\n"
+    "fmla v16.4s, v6.4s, v12.4s\n"
+    "ldr q12, [x28, x14]\n"
+    "fmla v30.4s, v3.4s, v10.4s\n"
+    "fmla v27.4s, v1.4s, v10.4s\n"
+    "fmla v26.4s, v0.4s, v10.4s\n"
+    "ldr q10, [x27, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v29.4s, v5.4s, v11.4s\n"
+    "fmla v28.4s, v4.4s, v11.4s\n"
+    "fmla v25.4s, v2.4s, v11.4s\n"
+    "fmla v24.4s, v1.4s, v11.4s\n"
+    "fmla v23.4s, v7.4s, v12.4s\n"
+    "fmla v22.4s, v6.4s, v12.4s\n"
+    "fmla v19.4s, v4.4s, v12.4s\n"
+    "fmla v18.4s, v3.4s, v12.4s\n"
+    "fmla v21.4s, v8.4s, v10.4s\n"
+    "fmla v20.4s, v7.4s, v10.4s\n"
+    "fmla v17.4s, v5.4s, v10.4s\n"
+    "fmla v16.4s, v4.4s, v10.4s\n"
+    "fmax v31.4s, v31.4s, v15.4s\n"
+    "fmax v30.4s, v30.4s, v15.4s\n"
+    "fmax v29.4s, v29.4s, v15.4s\n"
+    "fmin v31.4s, v31.4s, v14.4s\n"
+    "str q31, [x22, x12]\n"
+    "fmin v30.4s, v30.4s, v14.4s\n"
+    "fmin v29.4s, v29.4s, v14.4s\n"
+    "ldr x22, [x17, #0x20]\n"
+    "fmax v28.4s, v28.4s, v15.4s\n"
+    "str q30, [x21, x12]\n"
+    "fmax v27.4s, v27.4s, v15.4s\n"
+    "fmax v26.4s, v26.4s, v15.4s\n"
+    "str q29, [x20, x12]\n"
+    "fmin v28.4s, v28.4s, v14.4s\n"
+    "ldr x21, [x17, #0x28]\n"
+    "fmax v25.4s, v25.4s, v15.4s\n"
+    "ldr x20, [x17, #0x30]\n"
+    "fmin v27.4s, v27.4s, v14.4s\n"
+    "str q28, [x19, x12]\n"
+    "fmin v26.4s, v26.4s, v14.4s\n"
+    "ldr x19, [x17, #0x38]\n"
+    "fmin v25.4s, v25.4s, v14.4s\n"
+    "str q27, [x22, x12]\n"
+    "fmax v24.4s, v24.4s, v15.4s\n"
+    "str q26, [x21, x12]\n"
+    "fmax v23.4s, v23.4s, v15.4s\n"
+    "str q25, [x20, x12]\n"
+    "fmax v22.4s, v22.4s, v15.4s\n"
+    "ldr x22, [x17, #0x40]\n"
+    "fmin v24.4s, v24.4s, v14.4s\n"
+    "ldr x21, [x17, #0x48]\n"
+    "fmin v23.4s, v23.4s, v14.4s\n"
+    "ldr x20, [x17, #0x50]\n"
+    "fmin v22.4s, v22.4s, v14.4s\n"
+    "str q24, [x19, x12]\n"
+    "fmax v21.4s, v21.4s, v15.4s\n"
+    "str q23, [x22, x12]\n"
+    "fmax v20.4s, v20.4s, v15.4s\n"
+    "str q22, [x21, x12]\n"
+    "fmax v19.4s, v19.4s, v15.4s\n"
+    "ldr x19, [x17, #0x58]\n"
+    "fmin v21.4s, v21.4s, v14.4s\n"
+    "ldr x22, [x17, #0x60]\n"
+    "fmin v20.4s, v20.4s, v14.4s\n"
+    "ldr x21, [x17, #0x68]\n"
+    "fmin v19.4s, v19.4s, v14.4s\n"
+    "str q21, [x20, x12]\n"
+    "fmax v18.4s, v18.4s, v15.4s\n"
+    "str q20, [x19, x12]\n"
+    "fmax v17.4s, v17.4s, v15.4s\n"
+    "str q19, [x22, x12]\n"
+    "fmax v16.4s, v16.4s, v15.4s\n"
+    "ldr x20, [x17, #0x70]\n"
+    "fmin v18.4s, v18.4s, v14.4s\n"
+    "ldr x19, [x17, #0x78]\n"
+    "fmin v17.4s, v17.4s, v14.4s\n"
+    "str q18, [x21, x12]\n"
+    "fmin v16.4s, v16.4s, v14.4s\n"
+    "str q17, [x20, x12]\n"
+    "str q16, [x19, x12]\n"
+    "3:"  // Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 72f\n"
+    "ldr q13, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "mov x12, x14\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "ldr x10, [x16, #0x0]\n"
+    "add x10, x10, x14\n"
+    "ldr x9, [x16, #0x8]\n"
+    "ldr x28, [x16, #0x10]\n"
+    "add x9, x9, x14\n"
+    "ldr x27, [x16, #0x18]\n"
+    "add x28, x28, x14\n"
+    "add x27, x27, x14\n"
+    "tbz %x[n_channels], #1, 4f\n"
+    "ld1 { v9.d }[0], [x10], #0x8\n"
+    "ld1 { v10.d }[0], [x9], #0x8\n"
+    "ld1 { v11.d }[0], [x28], #0x8\n"
+    "ld1 { v12.d }[0], [x27], #0x8\n"
+    "tbz %x[n_channels], #0, 5f\n"
+    "ld1 { v9.s }[2], [x10], #0x4\n"
+    "ld1 { v10.s }[2], [x9], #0x4\n"
+    "ld1 { v11.s }[2], [x28], #0x4\n"
+    "ld1 { v12.s }[2], [x27], #0x4\n"
+    "b 5f\n"
+    "4:"  // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+    "ld1 { v9.s }[0], [x10], #0x4\n"
+    "ld1 { v10.s }[0], [x9], #0x4\n"
+    "ld1 { v11.s }[0], [x28], #0x4\n"
+    "ld1 { v12.s }[0], [x27], #0x4\n"
+    "5:"  // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+    "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "ldr x26, [x16, #0x20]\n"
+    "add x26, x26, x14\n"
+    "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+    "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+    "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+    "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+    "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+    "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+    "fmla v30.4s, v8.4s, v12.4s\n"
+    "fmla v29.4s, v7.4s, v12.4s\n"
+    "fmla v26.4s, v5.4s, v12.4s\n"
+    "fmla v28.4s, v6.4s, v12.4s\n"
+    "fmla v25.4s, v4.4s, v12.4s\n"
+    "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+    "fmla v22.4s, v2.4s, v12.4s\n"
+    "fmla v21.4s, v1.4s, v12.4s\n"
+    "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v10.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v10.s }[2], [x26], #0x4\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load input (5, 0): Bit 1: Unset
+    "ld1 { v10.s }[0], [x26], #0x4\n"
+    "7:"  // Oddments: Load input (5, 0): Bit 1: End
+    "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n"
+    "ldr x25, [x16, #0x28]\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v11.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v11.s }[2], [x25], #0x4\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load input (5, 5): Bit 1: Unset
+    "ld1 { v11.s }[0], [x25], #0x4\n"
+    "9:"  // Oddments: Load input (5, 5): Bit 1: End
+    "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n"
+    "ldr x24, [x16, #0x30]\n"
+    "add x24, x24, x14\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v9.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v9.s }[2], [x24], #0x4\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load input (3, 2): Bit 1: Unset
+    "ld1 { v9.s }[0], [x24], #0x4\n"
+    "11:"  // Oddments: Load input (3, 2): Bit 1: End
+    "fmla v27.4s, v8.4s, v9.4s\n"
+    "ldr x23, [x16, #0x38]\n"
+    "fmla v26.4s, v7.4s, v9.4s\n"
+    "add x23, x23, x14\n"
+    "fmla v25.4s, v6.4s, v9.4s\n"
+    "fmla v23.4s, v5.4s, v9.4s\n"
+    "fmla v22.4s, v4.4s, v9.4s\n"
+    "fmla v21.4s, v3.4s, v9.4s\n"
+    "fmla v19.4s, v2.4s, v9.4s\n"
+    "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n"
+    "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ld1 { v12.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v12.s }[2], [x23], #0x4\n"
+    "b 13f\n"
+    "12:"  // Oddments: Load input (0, 1): Bit 1: Unset
+    "ld1 { v12.s }[0], [x23], #0x4\n"
+    "13:"  // Oddments: Load input (0, 1): Bit 1: End
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "ldr x10, [x16, #0x40]\n"
+    "fmla v30.4s, v0.4s, v12.4s\n"
+    "add x10, x10, x14\n"
+    "tbz %x[n_channels], #1, 14f\n"
+    "ld1 { v11.d }[0], [x10], #0x8\n"
+    "tbz %x[n_channels], #0, 15f\n"
+    "ld1 { v11.s }[2], [x10], #0x4\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load input (0, 4): Bit 1: Unset
+    "ld1 { v11.s }[0], [x10], #0x4\n"
+    "15:"  // Oddments: Load input (0, 4): Bit 1: End
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "ldr x9, [x16, #0x48]\n"
+    "fmla v28.4s, v1.4s, v11.4s\n"
+    "add x9, x9, x14\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v10.d }[0], [x9], #0x8\n"
+    "tbz %x[n_channels], #0, 17f\n"
+    "ld1 { v10.s }[2], [x9], #0x4\n"
+    "b 17f\n"
+    "16:"  // Oddments: Load input (3, 3): Bit 1: Unset
+    "ld1 { v10.s }[0], [x9], #0x4\n"
+    "17:"  // Oddments: Load input (3, 3): Bit 1: End
+    "fmla v26.4s, v8.4s, v10.4s\n"
+    "ldr x28, [x16, #0x50]\n"
+    "fmla v25.4s, v7.4s, v10.4s\n"
+    "add x28, x28, x14\n"
+    "fmla v24.4s, v6.4s, v10.4s\n"
+    "fmla v22.4s, v5.4s, v10.4s\n"
+    "fmla v21.4s, v4.4s, v10.4s\n"
+    "fmla v20.4s, v3.4s, v10.4s\n"
+    "fmla v18.4s, v2.4s, v10.4s\n"
+    "fmla v17.4s, v1.4s, v10.4s\n"
+    "fmla v16.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v9.d }[0], [x28], #0x8\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v9.s }[2], [x28], #0x4\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load input (1, 0): Bit 1: Unset
+    "ld1 { v9.s }[0], [x28], #0x4\n"
+    "19:"  // Oddments: Load input (1, 0): Bit 1: End
+    "fmla v31.4s, v3.4s, v9.4s\n"
+    "ldr x27, [x16, #0x58]\n"
+    "fmla v27.4s, v0.4s, v9.4s\n"
+    "add x27, x27, x14\n"
+    "tbz %x[n_channels], #1, 20f\n"
+    "ld1 { v12.d }[0], [x27], #0x8\n"
+    "tbz %x[n_channels], #0, 21f\n"
+    "ld1 { v12.s }[2], [x27], #0x4\n"
+    "b 21f\n"
+    "20:"  // Oddments: Load input (1, 5): Bit 1: Unset
+    "ld1 { v12.s }[0], [x27], #0x4\n"
+    "21:"  // Oddments: Load input (1, 5): Bit 1: End
+    "fmla v28.4s, v5.4s, v12.4s\n"
+    "ldr x26, [x16, #0x60]\n"
+    "fmla v24.4s, v2.4s, v12.4s\n"
+    "add x26, x26, x14\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "ld1 { v11.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "ld1 { v11.s }[2], [x26], #0x4\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load input (4, 0): Bit 1: Unset
+    "ld1 { v11.s }[0], [x26], #0x4\n"
+    "23:"  // Oddments: Load input (4, 0): Bit 1: End
+    "fmla v23.4s, v6.4s, v11.4s\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla v19.4s, v3.4s, v11.4s\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 24f\n"
+    "ld1 { v10.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 25f\n"
+    "ld1 { v10.s }[2], [x25], #0x4\n"
+    "b 25f\n"
+    "24:"  // Oddments: Load input (1, 2): Bit 1: Unset
+    "ld1 { v10.s }[0], [x25], #0x4\n"
+    "25:"  // Oddments: Load input (1, 2): Bit 1: End
+    "fmla v31.4s, v5.4s, v10.4s\n"
+    "ldr x24, [x16, #0x70]\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "add x24, x24, x14\n"
+    "fmla v29.4s, v3.4s, v10.4s\n"
+    "fmla v27.4s, v2.4s, v10.4s\n"
+    "fmla v26.4s, v1.4s, v10.4s\n"
+    "fmla v25.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 26f\n"
+    "ld1 { v11.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 27f\n"
+    "ld1 { v11.s }[2], [x24], #0x4\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load input (4, 5): Bit 1: Unset
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "27:"  // Oddments: Load input (4, 5): Bit 1: End
+    "fmla v20.4s, v8.4s, v11.4s\n"
+    "ldr x23, [x16, #0x78]\n"
+    "fmla v16.4s, v5.4s, v11.4s\n"
+    "add x23, x23, x14\n"
+    "tbz %x[n_channels], #1, 28f\n"
+    "ld1 { v12.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 29f\n"
+    "ld1 { v12.s }[2], [x23], #0x4\n"
+    "b 29f\n"
+    "28:"  // Oddments: Load input (1, 3): Bit 1: Unset
+    "ld1 { v12.s }[0], [x23], #0x4\n"
+    "29:"  // Oddments: Load input (1, 3): Bit 1: End
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "ldr x10, [x16, #0x80]\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "add x10, x10, x14\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "fmla v26.4s, v2.4s, v12.4s\n"
+    "fmla v25.4s, v1.4s, v12.4s\n"
+    "fmla v24.4s, v0.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 30f\n"
+    "ld1 { v11.d }[0], [x10], #0x8\n"
+    "tbz %x[n_channels], #0, 31f\n"
+    "ld1 { v11.s }[2], [x10], #0x4\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load input (5, 1): Bit 1: Unset
+    "ld1 { v11.s }[0], [x10], #0x4\n"
+    "31:"  // Oddments: Load input (5, 1): Bit 1: End
+    "fmla v19.4s, v7.4s, v11.4s\n"
+    "ldr x9, [x16, #0x88]\n"
+    "fmla v18.4s, v6.4s, v11.4s\n"
+    "add x9, x9, x14\n"
+    "tbz %x[n_channels], #1, 32f\n"
+    "ld1 { v10.d }[0], [x9], #0x8\n"
+    "tbz %x[n_channels], #0, 33f\n"
+    "ld1 { v10.s }[2], [x9], #0x4\n"
+    "b 33f\n"
+    "32:"  // Oddments: Load input (2, 1): Bit 1: Unset
+    "ld1 { v10.s }[0], [x9], #0x4\n"
+    "33:"  // Oddments: Load input (2, 1): Bit 1: End
+    "fmla v31.4s, v7.4s, v10.4s\n"
+    "ldr x28, [x16, #0x90]\n"
+    "fmla v30.4s, v6.4s, v10.4s\n"
+    "add x28, x28, x14\n"
+    "fmla v27.4s, v4.4s, v10.4s\n"
+    "fmla v26.4s, v3.4s, v10.4s\n"
+    "fmla v23.4s, v1.4s, v10.4s\n"
+    "fmla v22.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 34f\n"
+    "ld1 { v11.d }[0], [x28], #0x8\n"
+    "tbz %x[n_channels], #0, 35f\n"
+    "ld1 { v11.s }[2], [x28], #0x4\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load input (5, 4): Bit 1: Unset
+    "ld1 { v11.s }[0], [x28], #0x4\n"
+    "35:"  // Oddments: Load input (5, 4): Bit 1: End
+    "fmla v17.4s, v8.4s, v11.4s\n"
+    "ldr x27, [x16, #0x98]\n"
+    "fmla v16.4s, v7.4s, v11.4s\n"
+    "add x27, x27, x14\n"
+    "tbz %x[n_channels], #1, 36f\n"
+    "ld1 { v12.d }[0], [x27], #0x8\n"
+    "tbz %x[n_channels], #0, 37f\n"
+    "ld1 { v12.s }[2], [x27], #0x4\n"
+    "b 37f\n"
+    "36:"  // Oddments: Load input (2, 4): Bit 1: Unset
+    "ld1 { v12.s }[0], [x27], #0x4\n"
+    "37:"  // Oddments: Load input (2, 4): Bit 1: End
+    "fmla v29.4s, v8.4s, v12.4s\n"
+    "ldr x26, [x16, #0xa0]\n"
+    "fmla v28.4s, v7.4s, v12.4s\n"
+    "add x26, x26, x14\n"
+    "fmla v25.4s, v5.4s, v12.4s\n"
+    "fmla v24.4s, v4.4s, v12.4s\n"
+    "fmla v21.4s, v2.4s, v12.4s\n"
+    "fmla v20.4s, v1.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 38f\n"
+    "ld1 { v10.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 39f\n"
+    "ld1 { v10.s }[2], [x26], #0x4\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load input (0, 2): Bit 1: Unset
+    "ld1 { v10.s }[0], [x26], #0x4\n"
+    "39:"  // Oddments: Load input (0, 2): Bit 1: End
+    "fmla v31.4s, v2.4s, v10.4s\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla v30.4s, v1.4s, v10.4s\n"
+    "add x25, x25, x14\n"
+    "fmla v29.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 40f\n"
+    "ld1 { v11.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 41f\n"
+    "ld1 { v11.s }[2], [x25], #0x4\n"
+    "b 41f\n"
+    "40:"  // Oddments: Load input (3, 1): Bit 1: Unset
+    "ld1 { v11.s }[0], [x25], #0x4\n"
+    "41:"  // Oddments: Load input (3, 1): Bit 1: End
+    "fmla v27.4s, v7.4s, v11.4s\n"
+    "ldr x24, [x16, #0xb0]\n"
+    "fmla v26.4s, v6.4s, v11.4s\n"
+    "add x24, x24, x14\n"
+    "fmla v23.4s, v4.4s, v11.4s\n"
+    "fmla v22.4s, v3.4s, v11.4s\n"
+    "fmla v19.4s, v1.4s, v11.4s\n"
+    "fmla v18.4s, v0.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 42f\n"
+    "ld1 { v12.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 43f\n"
+    "ld1 { v12.s }[2], [x24], #0x4\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load input (0, 3): Bit 1: Unset
+    "ld1 { v12.s }[0], [x24], #0x4\n"
+    "43:"  // Oddments: Load input (0, 3): Bit 1: End
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "ldr x23, [x16, #0xb8]\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "add x23, x23, x14\n"
+    "fmla v28.4s, v0.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 44f\n"
+    "ld1 { v10.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 45f\n"
+    "ld1 { v10.s }[2], [x23], #0x4\n"
+    "b 45f\n"
+    "44:"  // Oddments: Load input (2, 0): Bit 1: Unset
+    "ld1 { v10.s }[0], [x23], #0x4\n"
+    "45:"  // Oddments: Load input (2, 0): Bit 1: End
+    "fmla v31.4s, v6.4s, v10.4s\n"
+    "ldr x10, [x16, #0xc0]\n"
+    "fmla v27.4s, v3.4s, v10.4s\n"
+    "add x10, x10, x14\n"
+    "fmla v23.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 46f\n"
+    "ld1 { v11.d }[0], [x10], #0x8\n"
+    "tbz %x[n_channels], #0, 47f\n"
+    "ld1 { v11.s }[2], [x10], #0x4\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load input (3, 4): Bit 1: Unset
+    "ld1 { v11.s }[0], [x10], #0x4\n"
+    "47:"  // Oddments: Load input (3, 4): Bit 1: End
+    "fmla v25.4s, v8.4s, v11.4s\n"
+    "ldr x9, [x16, #0xc8]\n"
+    "fmla v24.4s, v7.4s, v11.4s\n"
+    "add x9, x9, x14\n"
+    "fmla v21.4s, v5.4s, v11.4s\n"
+    "fmla v20.4s, v4.4s, v11.4s\n"
+    "fmla v17.4s, v2.4s, v11.4s\n"
+    "fmla v16.4s, v1.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 48f\n"
+    "ld1 { v12.d }[0], [x9], #0x8\n"
+    "tbz %x[n_channels], #0, 49f\n"
+    "ld1 { v12.s }[2], [x9], #0x4\n"
+    "b 49f\n"
+    "48:"  // Oddments: Load input (2, 5): Bit 1: Unset
+    "ld1 { v12.s }[0], [x9], #0x4\n"
+    "49:"  // Oddments: Load input (2, 5): Bit 1: End
+    "fmla v28.4s, v8.4s, v12.4s\n"
+    "ldr x28, [x16, #0xd0]\n"
+    "fmla v24.4s, v5.4s, v12.4s\n"
+    "add x28, x28, x14\n"
+    "fmla v20.4s, v2.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 50f\n"
+    "ld1 { v10.d }[0], [x28], #0x8\n"
+    "tbz %x[n_channels], #0, 51f\n"
+    "ld1 { v10.s }[2], [x28], #0x4\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load input (3, 0): Bit 1: Unset
+    "ld1 { v10.s }[0], [x28], #0x4\n"
+    "51:"  // Oddments: Load input (3, 0): Bit 1: End
+    "fmla v27.4s, v6.4s, v10.4s\n"
+    "ldr x27, [x16, #0xd8]\n"
+    "fmla v23.4s, v3.4s, v10.4s\n"
+    "add x27, x27, x14\n"
+    "fmla v19.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 52f\n"
+    "ld1 { v11.d }[0], [x27], #0x8\n"
+    "tbz %x[n_channels], #0, 53f\n"
+    "ld1 { v11.s }[2], [x27], #0x4\n"
+    "b 53f\n"
+    "52:"  // Oddments: Load input (4, 2): Bit 1: Unset
+    "ld1 { v11.s }[0], [x27], #0x4\n"
+    "53:"  // Oddments: Load input (4, 2): Bit 1: End
+    "fmla v23.4s, v8.4s, v11.4s\n"
+    "ldr x26, [x16, #0xe0]\n"
+    "fmla v22.4s, v7.4s, v11.4s\n"
+    "add x26, x26, x14\n"
+    "fmla v21.4s, v6.4s, v11.4s\n"
+    "fmla v19.4s, v5.4s, v11.4s\n"
+    "fmla v18.4s, v4.4s, v11.4s\n"
+    "fmla v17.4s, v3.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 54f\n"
+    "ld1 { v12.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 55f\n"
+    "ld1 { v12.s }[2], [x26], #0x4\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load input (3, 5): Bit 1: Unset
+    "ld1 { v12.s }[0], [x26], #0x4\n"
+    "55:"  // Oddments: Load input (3, 5): Bit 1: End
+    "fmla v24.4s, v8.4s, v12.4s\n"
+    "ldr x25, [x16, #0xe8]\n"
+    "fmla v20.4s, v5.4s, v12.4s\n"
+    "add x25, x25, x14\n"
+    "fmla v16.4s, v2.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 56f\n"
+    "ld1 { v10.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 57f\n"
+    "ld1 { v10.s }[2], [x25], #0x4\n"
+    "b 57f\n"
+    "56:"  // Oddments: Load input (5, 2): Bit 1: Unset
+    "ld1 { v10.s }[0], [x25], #0x4\n"
+    "57:"  // Oddments: Load input (5, 2): Bit 1: End
+    "fmla v19.4s, v8.4s, v10.4s\n"
+    "ldr x24, [x16, #0xf0]\n"
+    "fmla v18.4s, v7.4s, v10.4s\n"
+    "add x24, x24, x14\n"
+    "fmla v17.4s, v6.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 58f\n"
+    "ld1 { v11.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 59f\n"
+    "ld1 { v11.s }[2], [x24], #0x4\n"
+    "b 59f\n"
+    "58:"  // Oddments: Load input (4, 3): Bit 1: Unset
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "59:"  // Oddments: Load input (4, 3): Bit 1: End
+    "fmla v22.4s, v8.4s, v11.4s\n"
+    "ldr x23, [x16, #0xf8]\n"
+    "fmla v21.4s, v7.4s, v11.4s\n"
+    "add x23, x23, x14\n"
+    "fmla v20.4s, v6.4s, v11.4s\n"
+    "fmla v18.4s, v5.4s, v11.4s\n"
+    "fmla v17.4s, v4.4s, v11.4s\n"
+    "fmla v16.4s, v3.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 60f\n"
+    "ld1 { v12.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 61f\n"
+    "ld1 { v12.s }[2], [x23], #0x4\n"
+    "b 61f\n"
+    "60:"  // Oddments: Load input (5, 3): Bit 1: Unset
+    "ld1 { v12.s }[0], [x23], #0x4\n"
+    "61:"  // Oddments: Load input (5, 3): Bit 1: End
+    "fmla v18.4s, v8.4s, v12.4s\n"
+    "ldr x10, [x16, #0x100]\n"
+    "fmla v17.4s, v7.4s, v12.4s\n"
+    "add x10, x10, x14\n"
+    "fmla v16.4s, v6.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 62f\n"
+    "ld1 { v10.d }[0], [x10], #0x8\n"
+    "tbz %x[n_channels], #0, 63f\n"
+    "ld1 { v10.s }[2], [x10], #0x4\n"
+    "b 63f\n"
+    "62:"  // Oddments: Load input (1, 1): Bit 1: Unset
+    "ld1 { v10.s }[0], [x10], #0x4\n"
+    "63:"  // Oddments: Load input (1, 1): Bit 1: End
+    "fmla v31.4s, v4.4s, v10.4s\n"
+    "ldr x9, [x16, #0x108]\n"
+    "fmla v30.4s, v3.4s, v10.4s\n"
+    "add x9, x9, x14\n"
+    "fmla v27.4s, v1.4s, v10.4s\n"
+    "fmla v26.4s, v0.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 64f\n"
+    "ld1 { v11.d }[0], [x9], #0x8\n"
+    "tbz %x[n_channels], #0, 65f\n"
+    "ld1 { v11.s }[2], [x9], #0x4\n"
+    "b 65f\n"
+    "64:"  // Oddments: Load input (1, 4): Bit 1: Unset
+    "ld1 { v11.s }[0], [x9], #0x4\n"
+    "65:"  // Oddments: Load input (1, 4): Bit 1: End
+    "fmla v29.4s, v5.4s, v11.4s\n"
+    "ldr x28, [x16, #0x110]\n"
+    "fmla v28.4s, v4.4s, v11.4s\n"
+    "add x28, x28, x14\n"
+    "fmla v25.4s, v2.4s, v11.4s\n"
+    "fmla v24.4s, v1.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 66f\n"
+    "ld1 { v12.d }[0], [x28], #0x8\n"
+    "tbz %x[n_channels], #0, 67f\n"
+    "ld1 { v12.s }[2], [x28], #0x4\n"
+    "b 67f\n"
+    "66:"  // Oddments: Load input (4, 1): Bit 1: Unset
+    "ld1 { v12.s }[0], [x28], #0x4\n"
+    "67:"  // Oddments: Load input (4, 1): Bit 1: End
+    "fmla v23.4s, v7.4s, v12.4s\n"
+    "ldr x27, [x16, #0x118]\n"
+    "fmla v22.4s, v6.4s, v12.4s\n"
+    "add x27, x27, x14\n"
+    "fmla v19.4s, v4.4s, v12.4s\n"
+    "fmla v18.4s, v3.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 68f\n"
+    "ld1 { v10.d }[0], [x27], #0x8\n"
+    "tbz %x[n_channels], #0, 69f\n"
+    "ld1 { v10.s }[2], [x27], #0x4\n"
+    "b 69f\n"
+    "68:"  // Oddments: Load input (4, 4): Bit 1: Unset
+    "ld1 { v10.s }[0], [x27], #0x4\n"
+    "69:"  // Oddments: Load input (4, 4): Bit 1: End
+    "fmla v21.4s, v8.4s, v10.4s\n"
+    "fmla v20.4s, v7.4s, v10.4s\n"
+    "fmla v17.4s, v5.4s, v10.4s\n"
+    "fmla v16.4s, v4.4s, v10.4s\n"
+    "fmax v31.4s, v31.4s, v15.4s\n"
+    "fmax v30.4s, v30.4s, v15.4s\n"
+    "fmax v29.4s, v29.4s, v15.4s\n"
+    "fmin v31.4s, v31.4s, v14.4s\n"
+    "fmin v30.4s, v30.4s, v14.4s\n"
+    "fmin v29.4s, v29.4s, v14.4s\n"
+    "fmax v28.4s, v28.4s, v15.4s\n"
+    "fmax v27.4s, v27.4s, v15.4s\n"
+    "fmax v26.4s, v26.4s, v15.4s\n"
+    "fmin v28.4s, v28.4s, v14.4s\n"
+    "fmin v27.4s, v27.4s, v14.4s\n"
+    "fmin v26.4s, v26.4s, v14.4s\n"
+    "fmax v25.4s, v25.4s, v15.4s\n"
+    "fmax v24.4s, v24.4s, v15.4s\n"
+    "fmax v23.4s, v23.4s, v15.4s\n"
+    "fmin v25.4s, v25.4s, v14.4s\n"
+    "fmin v24.4s, v24.4s, v14.4s\n"
+    "fmin v23.4s, v23.4s, v14.4s\n"
+    "fmax v22.4s, v22.4s, v15.4s\n"
+    "fmax v21.4s, v21.4s, v15.4s\n"
+    "fmax v20.4s, v20.4s, v15.4s\n"
+    "fmin v22.4s, v22.4s, v14.4s\n"
+    "fmin v21.4s, v21.4s, v14.4s\n"
+    "fmin v20.4s, v20.4s, v14.4s\n"
+    "fmax v19.4s, v19.4s, v15.4s\n"
+    "fmax v18.4s, v18.4s, v15.4s\n"
+    "fmax v17.4s, v17.4s, v15.4s\n"
+    "fmin v19.4s, v19.4s, v14.4s\n"
+    "fmin v18.4s, v18.4s, v14.4s\n"
+    "fmin v17.4s, v17.4s, v14.4s\n"
+    "fmax v16.4s, v16.4s, v15.4s\n"
+    "fmin v16.4s, v16.4s, v14.4s\n"
+    "tbz %x[n_channels], #1, 70f\n"
+    "ldr x22, [x17, #0x0]\n"
+    "ldr x21, [x17, #0x8]\n"
+    "add x22, x22, x12\n"
+    "ldr x20, [x17, #0x10]\n"
+    "ldr x19, [x17, #0x18]\n"
+    "add x21, x21, x12\n"
+    "st1 { v31.d }[0], [x22]\n"
+    "add x20, x20, x12\n"
+    "st1 { v30.d }[0], [x21]\n"
+    "ldr x22, [x17, #0x20]\n"
+    "add x19, x19, x12\n"
+    "st1 { v29.d }[0], [x20]\n"
+    "add x22, x22, x12\n"
+    "st1 { v28.d }[0], [x19]\n"
+    "ldr x21, [x17, #0x28]\n"
+    "add x21, x21, x12\n"
+    "st1 { v27.d }[0], [x22]\n"
+    "ldr x20, [x17, #0x30]\n"
+    "add x20, x20, x12\n"
+    "st1 { v26.d }[0], [x21]\n"
+    "ldr x19, [x17, #0x38]\n"
+    "add x19, x19, x12\n"
+    "st1 { v25.d }[0], [x20]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "add x22, x22, x12\n"
+    "st1 { v24.d }[0], [x19]\n"
+    "ldr x21, [x17, #0x48]\n"
+    "add x21, x21, x12\n"
+    "st1 { v23.d }[0], [x22]\n"
+    "ldr x20, [x17, #0x50]\n"
+    "add x20, x20, x12\n"
+    "st1 { v22.d }[0], [x21]\n"
+    "ldr x19, [x17, #0x58]\n"
+    "add x19, x19, x12\n"
+    "st1 { v21.d }[0], [x20]\n"
+    "ldr x22, [x17, #0x60]\n"
+    "add x22, x22, x12\n"
+    "st1 { v20.d }[0], [x19]\n"
+    "ldr x21, [x17, #0x68]\n"
+    "add x21, x21, x12\n"
+    "st1 { v19.d }[0], [x22]\n"
+    "ldr x20, [x17, #0x70]\n"
+    "add x20, x20, x12\n"
+    "st1 { v18.d }[0], [x21]\n"
+    "ldr x19, [x17, #0x78]\n"
+    "add x19, x19, x12\n"
+    "st1 { v17.d }[0], [x20]\n"
+    "add x12, x12, #0x8\n"
+    "st1 { v16.d }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 71f\n"
+    "ldr x22, [x17, #0x0]\n"
+    "ldr x21, [x17, #0x8]\n"
+    "add x22, x22, x12\n"
+    "ldr x20, [x17, #0x10]\n"
+    "ldr x19, [x17, #0x18]\n"
+    "add x21, x21, x12\n"
+    "st1 { v31.s }[2], [x22]\n"
+    "add x20, x20, x12\n"
+    "st1 { v30.s }[2], [x21]\n"
+    "ldr x22, [x17, #0x20]\n"
+    "add x19, x19, x12\n"
+    "st1 { v29.s }[2], [x20]\n"
+    "add x22, x22, x12\n"
+    "st1 { v28.s }[2], [x19]\n"
+    "ldr x21, [x17, #0x28]\n"
+    "add x21, x21, x12\n"
+    "st1 { v27.s }[2], [x22]\n"
+    "ldr x20, [x17, #0x30]\n"
+    "add x20, x20, x12\n"
+    "st1 { v26.s }[2], [x21]\n"
+    "ldr x19, [x17, #0x38]\n"
+    "add x19, x19, x12\n"
+    "st1 { v25.s }[2], [x20]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "add x22, x22, x12\n"
+    "st1 { v24.s }[2], [x19]\n"
+    "ldr x21, [x17, #0x48]\n"
+    "add x21, x21, x12\n"
+    "st1 { v23.s }[2], [x22]\n"
+    "ldr x20, [x17, #0x50]\n"
+    "add x20, x20, x12\n"
+    "st1 { v22.s }[2], [x21]\n"
+    "ldr x19, [x17, #0x58]\n"
+    "add x19, x19, x12\n"
+    "st1 { v21.s }[2], [x20]\n"
+    "ldr x22, [x17, #0x60]\n"
+    "add x22, x22, x12\n"
+    "st1 { v20.s }[2], [x19]\n"
+    "ldr x21, [x17, #0x68]\n"
+    "add x21, x21, x12\n"
+    "st1 { v19.s }[2], [x22]\n"
+    "ldr x20, [x17, #0x70]\n"
+    "add x20, x20, x12\n"
+    "st1 { v18.s }[2], [x21]\n"
+    "ldr x19, [x17, #0x78]\n"
+    "add x19, x19, x12\n"
+    "st1 { v17.s }[2], [x20]\n"
+    "st1 { v16.s }[2], [x19]\n"
+    "b 71f\n"
+    "70:"  // Oddments: Store: Bit 1: Unset
+    "ldr x22, [x17, #0x0]\n"
+    "add x22, x22, x12\n"
+    "ldr x21, [x17, #0x8]\n"
+    "ldr x20, [x17, #0x10]\n"
+    "add x21, x21, x12\n"
+    "st1 { v31.s }[0], [x22]\n"
+    "ldr x19, [x17, #0x18]\n"
+    "add x20, x20, x12\n"
+    "st1 { v30.s }[0], [x21]\n"
+    "add x19, x19, x12\n"
+    "st1 { v29.s }[0], [x20]\n"
+    "ldr x22, [x17, #0x20]\n"
+    "add x22, x22, x12\n"
+    "st1 { v28.s }[0], [x19]\n"
+    "ldr x21, [x17, #0x28]\n"
+    "add x21, x21, x12\n"
+    "st1 { v27.s }[0], [x22]\n"
+    "ldr x20, [x17, #0x30]\n"
+    "add x20, x20, x12\n"
+    "st1 { v26.s }[0], [x21]\n"
+    "ldr x19, [x17, #0x38]\n"
+    "add x19, x19, x12\n"
+    "st1 { v25.s }[0], [x20]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "add x22, x22, x12\n"
+    "st1 { v24.s }[0], [x19]\n"
+    "ldr x21, [x17, #0x48]\n"
+    "add x21, x21, x12\n"
+    "st1 { v23.s }[0], [x22]\n"
+    "ldr x20, [x17, #0x50]\n"
+    "add x20, x20, x12\n"
+    "st1 { v22.s }[0], [x21]\n"
+    "ldr x19, [x17, #0x58]\n"
+    "add x19, x19, x12\n"
+    "st1 { v21.s }[0], [x20]\n"
+    "ldr x22, [x17, #0x60]\n"
+    "add x22, x22, x12\n"
+    "st1 { v20.s }[0], [x19]\n"
+    "ldr x21, [x17, #0x68]\n"
+    "add x21, x21, x12\n"
+    "st1 { v19.s }[0], [x22]\n"
+    "ldr x20, [x17, #0x70]\n"
+    "add x20, x20, x12\n"
+    "st1 { v18.s }[0], [x21]\n"
+    "ldr x19, [x17, #0x78]\n"
+    "add x19, x19, x12\n"
+    "st1 { v17.s }[0], [x20]\n"
+    "st1 { v16.s }[0], [x19]\n"
+    "71:"  // Oddments: Store: Bit 1: End
+
+    "72:"  // End
+
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
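
The kernel above follows the depthfirst pattern used throughout these files: a
four-channel NEON main loop, a "Channel tail" for the last full vector, and an
"Oddments" section that covers the remaining n_channels % 4 channels by testing
individual bits of n_channels instead of looping. A minimal scalar sketch of
the oddments store, assuming illustrative names (acc, out, act_min and act_max
are not identifiers from this file):

    #include <algorithm>

    // Scalar model of "Oddments: Store": bit 1 of n_channels selects a
    // two-lane (.d) store, bit 0 a single-lane (.s) store; the assembly
    // clamps every accumulator with fmax/fmin before storing.
    static void store_oddments(const float *acc, float *out,
                               unsigned int n_channels,
                               float act_min, float act_max)
    {
      unsigned int c = 0;
      if (n_channels & 2)  // taken when "tbz %x[n_channels], #1, 70f" falls through
      {
        out[c]     = std::min(std::max(acc[c],     act_min), act_max);
        out[c + 1] = std::min(std::max(acc[c + 1], act_min), act_max);
        c += 2;
      }
      if (n_channels & 1)  // likewise for "tbz %x[n_channels], #0, 71f"
      {
        out[c] = std::min(std::max(acc[c], act_min), act_max);
      }
    }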
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..8eb5605
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
+  indirect_kern_type indirect_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+  a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
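
The tile geometry declared in this strategy ties together as input extent =
(output extent - 1) * stride + kernel extent, so the 2x2 output tile of this
3x3 stride-2 kernel consumes a 5x5 input patch. A compile-time sketch of that
relation (required_input is illustrative, not a library helper):

    // (2 - 1) * 2 + 3 == 5, matching input_rows/input_cols above.
    constexpr unsigned int required_input(unsigned int out_size,
                                          unsigned int kernel_size,
                                          unsigned int stride)
    {
      return (out_size - 1) * stride + kernel_size;
    }
    static_assert(required_input(2, 3, 2) == 5, "5x5 input patch per 2x2 output tile");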
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..4466ec1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,612 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
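+  // Note: tile_i and tile_j live in params_struct (rather than in registers
+  // pinned across the whole loop) so the assembly below can store and reload
+  // them between tile iterations; every other field is read through an
+  // "I"-constrained offsetof() immediate against the single params_struct
+  // pointer operand.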
+  __asm__ __volatile__(
+    "mov x6, #0x0\n"
+    "mov x27, #0x0\n"
+    "1:"  // Tile loop
+    "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x26, #0x4\n"
+    "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x25, #0x2\n"
+    "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x24, %x[params_struct], %[offsetof_args_min]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "add x21, %x[params_struct], %[offsetof_args_max]\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mov x22, #0x0\n"
+    "ldr x17, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x6, x23\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x27, x8, x19\n" // offset += tile_j * ld_input_col
+    "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+    "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x17, x17, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1r { v19.4s }, [x24]\n"
+    "add x14, x17, x23, LSL #2\n"
+    "ld1r { v18.4s }, [x21]\n"
+    "add x13, x14, x23, LSL #2\n"
+    "lsl x8, x8, #0x2\n"
+    "add x12, x13, x23, LSL #2\n"
+    "add x11, x12, x23, LSL #2\n"
+    "add x10, x8, x8\n"
+    "add x9, x10, x8\n"
+    "add x28, x9, x8\n"
+    "mul x19, x6, x20\n" // offset = tile_i * ld_output_row
+    "madd x19, x27, x16, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x25\n" // offset *= output_tile_size
+    "add x15, x15, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "add x27, x15, x20, LSL #2\n"
+    "lsl x16, x16, #0x2\n"
+    "mov x21, #0x10\n" // cntb _, ALL, #1
+    "sub x20, XZR, x21\n"
+    "lsr x19, %x[n_channels], #0x2\n"
+    "cbz x19, 4f\n"
+    "ldr q17, [x7, #0x0]\n"
+    "ldr q0, [x7, #0x10]\n"
+    "cmp x21, x19, LSL #4\n"
+    "ldr q1, [x7, #0x20]\n"
+    "ldr q2, [x7, #0x30]\n"
+    "ldr q3, [x7, #0x40]\n"
+    "ldr q4, [x7, #0x50]\n"
+    "ldr q5, [x7, #0x60]\n"
+    "ldr q6, [x7, #0x70]\n"
+    "ldr q7, [x7, #0x80]\n"
+    "ldr q8, [x7, #0x90]\n"
+    "add x7, x7, #0xa0\n"
+    "ldr q9, [x13, x10]\n"
+    "ld1 { v10.4s }, [x17]\n"
+    "ldr q11, [x17, x8]\n"
+    "ldr q12, [x17, x9]\n"
+    "ldr q13, [x17, x28]\n"
+    "ld1 { v14.4s }, [x14]\n"
+    "ldr q15, [x14, x8]\n"
+    "ldr q16, [x17, x10]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "add x20, x20, #0x10\n"
+    "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
+    "add x22, x22, #0x10\n"
+    "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+    "add x17, x17, #0x10\n"
+    "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+    "ldr q17, [x7, #0x0]\n"
+    "add x21, x21, #0x10\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ld1 { v10.4s }, [x17]\n"
+    "cmp x21, x19, LSL #4\n"
+    "fmla v30.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x14, x28]\n"
+    "fmla v31.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x14, x9]\n"
+    "fmla v30.4s, v2.4s, v13.4s\n"
+    "ldr q13, [x14, x10]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.4s, v3.4s, v14.4s\n"
+    "ld1 { v14.4s }, [x12]\n"
+    "fmla v30.4s, v0.4s, v16.4s\n"
+    "fmla v31.4s, v4.4s, v15.4s\n"
+    "ld1 { v15.4s }, [x13]\n"
+    "fmla v29.4s, v3.4s, v14.4s\n"
+    "ldr q14, [x12, x28]\n"
+    "fmla v30.4s, v4.4s, v11.4s\n"
+    "ldr q11, [x12, x8]\n"
+    "fmla v31.4s, v2.4s, v16.4s\n"
+    "ldr q16, [x13, x8]\n"
+    "fmla v29.4s, v0.4s, v15.4s\n"
+    "ldr q0, [x7, #0x10]\n"
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "ldr q12, [x13, x9]\n"
+    "fmla v31.4s, v5.4s, v13.4s\n"
+    "fmla v29.4s, v4.4s, v11.4s\n"
+    "ldr q11, [x13, x28]\n"
+    "add x13, x13, #0x10\n"
+    "fmla v30.4s, v3.4s, v13.4s\n"
+    "ldr q13, [x12, x9]\n"
+    "ldr q9, [x13, x10]\n"
+    "fmla v31.4s, v6.4s, v15.4s\n"
+    "ld1 { v15.4s }, [x11]\n"
+    "fmla v29.4s, v1.4s, v16.4s\n"
+    "fmla v28.4s, v4.4s, v13.4s\n"
+    "ldr q13, [x11, x8]\n"
+    "fmla v30.4s, v7.4s, v12.4s\n"
+    "ldr q4, [x7, #0x50]\n"
+    "fmla v31.4s, v7.4s, v16.4s\n"
+    "ldr q16, [x12, x10]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v29.4s, v6.4s, v15.4s\n"
+    "ldr q15, [x11, x10]\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x17, x9]\n"
+    "fmla v30.4s, v8.4s, v11.4s\n"
+    "ldr q1, [x7, #0x20]\n"
+    "fmax v31.4s, v31.4s, v19.4s\n"
+    "fmla v29.4s, v7.4s, v13.4s\n"
+    "ldr q13, [x17, x28]\n"
+    "fmla v28.4s, v5.4s, v14.4s\n"
+    "ldr q14, [x11, x9]\n"
+    "fmax v30.4s, v30.4s, v19.4s\n"
+    "fmin v31.4s, v31.4s, v18.4s\n"
+    "st1 { v31.4s }, [x15]\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v5.4s, v16.4s\n"
+    "ldr q11, [x11, x28]\n"
+    "add x11, x11, #0x10\n"
+    "fmin v30.4s, v30.4s, v18.4s\n"
+    "ldr q2, [x7, #0x30]\n"
+    "ldr q5, [x7, #0x60]\n"
+    "fmla v28.4s, v3.4s, v16.4s\n"
+    "ldr q16, [x17, x10]\n"
+    "fmla v29.4s, v8.4s, v15.4s\n"
+    "str q30, [x15, x16]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v28.4s, v7.4s, v14.4s\n"
+    "ld1 { v14.4s }, [x14]\n"
+    "fmax v29.4s, v29.4s, v19.4s\n"
+    "ldr q3, [x7, #0x40]\n"
+    "ldr q7, [x7, #0x80]\n"
+    "fmin v29.4s, v29.4s, v18.4s\n"
+    "st1 { v29.4s }, [x27]\n"
+    "fmla v28.4s, v6.4s, v15.4s\n"
+    "ldr q15, [x14, x8]\n"
+    "fmla v28.4s, v8.4s, v11.4s\n"
+    "ldr q11, [x17, x8]\n"
+    "ldr q6, [x7, #0x70]\n"
+    "fmax v28.4s, v28.4s, v19.4s\n"
+    "ldr q8, [x7, #0x90]\n"
+    "add x7, x7, #0xa0\n"
+    "fmin v28.4s, v28.4s, v18.4s\n"
+    "str q28, [x27, x16]\n"
+    "add x27, x27, #0x10\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "add x17, x17, #0x10\n"
+    "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
+    "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+    "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "fmla v30.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x14, x28]\n"
+    "fmla v31.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x14, x9]\n"
+    "fmla v30.4s, v2.4s, v13.4s\n"
+    "ldr q13, [x14, x10]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v31.4s, v3.4s, v14.4s\n"
+    "ld1 { v14.4s }, [x12]\n"
+    "fmla v30.4s, v0.4s, v16.4s\n"
+    "fmla v31.4s, v4.4s, v15.4s\n"
+    "ld1 { v15.4s }, [x13]\n"
+    "fmla v30.4s, v4.4s, v11.4s\n"
+    "ldr q11, [x12, x8]\n"
+    "fmla v29.4s, v3.4s, v14.4s\n"
+    "ldr q14, [x12, x28]\n"
+    "fmla v31.4s, v2.4s, v16.4s\n"
+    "ldr q16, [x13, x8]\n"
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "ldr q12, [x13, x9]\n"
+    "fmla v29.4s, v0.4s, v15.4s\n"
+    "fmla v31.4s, v5.4s, v13.4s\n"
+    "fmla v30.4s, v3.4s, v13.4s\n"
+    "ldr q13, [x12, x9]\n"
+    "fmla v29.4s, v4.4s, v11.4s\n"
+    "ldr q11, [x13, x28]\n"
+    "add x13, x13, #0x10\n"
+    "fmla v31.4s, v6.4s, v15.4s\n"
+    "ld1 { v15.4s }, [x11]\n"
+    "fmla v30.4s, v7.4s, v12.4s\n"
+    "fmla v29.4s, v1.4s, v16.4s\n"
+    "fmla v28.4s, v4.4s, v13.4s\n"
+    "ldr q13, [x11, x8]\n"
+    "fmla v31.4s, v7.4s, v16.4s\n"
+    "ldr q16, [x12, x10]\n"
+    "add x12, x12, #0x10\n"
+    "fmla v29.4s, v6.4s, v15.4s\n"
+    "ldr q15, [x11, x10]\n"
+    "fmla v30.4s, v8.4s, v11.4s\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmax v31.4s, v31.4s, v19.4s\n"
+    "fmla v29.4s, v7.4s, v13.4s\n"
+    "fmax v30.4s, v30.4s, v19.4s\n"
+    "fmla v28.4s, v5.4s, v14.4s\n"
+    "ldr q14, [x11, x9]\n"
+    "fmin v31.4s, v31.4s, v18.4s\n"
+    "st1 { v31.4s }, [x15]\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v5.4s, v16.4s\n"
+    "ldr q11, [x11, x28]\n"
+    "add x11, x11, #0x10\n"
+    "fmin v30.4s, v30.4s, v18.4s\n"
+    "str q30, [x15, x16]\n"
+    "fmla v28.4s, v3.4s, v16.4s\n"
+    "add x15, x15, #0x10\n"
+    "fmla v29.4s, v8.4s, v15.4s\n"
+    "fmla v28.4s, v7.4s, v14.4s\n"
+    "fmax v29.4s, v29.4s, v19.4s\n"
+    "fmla v28.4s, v6.4s, v15.4s\n"
+    "fmin v29.4s, v29.4s, v18.4s\n"
+    "st1 { v29.4s }, [x27]\n"
+    "fmla v28.4s, v8.4s, v11.4s\n"
+    "fmax v28.4s, v28.4s, v19.4s\n"
+    "fmin v28.4s, v28.4s, v18.4s\n"
+    "str q28, [x27, x16]\n"
+    "add x27, x27, #0x10\n"
+    "4:"  // Tile loop: Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 43f\n"
+    "ldr q17, [x7, #0x0]\n"
+    "ldr q0, [x7, #0x10]\n"
+    "add x26, x13, x10\n"
+    "ldr q1, [x7, #0x20]\n"
+    "add x25, x17, XZR\n"
+    "ldr q2, [x7, #0x30]\n"
+    "add x24, x17, x8\n"
+    "ldr q3, [x7, #0x40]\n"
+    "add x23, x17, x9\n"
+    "ldr q4, [x7, #0x50]\n"
+    "add x22, x17, x28\n"
+    "ldr q5, [x7, #0x60]\n"
+    "add x21, x14, XZR\n"
+    "ldr q6, [x7, #0x70]\n"
+    "add x20, x14, x8\n"
+    "ldr q7, [x7, #0x80]\n"
+    "add x19, x17, x10\n"
+    "ldr q8, [x7, #0x90]\n"
+    "tbz %x[n_channels], #1, 5f\n"
+    "ldr d9, [x26], #0x8\n"
+    "ldr d10, [x25], #0x8\n"
+    "ldr d11, [x24], #0x8\n"
+    "ldr d12, [x23], #0x8\n"
+    "ldr d13, [x22], #0x8\n"
+    "ldr d14, [x21], #0x8\n"
+    "ldr d15, [x20], #0x8\n"
+    "ldr d16, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v9.s }[2], [x26]\n"
+    "ld1 { v10.s }[2], [x25]\n"
+    "ld1 { v11.s }[2], [x24]\n"
+    "ld1 { v12.s }[2], [x23]\n"
+    "ld1 { v13.s }[2], [x22]\n"
+    "ld1 { v14.s }[2], [x21]\n"
+    "ld1 { v15.s }[2], [x20]\n"
+    "ld1 { v16.s }[2], [x19]\n"
+    "b 6f\n"
+    "5:"  // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+    "ldr s9, [x26, #0x0]\n"
+    "ldr s10, [x25, #0x0]\n"
+    "ldr s11, [x24, #0x0]\n"
+    "ldr s12, [x23, #0x0]\n"
+    "ldr s13, [x22, #0x0]\n"
+    "ldr s14, [x21, #0x0]\n"
+    "ldr s15, [x20, #0x0]\n"
+    "ldr s16, [x19, #0x0]\n"
+    "6:"  // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+    "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "add x19, x14, x9\n"
+    "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
+    "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+    "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "fmla v30.4s, v1.4s, v12.4s\n"
+    "fmla v31.4s, v1.4s, v11.4s\n"
+    "fmla v30.4s, v2.4s, v13.4s\n"
+    "fmla v31.4s, v3.4s, v14.4s\n"
+    "fmla v30.4s, v0.4s, v16.4s\n"
+    "fmla v31.4s, v4.4s, v15.4s\n"
+    "fmla v31.4s, v2.4s, v16.4s\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 8f\n"
+    "7:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "8:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+    "fmla v30.4s, v4.4s, v11.4s\n"
+    "add x19, x14, x28\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 10f\n"
+    "9:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "10:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "add x19, x14, x10\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 12f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 12f\n"
+    "11:"  // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+    "ldr s13, [x19, #0x0]\n"
+    "12:"  // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+    "fmla v31.4s, v5.4s, v13.4s\n"
+    "add x19, x12, XZR\n"
+    "fmla v30.4s, v3.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr d14, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v14.s }[2], [x19]\n"
+    "b 14f\n"
+    "13:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+    "ldr s14, [x19, #0x0]\n"
+    "14:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+    "fmla v29.4s, v3.4s, v14.4s\n"
+    "add x19, x13, XZR\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ldr d15, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "ld1 { v15.s }[2], [x19]\n"
+    "b 16f\n"
+    "15:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+    "ldr s15, [x19, #0x0]\n"
+    "16:"  // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+    "fmla v31.4s, v6.4s, v15.4s\n"
+    "add x19, x12, x8\n"
+    "fmla v29.4s, v0.4s, v15.4s\n"
+    "tbz %x[n_channels], #1, 17f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 18f\n"
+    "17:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "18:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+    "fmla v29.4s, v4.4s, v11.4s\n"
+    "add x19, x13, x8\n"
+    "tbz %x[n_channels], #1, 19f\n"
+    "ldr d16, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v16.s }[2], [x19]\n"
+    "b 20f\n"
+    "19:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+    "ldr s16, [x19, #0x0]\n"
+    "20:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+    "fmla v31.4s, v7.4s, v16.4s\n"
+    "add x19, x12, x9\n"
+    "fmla v29.4s, v1.4s, v16.4s\n"
+    "tbz %x[n_channels], #1, 21f\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 22f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 22f\n"
+    "21:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+    "ldr s13, [x19, #0x0]\n"
+    "22:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+    "fmla v28.4s, v4.4s, v13.4s\n"
+    "add x19, x13, x9\n"
+    "tbz %x[n_channels], #1, 23f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 24f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 24f\n"
+    "23:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "24:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+    "fmla v30.4s, v7.4s, v12.4s\n"
+    "add x19, x12, x28\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 25f\n"
+    "ldr d14, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 26f\n"
+    "ld1 { v14.s }[2], [x19]\n"
+    "b 26f\n"
+    "25:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+    "ldr s14, [x19, #0x0]\n"
+    "26:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+    "fmla v28.4s, v5.4s, v14.4s\n"
+    "add x19, x11, XZR\n"
+    "tbz %x[n_channels], #1, 27f\n"
+    "ldr d15, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 28f\n"
+    "ld1 { v15.s }[2], [x19]\n"
+    "b 28f\n"
+    "27:"  // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+    "ldr s15, [x19, #0x0]\n"
+    "28:"  // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+    "fmla v29.4s, v6.4s, v15.4s\n"
+    "add x19, x13, x28\n"
+    "tbz %x[n_channels], #1, 29f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 30f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 30f\n"
+    "29:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "30:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+    "fmla v30.4s, v8.4s, v11.4s\n"
+    "add x19, x11, x8\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 31f\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 32f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 32f\n"
+    "31:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+    "ldr s13, [x19, #0x0]\n"
+    "32:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+    "fmla v29.4s, v7.4s, v13.4s\n"
+    "add x19, x12, x10\n"
+    "tbz %x[n_channels], #1, 33f\n"
+    "ldr d16, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 34f\n"
+    "ld1 { v16.s }[2], [x19]\n"
+    "b 34f\n"
+    "33:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+    "ldr s16, [x19, #0x0]\n"
+    "34:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+    "fmla v29.4s, v5.4s, v16.4s\n"
+    "add x19, x11, x9\n"
+    "fmla v28.4s, v3.4s, v16.4s\n"
+    "tbz %x[n_channels], #1, 35f\n"
+    "ldr d14, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 36f\n"
+    "ld1 { v14.s }[2], [x19]\n"
+    "b 36f\n"
+    "35:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+    "ldr s14, [x19, #0x0]\n"
+    "36:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+    "fmla v28.4s, v7.4s, v14.4s\n"
+    "add x19, x11, x10\n"
+    "tbz %x[n_channels], #1, 37f\n"
+    "ldr d15, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 38f\n"
+    "ld1 { v15.s }[2], [x19]\n"
+    "b 38f\n"
+    "37:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+    "ldr s15, [x19, #0x0]\n"
+    "38:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+    "fmla v29.4s, v8.4s, v15.4s\n"
+    "add x19, x11, x28\n"
+    "fmla v28.4s, v6.4s, v15.4s\n"
+    "tbz %x[n_channels], #1, 39f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 40f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 40f\n"
+    "39:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "40:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+    "fmla v28.4s, v8.4s, v11.4s\n"
+    "fmax v31.4s, v31.4s, v19.4s\n"
+    "fmax v30.4s, v30.4s, v19.4s\n"
+    "fmax v29.4s, v29.4s, v19.4s\n"
+    "fmin v31.4s, v31.4s, v18.4s\n"
+    "fmin v30.4s, v30.4s, v18.4s\n"
+    "fmin v29.4s, v29.4s, v18.4s\n"
+    "fmax v28.4s, v28.4s, v19.4s\n"
+    "fmin v28.4s, v28.4s, v18.4s\n"
+    "tbz %x[n_channels], #1, 41f\n"
+    "mov x19, x15\n"
+    "st1 { v31.d }[0], [x19], x16\n"
+    "add x15, x15, #0x8\n"
+    "st1 { v30.d }[0], [x19]\n"
+    "mov x19, x27\n"
+    "st1 { v29.d }[0], [x19], x16\n"
+    "add x27, x27, #0x8\n"
+    "st1 { v28.d }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 42f\n"
+    "mov x20, x15\n"
+    "st1 { v31.s }[2], [x20], x16\n"
+    "mov x19, x27\n"
+    "st1 { v30.s }[2], [x20]\n"
+    "st1 { v29.s }[2], [x19], x16\n"
+    "st1 { v28.s }[2], [x19]\n"
+    "b 42f\n"
+    "41:"  // Tile loop: Oddments: Store: Bit 1: Unset
+    "mov x20, x15\n"
+    "st1 { v31.s }[0], [x20], x16\n"
+    "mov x19, x27\n"
+    "st1 { v30.s }[0], [x20]\n"
+    "st1 { v29.s }[0], [x19], x16\n"
+    "st1 { v28.s }[0], [x19]\n"
+    "42:"  // Tile loop: Oddments: Store: Bit 1: End
+
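+    // Advance to the next output tile: increment tile_j, wrapping to the
+    // start of the next row of tiles (tile_i + 1) at the end of each row.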
+    "43:"  // Tile loop: End
+    "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x6, #0x1\n"
+    "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x27, x27, #0x1\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x27, x19\n"
+    "csel x27, x27, XZR, LT\n"
+    "csel x6, x6, x21, LT\n"
+    "cmp x6, x20\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..a515301
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[25];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
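+      // Remap the input pointers into the order in which the kernel consumes
+      // them: the centre point (2, 2) of the 5x5 input patch is loaded first,
+      // followed by the remaining taps.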
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[1];
+      inptrs[3] = input_ptrs[3];
+      inptrs[4] = input_ptrs[4];
+      inptrs[5] = input_ptrs[5];
+      inptrs[6] = input_ptrs[6];
+      inptrs[7] = input_ptrs[2];
+      inptrs[8] = input_ptrs[8];
+      inptrs[9] = input_ptrs[9];
+      inptrs[10] = input_ptrs[7];
+      inptrs[11] = input_ptrs[15];
+      inptrs[12] = input_ptrs[10];
+      inptrs[13] = input_ptrs[16];
+      inptrs[14] = input_ptrs[11];
+      inptrs[15] = input_ptrs[18];
+      inptrs[16] = input_ptrs[13];
+      inptrs[17] = input_ptrs[19];
+      inptrs[18] = input_ptrs[20];
+      inptrs[19] = input_ptrs[14];
+      inptrs[20] = input_ptrs[21];
+      inptrs[21] = input_ptrs[17];
+      inptrs[22] = input_ptrs[23];
+      inptrs[23] = input_ptrs[22];
+      inptrs[24] = input_ptrs[24];
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "add x19, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v19.4s }, [x20]\n"
+    "ld1r { v18.4s }, [x19]\n"
+    "mov x14, #0x0\n"
+    "ldp x13, x12, [x21, #0x0]\n"
+    "mov x11, #0x10\n" // cntb _, ALL, #1
+    "ldp x10, x9, [x21, #0x10]\n"
+    "sub x28, XZR, x11\n"
+    "lsr x27, %x[n_channels], #0x2\n"
+    "cbz x27, 3f\n"
+    "ldr q17, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "cmp x11, x27, LSL #4\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    "ldp x22, x21, [x16, #0x20]\n"
+    "ldr q9, [x26, x14]\n"
+    "ldr q10, [x25, x14]\n"
+    "ldr q11, [x24, x14]\n"
+    "ldr q12, [x23, x14]\n"
+    "ldr q13, [x22, x14]\n"
+    "ldr q14, [x21, x14]\n"
+    "ldp x20, x19, [x16, #0x30]\n"
+    "ldr q15, [x20, x14]\n"
+    "ldr q16, [x19, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "ldr x26, [x16, #0x40]\n"
+    "add x28, x28, #0x10\n"
+    "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
+    "ldr x25, [x16, #0x48]\n"
+    "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+    "ldr x24, [x16, #0x50]\n"
+    "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+    "ldr x23, [x16, #0x58]\n"
+    "ldr x22, [x16, #0x60]\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v30.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x25, x14]\n"
+    "fmla v31.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x26, x14]\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v30.4s, v2.4s, v13.4s\n"
+    "ldr q13, [x24, x14]\n"
+    "fmla v31.4s, v3.4s, v14.4s\n"
+    "ldr q14, [x23, x14]\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v30.4s, v0.4s, v16.4s\n"
+    "ldr x26, [x16, #0x80]\n"
+    "fmla v31.4s, v4.4s, v15.4s\n"
+    "ldr q15, [x22, x14]\n"
+    "fmla v29.4s, v3.4s, v14.4s\n"
+    "ldr x25, [x16, #0x88]\n"
+    "fmla v30.4s, v4.4s, v11.4s\n"
+    "ldr q11, [x21, x14]\n"
+    "ldr x24, [x16, #0x90]\n"
+    "fmla v31.4s, v2.4s, v16.4s\n"
+    "ldr q16, [x20, x14]\n"
+    "fmla v29.4s, v0.4s, v15.4s\n"
+    "ldr q14, [x25, x14]\n"
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "ldr q12, [x26, x14]\n"
+    "ldr x23, [x16, #0x98]\n"
+    "fmla v31.4s, v5.4s, v13.4s\n"
+    "ldr x22, [x16, #0xa0]\n"
+    "fmla v29.4s, v4.4s, v11.4s\n"
+    "ldr q11, [x23, x14]\n"
+    "fmla v30.4s, v3.4s, v13.4s\n"
+    "ldr q13, [x19, x14]\n"
+    "ldr x21, [x16, #0xa8]\n"
+    "fmla v31.4s, v6.4s, v15.4s\n"
+    "ldr q15, [x24, x14]\n"
+    "fmla v29.4s, v1.4s, v16.4s\n"
+    "ldr x20, [x16, #0xb0]\n"
+    "fmla v30.4s, v7.4s, v12.4s\n"
+    "ldr x19, [x16, #0xb8]\n"
+    "fmla v28.4s, v4.4s, v13.4s\n"
+    "ldr q13, [x22, x14]\n"
+    "ldr x26, [x16, #0xc0]\n"
+    "fmla v31.4s, v7.4s, v16.4s\n"
+    "fmla v29.4s, v6.4s, v15.4s\n"
+    "ldr q16, [x21, x14]\n"
+    "fmla v30.4s, v8.4s, v11.4s\n"
+    "ldr q15, [x19, x14]\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "ldr q17, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "fmla v29.4s, v7.4s, v13.4s\n"
+    "fmax v31.4s, v31.4s, v19.4s\n"
+    "ldr q1, [x15, #0x20]\n"
+    "fmax v30.4s, v30.4s, v19.4s\n"
+    "ldr q4, [x15, #0x50]\n"
+    "fmla v28.4s, v5.4s, v14.4s\n"
+    "ldr q14, [x20, x14]\n"
+    "fmin v31.4s, v31.4s, v18.4s\n"
+    "str q31, [x13, x28]\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v5.4s, v16.4s\n"
+    "ldr q11, [x26, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmin v30.4s, v30.4s, v18.4s\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    "fmla v28.4s, v3.4s, v16.4s\n"
+    "ldp x22, x21, [x16, #0x20]\n"
+    "fmla v29.4s, v8.4s, v15.4s\n"
+    "ldr q9, [x26, x11]\n"
+    "ldr q10, [x25, x11]\n"
+    "fmla v28.4s, v7.4s, v14.4s\n"
+    "ldr q12, [x23, x11]\n"
+    "fmax v29.4s, v29.4s, v19.4s\n"
+    "ldr q13, [x22, x11]\n"
+    "ldr q14, [x21, x11]\n"
+    "fmin v29.4s, v29.4s, v18.4s\n"
+    "ldp x20, x19, [x16, #0x30]\n"
+    "str q30, [x12, x28]\n"
+    "fmla v28.4s, v6.4s, v15.4s\n"
+    "ldr q2, [x15, #0x30]\n"
+    "fmla v28.4s, v8.4s, v11.4s\n"
+    "ldr q11, [x24, x11]\n"
+    "ldr q15, [x20, x11]\n"
+    "fmax v28.4s, v28.4s, v19.4s\n"
+    "ldr q16, [x19, x11]\n"
+    "add x11, x11, #0x10\n"
+    "fmin v28.4s, v28.4s, v18.4s\n"
+    "str q29, [x10, x28]\n"
+    "cmp x11, x27, LSL #4\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "str q28, [x9, x28]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "ldr x26, [x16, #0x40]\n"
+    "add x28, x28, #0x10\n"
+    "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
+    "ldr x25, [x16, #0x48]\n"
+    "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+    "ldr x24, [x16, #0x50]\n"
+    "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+    "ldr x23, [x16, #0x58]\n"
+    "ldr x22, [x16, #0x60]\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v30.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x25, x14]\n"
+    "fmla v31.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x26, x14]\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v30.4s, v2.4s, v13.4s\n"
+    "ldr q13, [x24, x14]\n"
+    "fmla v31.4s, v3.4s, v14.4s\n"
+    "ldr q14, [x23, x14]\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v30.4s, v0.4s, v16.4s\n"
+    "ldr x26, [x16, #0x80]\n"
+    "fmla v31.4s, v4.4s, v15.4s\n"
+    "ldr q15, [x22, x14]\n"
+    "fmla v29.4s, v3.4s, v14.4s\n"
+    "ldr x25, [x16, #0x88]\n"
+    "fmla v30.4s, v4.4s, v11.4s\n"
+    "ldr q11, [x21, x14]\n"
+    "ldr x24, [x16, #0x90]\n"
+    "fmla v31.4s, v2.4s, v16.4s\n"
+    "ldr q16, [x20, x14]\n"
+    "fmla v29.4s, v0.4s, v15.4s\n"
+    "ldr q14, [x25, x14]\n"
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "ldr q12, [x26, x14]\n"
+    "ldr x23, [x16, #0x98]\n"
+    "fmla v31.4s, v5.4s, v13.4s\n"
+    "ldr x22, [x16, #0xa0]\n"
+    "fmla v29.4s, v4.4s, v11.4s\n"
+    "ldr q11, [x23, x14]\n"
+    "fmla v30.4s, v3.4s, v13.4s\n"
+    "ldr q13, [x19, x14]\n"
+    "ldr x21, [x16, #0xa8]\n"
+    "fmla v31.4s, v6.4s, v15.4s\n"
+    "ldr q15, [x24, x14]\n"
+    "fmla v29.4s, v1.4s, v16.4s\n"
+    "ldr x20, [x16, #0xb0]\n"
+    "fmla v30.4s, v7.4s, v12.4s\n"
+    "ldr x19, [x16, #0xb8]\n"
+    "fmla v28.4s, v4.4s, v13.4s\n"
+    "ldr q13, [x22, x14]\n"
+    "ldr x26, [x16, #0xc0]\n"
+    "fmla v31.4s, v7.4s, v16.4s\n"
+    "fmla v29.4s, v6.4s, v15.4s\n"
+    "ldr q16, [x21, x14]\n"
+    "fmla v30.4s, v8.4s, v11.4s\n"
+    "ldr q15, [x19, x14]\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v29.4s, v7.4s, v13.4s\n"
+    "fmax v31.4s, v31.4s, v19.4s\n"
+    "fmax v30.4s, v30.4s, v19.4s\n"
+    "fmla v28.4s, v5.4s, v14.4s\n"
+    "ldr q14, [x20, x14]\n"
+    "fmin v31.4s, v31.4s, v18.4s\n"
+    "str q31, [x13, x28]\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v5.4s, v16.4s\n"
+    "ldr q11, [x26, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmin v30.4s, v30.4s, v18.4s\n"
+    "str q30, [x12, x28]\n"
+    "fmla v28.4s, v3.4s, v16.4s\n"
+    "fmla v29.4s, v8.4s, v15.4s\n"
+    "fmla v28.4s, v7.4s, v14.4s\n"
+    "fmax v29.4s, v29.4s, v19.4s\n"
+    "fmin v29.4s, v29.4s, v18.4s\n"
+    "str q29, [x10, x28]\n"
+    "fmla v28.4s, v6.4s, v15.4s\n"
+    "fmla v28.4s, v8.4s, v11.4s\n"
+    "fmax v28.4s, v28.4s, v19.4s\n"
+    "fmin v28.4s, v28.4s, v18.4s\n"
+    "str q28, [x9, x28]\n"
+    "3:"  // Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 42f\n"
+    "ldr q17, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "mov x28, x14\n"
+    "ldr q1, [x15, #0x20]\n"
+    "add x13, x13, x28\n"
+    "ldr q2, [x15, #0x30]\n"
+    "add x12, x12, x28\n"
+    "ldr q3, [x15, #0x40]\n"
+    "add x10, x10, x28\n"
+    "ldr q4, [x15, #0x50]\n"
+    "add x9, x9, x28\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "ldr x26, [x16, #0x0]\n"
+    "ldr x25, [x16, #0x8]\n"
+    "ldr x24, [x16, #0x10]\n"
+    "add x26, x26, x14\n"
+    "ldr x23, [x16, #0x18]\n"
+    "add x25, x25, x14\n"
+    "ldr x22, [x16, #0x20]\n"
+    "add x24, x24, x14\n"
+    "ldr x21, [x16, #0x28]\n"
+    "add x23, x23, x14\n"
+    "ldr x20, [x16, #0x30]\n"
+    "add x22, x22, x14\n"
+    "ldr x19, [x16, #0x38]\n"
+    "add x21, x21, x14\n"
+    "add x20, x20, x14\n"
+    "add x19, x19, x14\n"
+    "tbz %x[n_channels], #1, 4f\n"
+    "ld1 { v9.d }[0], [x26], #0x8\n"
+    "ld1 { v10.d }[0], [x25], #0x8\n"
+    "ld1 { v11.d }[0], [x24], #0x8\n"
+    "ld1 { v12.d }[0], [x23], #0x8\n"
+    "ld1 { v13.d }[0], [x22], #0x8\n"
+    "ld1 { v14.d }[0], [x21], #0x8\n"
+    "ld1 { v15.d }[0], [x20], #0x8\n"
+    "ld1 { v16.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 5f\n"
+    "ld1 { v9.s }[2], [x26], #0x4\n"
+    "ld1 { v10.s }[2], [x25], #0x4\n"
+    "ld1 { v11.s }[2], [x24], #0x4\n"
+    "ld1 { v12.s }[2], [x23], #0x4\n"
+    "ld1 { v13.s }[2], [x22], #0x4\n"
+    "ld1 { v14.s }[2], [x21], #0x4\n"
+    "ld1 { v15.s }[2], [x20], #0x4\n"
+    "ld1 { v16.s }[2], [x19], #0x4\n"
+    "b 5f\n"
+    "4:"  // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+    "ld1 { v9.s }[0], [x26], #0x4\n"
+    "ld1 { v10.s }[0], [x25], #0x4\n"
+    "ld1 { v11.s }[0], [x24], #0x4\n"
+    "ld1 { v12.s }[0], [x23], #0x4\n"
+    "ld1 { v13.s }[0], [x22], #0x4\n"
+    "ld1 { v14.s }[0], [x21], #0x4\n"
+    "ld1 { v15.s }[0], [x20], #0x4\n"
+    "ld1 { v16.s }[0], [x19], #0x4\n"
+    "5:"  // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+    "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "ldr x26, [x16, #0x40]\n"
+    "add x26, x26, x14\n"
+    "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n"
+    "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+    "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+    "fmla v31.4s, v0.4s, v10.4s\n"
+    "fmla v30.4s, v1.4s, v12.4s\n"
+    "fmla v31.4s, v1.4s, v11.4s\n"
+    "fmla v30.4s, v2.4s, v13.4s\n"
+    "fmla v31.4s, v3.4s, v14.4s\n"
+    "fmla v30.4s, v0.4s, v16.4s\n"
+    "fmla v31.4s, v4.4s, v15.4s\n"
+    "fmla v31.4s, v2.4s, v16.4s\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v11.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v11.s }[2], [x26], #0x4\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load input (1, 3): Bit 1: Unset
+    "ld1 { v11.s }[0], [x26], #0x4\n"
+    "7:"  // Oddments: Load input (1, 3): Bit 1: End
+    "fmla v30.4s, v4.4s, v11.4s\n"
+    "ldr x25, [x16, #0x48]\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v12.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v12.s }[2], [x25], #0x4\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load input (1, 4): Bit 1: Unset
+    "ld1 { v12.s }[0], [x25], #0x4\n"
+    "9:"  // Oddments: Load input (1, 4): Bit 1: End
+    "fmla v30.4s, v5.4s, v12.4s\n"
+    "ldr x24, [x16, #0x50]\n"
+    "add x24, x24, x14\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v13.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v13.s }[2], [x24], #0x4\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load input (1, 2): Bit 1: Unset
+    "ld1 { v13.s }[0], [x24], #0x4\n"
+    "11:"  // Oddments: Load input (1, 2): Bit 1: End
+    "fmla v31.4s, v5.4s, v13.4s\n"
+    "ldr x23, [x16, #0x58]\n"
+    "fmla v30.4s, v3.4s, v13.4s\n"
+    "add x23, x23, x14\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ld1 { v14.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v14.s }[2], [x23], #0x4\n"
+    "b 13f\n"
+    "12:"  // Oddments: Load input (3, 0): Bit 1: Unset
+    "ld1 { v14.s }[0], [x23], #0x4\n"
+    "13:"  // Oddments: Load input (3, 0): Bit 1: End
+    "fmla v29.4s, v3.4s, v14.4s\n"
+    "ldr x22, [x16, #0x60]\n"
+    "add x22, x22, x14\n"
+    "tbz %x[n_channels], #1, 14f\n"
+    "ld1 { v15.d }[0], [x22], #0x8\n"
+    "tbz %x[n_channels], #0, 15f\n"
+    "ld1 { v15.s }[2], [x22], #0x4\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load input (2, 0): Bit 1: Unset
+    "ld1 { v15.s }[0], [x22], #0x4\n"
+    "15:"  // Oddments: Load input (2, 0): Bit 1: End
+    "fmla v31.4s, v6.4s, v15.4s\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v29.4s, v0.4s, v15.4s\n"
+    "add x21, x21, x14\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v11.d }[0], [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 17f\n"
+    "ld1 { v11.s }[2], [x21], #0x4\n"
+    "b 17f\n"
+    "16:"  // Oddments: Load input (3, 1): Bit 1: Unset
+    "ld1 { v11.s }[0], [x21], #0x4\n"
+    "17:"  // Oddments: Load input (3, 1): Bit 1: End
+    "fmla v29.4s, v4.4s, v11.4s\n"
+    "ldr x20, [x16, #0x70]\n"
+    "add x20, x20, x14\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v16.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v16.s }[2], [x20], #0x4\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load input (2, 1): Bit 1: Unset
+    "ld1 { v16.s }[0], [x20], #0x4\n"
+    "19:"  // Oddments: Load input (2, 1): Bit 1: End
+    "fmla v31.4s, v7.4s, v16.4s\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v29.4s, v1.4s, v16.4s\n"
+    "add x19, x19, x14\n"
+    "tbz %x[n_channels], #1, 20f\n"
+    "ld1 { v13.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 21f\n"
+    "ld1 { v13.s }[2], [x19], #0x4\n"
+    "b 21f\n"
+    "20:"  // Oddments: Load input (3, 3): Bit 1: Unset
+    "ld1 { v13.s }[0], [x19], #0x4\n"
+    "21:"  // Oddments: Load input (3, 3): Bit 1: End
+    "fmla v28.4s, v4.4s, v13.4s\n"
+    "ldr x26, [x16, #0x80]\n"
+    "add x26, x26, x14\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "ld1 { v12.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "ld1 { v12.s }[2], [x26], #0x4\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load input (2, 3): Bit 1: Unset
+    "ld1 { v12.s }[0], [x26], #0x4\n"
+    "23:"  // Oddments: Load input (2, 3): Bit 1: End
+    "fmla v30.4s, v7.4s, v12.4s\n"
+    "ldr x25, [x16, #0x88]\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 24f\n"
+    "ld1 { v14.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 25f\n"
+    "ld1 { v14.s }[2], [x25], #0x4\n"
+    "b 25f\n"
+    "24:"  // Oddments: Load input (3, 4): Bit 1: Unset
+    "ld1 { v14.s }[0], [x25], #0x4\n"
+    "25:"  // Oddments: Load input (3, 4): Bit 1: End
+    "fmla v28.4s, v5.4s, v14.4s\n"
+    "ldr x24, [x16, #0x90]\n"
+    "add x24, x24, x14\n"
+    "tbz %x[n_channels], #1, 26f\n"
+    "ld1 { v15.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 27f\n"
+    "ld1 { v15.s }[2], [x24], #0x4\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load input (4, 0): Bit 1: Unset
+    "ld1 { v15.s }[0], [x24], #0x4\n"
+    "27:"  // Oddments: Load input (4, 0): Bit 1: End
+    "fmla v29.4s, v6.4s, v15.4s\n"
+    "ldr x23, [x16, #0x98]\n"
+    "add x23, x23, x14\n"
+    "tbz %x[n_channels], #1, 28f\n"
+    "ld1 { v11.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 29f\n"
+    "ld1 { v11.s }[2], [x23], #0x4\n"
+    "b 29f\n"
+    "28:"  // Oddments: Load input (2, 4): Bit 1: Unset
+    "ld1 { v11.s }[0], [x23], #0x4\n"
+    "29:"  // Oddments: Load input (2, 4): Bit 1: End
+    "fmla v30.4s, v8.4s, v11.4s\n"
+    "ldr x22, [x16, #0xa0]\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "add x22, x22, x14\n"
+    "tbz %x[n_channels], #1, 30f\n"
+    "ld1 { v13.d }[0], [x22], #0x8\n"
+    "tbz %x[n_channels], #0, 31f\n"
+    "ld1 { v13.s }[2], [x22], #0x4\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load input (4, 1): Bit 1: Unset
+    "ld1 { v13.s }[0], [x22], #0x4\n"
+    "31:"  // Oddments: Load input (4, 1): Bit 1: End
+    "fmla v29.4s, v7.4s, v13.4s\n"
+    "ldr x21, [x16, #0xa8]\n"
+    "add x21, x21, x14\n"
+    "tbz %x[n_channels], #1, 32f\n"
+    "ld1 { v16.d }[0], [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 33f\n"
+    "ld1 { v16.s }[2], [x21], #0x4\n"
+    "b 33f\n"
+    "32:"  // Oddments: Load input (3, 2): Bit 1: Unset
+    "ld1 { v16.s }[0], [x21], #0x4\n"
+    "33:"  // Oddments: Load input (3, 2): Bit 1: End
+    "fmla v29.4s, v5.4s, v16.4s\n"
+    "ldr x20, [x16, #0xb0]\n"
+    "fmla v28.4s, v3.4s, v16.4s\n"
+    "add x20, x20, x14\n"
+    "tbz %x[n_channels], #1, 34f\n"
+    "ld1 { v14.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 35f\n"
+    "ld1 { v14.s }[2], [x20], #0x4\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load input (4, 3): Bit 1: Unset
+    "ld1 { v14.s }[0], [x20], #0x4\n"
+    "35:"  // Oddments: Load input (4, 3): Bit 1: End
+    "fmla v28.4s, v7.4s, v14.4s\n"
+    "ldr x19, [x16, #0xb8]\n"
+    "add x19, x19, x14\n"
+    "tbz %x[n_channels], #1, 36f\n"
+    "ld1 { v15.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 37f\n"
+    "ld1 { v15.s }[2], [x19], #0x4\n"
+    "b 37f\n"
+    "36:"  // Oddments: Load input (4, 2): Bit 1: Unset
+    "ld1 { v15.s }[0], [x19], #0x4\n"
+    "37:"  // Oddments: Load input (4, 2): Bit 1: End
+    "fmla v29.4s, v8.4s, v15.4s\n"
+    "ldr x26, [x16, #0xc0]\n"
+    "fmla v28.4s, v6.4s, v15.4s\n"
+    "add x26, x26, x14\n"
+    "tbz %x[n_channels], #1, 38f\n"
+    "ld1 { v11.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 39f\n"
+    "ld1 { v11.s }[2], [x26], #0x4\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load input (4, 4): Bit 1: Unset
+    "ld1 { v11.s }[0], [x26], #0x4\n"
+    "39:"  // Oddments: Load input (4, 4): Bit 1: End
+    "fmla v28.4s, v8.4s, v11.4s\n"
+    "fmax v31.4s, v31.4s, v19.4s\n"
+    "fmax v30.4s, v30.4s, v19.4s\n"
+    "fmax v29.4s, v29.4s, v19.4s\n"
+    "fmin v31.4s, v31.4s, v18.4s\n"
+    "fmin v30.4s, v30.4s, v18.4s\n"
+    "fmin v29.4s, v29.4s, v18.4s\n"
+    "fmax v28.4s, v28.4s, v19.4s\n"
+    "fmin v28.4s, v28.4s, v18.4s\n"
+    "tbz %x[n_channels], #1, 40f\n"
+    "st1 { v31.d }[0], [x13], #0x8\n"
+    "st1 { v30.d }[0], [x12], #0x8\n"
+    "st1 { v29.d }[0], [x10], #0x8\n"
+    "st1 { v28.d }[0], [x9], #0x8\n"
+    "tbz %x[n_channels], #0, 41f\n"
+    "st1 { v31.s }[2], [x13], #0x4\n"
+    "st1 { v30.s }[2], [x12], #0x4\n"
+    "st1 { v29.s }[2], [x10], #0x4\n"
+    "st1 { v28.s }[2], [x9], #0x4\n"
+    "b 41f\n"
+    "40:"  // Oddments: Store: Bit 1: Unset
+    "st1 { v31.s }[0], [x13], #0x4\n"
+    "st1 { v30.s }[0], [x12], #0x4\n"
+    "st1 { v29.s }[0], [x10], #0x4\n"
+    "st1 { v28.s }[0], [x9], #0x4\n"
+    "41:"  // Oddments: Store: Bit 1: End
+
+    "42:"  // End
+
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..314fe76
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
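+  // Input tile required per output tile: (output - 1) * stride + kernel = 6.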
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  indirect_kern_type indirect_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+  a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..170eb22
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,969 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
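+    // Updated by the kernel itself as it walks the grid of output tiles.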
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
+    "mov x28, #0x0\n"
+    "mov x27, #0x0\n"
+    "1:"  // Tile loop
+    "str x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x26, #0x2\n"
+    "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x25, #0x2\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x24, %x[params_struct], %[offsetof_args_min]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "add x21, %x[params_struct], %[offsetof_args_max]\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mov x22, #0x0\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x28, x23\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x27, x4, x19\n" // offset += tile_j * ld_input_col
+    "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+    "ldr x7, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x5, x5, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1r { v18.4s }, [x24]\n"
+    "add x8, x5, x23, LSL #2\n"
+    "ld1r { v17.4s }, [x21]\n"
+    "add x17, x8, x23, LSL #2\n"
+    "lsl x4, x4, #0x2\n"
+    "add x16, x17, x23, LSL #2\n"
+    "add x15, x16, x23, LSL #2\n"
+    "add x14, x15, x23, LSL #2\n"
+    "add x13, x4, x4\n"
+    "add x12, x13, x4\n"
+    "add x11, x12, x4\n"
+    "add x10, x11, x4\n"
+    "mul x19, x28, x20\n" // offset = tile_i * ld_output_row
+    "madd x19, x27, x6, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x25\n" // offset *= output_tile_size
+    "add x7, x7, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "add x9, x7, x20, LSL #2\n"
+    "lsl x6, x6, #0x2\n"
+    "mov x21, #0x10\n" // cntb _, ALL, #1
+    "sub x20, XZR, x21\n"
+    "lsr x19, %x[n_channels], #0x2\n"
+    "cbz x19, 4f\n"
+    "ldr q16, [x3, #0x0]\n"
+    "ldr q0, [x3, #0x10]\n"
+    "cmp x21, x19, LSL #4\n"
+    "ldr q1, [x3, #0x20]\n"
+    "ldr q2, [x3, #0x30]\n"
+    "ldr q3, [x3, #0x40]\n"
+    "ldr q4, [x3, #0x50]\n"
+    "add x3, x3, #0x60\n"
+    "ld1 { v5.4s }, [x5]\n"
+    "ldr q6, [x5, x4]\n"
+    "ld1 { v7.4s }, [x8]\n"
+    "ldr q8, [x8, x4]\n"
+    "ldr q9, [x5, x13]\n"
+    "ldr q13, [x8, x13]\n"
+    "ldr q11, [x5, x12]\n"
+    "ldr q12, [x5, x11]\n"
+    "ldr q10, [x8, x10]\n"
+    "ld1 { v14.4s }, [x17]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+    "ldr q5, [x8, x12]\n"
+    "add x20, x20, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n"
+    "add x22, x22, #0x10\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+    "add x21, x21, #0x10\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+    "ldr q0, [x3, #0x0]\n"
+    "cmp x21, x19, LSL #4\n"
+    "fmla v31.4s, v1.4s, v6.4s\n"
+    "ldr q6, [x8, x11]\n"
+    "add x8, x8, #0x10\n"
+    "fmla v30.4s, v1.4s, v9.4s\n"
+    "ldr q16, [x3, #0x140]\n"
+    "fmla v29.4s, v1.4s, v8.4s\n"
+    "fmla v28.4s, v1.4s, v13.4s\n"
+    "ldr q1, [x3, #0x10]\n"
+    "fmla v31.4s, v2.4s, v9.4s\n"
+    "ldr q9, [x5, x10]\n"
+    "add x5, x5, #0x10\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v13.4s\n"
+    "fmla v28.4s, v2.4s, v5.4s\n"
+    "ldr q2, [x3, #0x20]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x17, x4]\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v29.4s, v3.4s, v5.4s\n"
+    "fmla v28.4s, v3.4s, v6.4s\n"
+    "ldr q3, [x3, #0x30]\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "ldr q12, [x17, x13]\n"
+    "fmla v30.4s, v4.4s, v9.4s\n"
+    "ldr q9, [x17, x12]\n"
+    "fmla v29.4s, v4.4s, v6.4s\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "ldr q4, [x3, #0x40]\n"
+    "fmla v31.4s, v0.4s, v7.4s\n"
+    "ld1 { v7.4s }, [x8]\n"
+    "fmla v30.4s, v0.4s, v8.4s\n"
+    "fmla v29.4s, v0.4s, v14.4s\n"
+    "fmla v28.4s, v0.4s, v11.4s\n"
+    "ldr q0, [x3, #0x50]\n"
+    "fmla v31.4s, v1.4s, v8.4s\n"
+    "ldr q8, [x17, x10]\n"
+    "fmla v30.4s, v1.4s, v13.4s\n"
+    "fmla v29.4s, v1.4s, v11.4s\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "ldr q1, [x3, #0x60]\n"
+    "fmla v31.4s, v2.4s, v13.4s\n"
+    "ldr q13, [x17, x11]\n"
+    "add x17, x17, #0x10\n"
+    "fmla v30.4s, v2.4s, v5.4s\n"
+    "fmla v29.4s, v2.4s, v12.4s\n"
+    "fmla v28.4s, v2.4s, v9.4s\n"
+    "ldr q2, [x3, #0x70]\n"
+    "fmla v31.4s, v3.4s, v5.4s\n"
+    "ld1 { v5.4s }, [x16]\n"
+    "fmla v30.4s, v3.4s, v6.4s\n"
+    "fmla v29.4s, v3.4s, v9.4s\n"
+    "fmla v28.4s, v3.4s, v13.4s\n"
+    "ldr q3, [x3, #0x80]\n"
+    "fmla v31.4s, v4.4s, v6.4s\n"
+    "ldr q6, [x16, x4]\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "ldr q10, [x16, x13]\n"
+    "fmla v29.4s, v4.4s, v13.4s\n"
+    "fmla v28.4s, v4.4s, v8.4s\n"
+    "ldr q4, [x3, #0x90]\n"
+    "fmla v31.4s, v0.4s, v14.4s\n"
+    "ldr q14, [x16, x10]\n"
+    "fmla v30.4s, v0.4s, v11.4s\n"
+    "fmla v29.4s, v0.4s, v5.4s\n"
+    "fmla v28.4s, v0.4s, v6.4s\n"
+    "ldr q0, [x3, #0xa0]\n"
+    "fmla v31.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x16, x12]\n"
+    "fmla v30.4s, v1.4s, v12.4s\n"
+    "fmla v29.4s, v1.4s, v6.4s\n"
+    "fmla v28.4s, v1.4s, v10.4s\n"
+    "ldr q1, [x3, #0xb0]\n"
+    "fmla v31.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x16, x11]\n"
+    "add x16, x16, #0x10\n"
+    "fmla v30.4s, v2.4s, v9.4s\n"
+    "fmla v29.4s, v2.4s, v10.4s\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr q2, [x3, #0xc0]\n"
+    "fmla v31.4s, v3.4s, v9.4s\n"
+    "ld1 { v9.4s }, [x15]\n"
+    "fmla v30.4s, v3.4s, v13.4s\n"
+    "fmla v29.4s, v3.4s, v11.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "ldr q3, [x3, #0xd0]\n"
+    "fmla v31.4s, v4.4s, v13.4s\n"
+    "ldr q13, [x15, x4]\n"
+    "fmla v30.4s, v4.4s, v8.4s\n"
+    "ldr q8, [x15, x11]\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "fmla v28.4s, v4.4s, v14.4s\n"
+    "ldr q4, [x3, #0xe0]\n"
+    "fmla v31.4s, v0.4s, v5.4s\n"
+    "ldr q5, [x15, x13]\n"
+    "fmla v30.4s, v0.4s, v6.4s\n"
+    "fmla v29.4s, v0.4s, v9.4s\n"
+    "fmla v28.4s, v0.4s, v13.4s\n"
+    "ldr q0, [x3, #0xf0]\n"
+    "fmla v31.4s, v1.4s, v6.4s\n"
+    "ldr q6, [x15, x12]\n"
+    "fmla v30.4s, v1.4s, v10.4s\n"
+    "fmla v29.4s, v1.4s, v13.4s\n"
+    "fmla v28.4s, v1.4s, v5.4s\n"
+    "ldr q1, [x3, #0x100]\n"
+    "fmla v31.4s, v2.4s, v10.4s\n"
+    "ldr q10, [x15, x10]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v5.4s\n"
+    "fmla v28.4s, v2.4s, v6.4s\n"
+    "ldr q2, [x3, #0x110]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ld1 { v11.4s }, [x14]\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v29.4s, v3.4s, v6.4s\n"
+    "fmla v28.4s, v3.4s, v8.4s\n"
+    "ldr q3, [x3, #0x120]\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "ldr q12, [x14, x4]\n"
+    "fmla v30.4s, v4.4s, v14.4s\n"
+    "ld1 { v14.4s }, [x17]\n"
+    "fmla v29.4s, v4.4s, v8.4s\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "ldr q4, [x3, #0x130]\n"
+    "fmla v31.4s, v0.4s, v9.4s\n"
+    "ldr q9, [x14, x13]\n"
+    "fmla v30.4s, v0.4s, v13.4s\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x14, x12]\n"
+    "fmla v28.4s, v0.4s, v12.4s\n"
+    "ldr q0, [x3, #0x150]\n"
+    "fmla v31.4s, v1.4s, v13.4s\n"
+    "ldr q13, [x8, x13]\n"
+    "fmla v30.4s, v1.4s, v5.4s\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x14, x11]\n"
+    "fmla v28.4s, v1.4s, v9.4s\n"
+    "ldr q1, [x3, #0x160]\n"
+    "fmla v31.4s, v2.4s, v5.4s\n"
+    "ld1 { v5.4s }, [x5]\n"
+    "fmla v30.4s, v2.4s, v6.4s\n"
+    "fmla v29.4s, v2.4s, v9.4s\n"
+    "ldr q9, [x14, x10]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr q2, [x3, #0x170]\n"
+    "fmla v31.4s, v3.4s, v6.4s\n"
+    "ldr q6, [x5, x4]\n"
+    "fmla v30.4s, v3.4s, v8.4s\n"
+    "fmla v29.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x5, x12]\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "ldr q3, [x3, #0x180]\n"
+    "fmla v31.4s, v4.4s, v8.4s\n"
+    "ldr q8, [x8, x4]\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "ldr q10, [x8, x10]\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "ldr q12, [x5, x11]\n"
+    "fmla v28.4s, v4.4s, v9.4s\n"
+    "ldr q9, [x5, x13]\n"
+    "ldr q4, [x3, #0x190]\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "add x3, x3, #0x1a0\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "st1 { v31.4s }, [x7]\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "str q30, [x7, x6]\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "add x7, x7, #0x10\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "st1 { v29.4s }, [x9]\n"
+    "str q28, [x9, x6]\n"
+    "add x9, x9, #0x10\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+    "ldr q5, [x8, x12]\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+    "ldr q0, [x3, #0x0]\n"
+    "fmla v31.4s, v1.4s, v6.4s\n"
+    "ldr q6, [x8, x11]\n"
+    "add x8, x8, #0x10\n"
+    "fmla v30.4s, v1.4s, v9.4s\n"
+    "fmla v29.4s, v1.4s, v8.4s\n"
+    "fmla v28.4s, v1.4s, v13.4s\n"
+    "ldr q1, [x3, #0x10]\n"
+    "fmla v31.4s, v2.4s, v9.4s\n"
+    "ldr q9, [x5, x10]\n"
+    "add x5, x5, #0x10\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v13.4s\n"
+    "fmla v28.4s, v2.4s, v5.4s\n"
+    "ldr q2, [x3, #0x20]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x17, x4]\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v29.4s, v3.4s, v5.4s\n"
+    "fmla v28.4s, v3.4s, v6.4s\n"
+    "ldr q3, [x3, #0x30]\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "ldr q12, [x17, x13]\n"
+    "fmla v30.4s, v4.4s, v9.4s\n"
+    "ldr q9, [x17, x12]\n"
+    "fmla v29.4s, v4.4s, v6.4s\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "ldr q4, [x3, #0x40]\n"
+    "fmla v31.4s, v0.4s, v7.4s\n"
+    "fmla v30.4s, v0.4s, v8.4s\n"
+    "fmla v29.4s, v0.4s, v14.4s\n"
+    "fmla v28.4s, v0.4s, v11.4s\n"
+    "ldr q0, [x3, #0x50]\n"
+    "fmla v31.4s, v1.4s, v8.4s\n"
+    "ldr q8, [x17, x10]\n"
+    "fmla v30.4s, v1.4s, v13.4s\n"
+    "fmla v29.4s, v1.4s, v11.4s\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "ldr q1, [x3, #0x60]\n"
+    "fmla v31.4s, v2.4s, v13.4s\n"
+    "ldr q13, [x17, x11]\n"
+    "add x17, x17, #0x10\n"
+    "fmla v30.4s, v2.4s, v5.4s\n"
+    "fmla v29.4s, v2.4s, v12.4s\n"
+    "fmla v28.4s, v2.4s, v9.4s\n"
+    "ldr q2, [x3, #0x70]\n"
+    "fmla v31.4s, v3.4s, v5.4s\n"
+    "ld1 { v5.4s }, [x16]\n"
+    "fmla v30.4s, v3.4s, v6.4s\n"
+    "fmla v29.4s, v3.4s, v9.4s\n"
+    "fmla v28.4s, v3.4s, v13.4s\n"
+    "ldr q3, [x3, #0x80]\n"
+    "fmla v31.4s, v4.4s, v6.4s\n"
+    "ldr q6, [x16, x4]\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "ldr q10, [x16, x13]\n"
+    "fmla v29.4s, v4.4s, v13.4s\n"
+    "fmla v28.4s, v4.4s, v8.4s\n"
+    "ldr q4, [x3, #0x90]\n"
+    "fmla v31.4s, v0.4s, v14.4s\n"
+    "ldr q14, [x16, x10]\n"
+    "fmla v30.4s, v0.4s, v11.4s\n"
+    "fmla v29.4s, v0.4s, v5.4s\n"
+    "fmla v28.4s, v0.4s, v6.4s\n"
+    "ldr q0, [x3, #0xa0]\n"
+    "fmla v31.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x16, x12]\n"
+    "fmla v30.4s, v1.4s, v12.4s\n"
+    "fmla v29.4s, v1.4s, v6.4s\n"
+    "fmla v28.4s, v1.4s, v10.4s\n"
+    "ldr q1, [x3, #0xb0]\n"
+    "fmla v31.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x16, x11]\n"
+    "add x16, x16, #0x10\n"
+    "fmla v30.4s, v2.4s, v9.4s\n"
+    "fmla v29.4s, v2.4s, v10.4s\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr q2, [x3, #0xc0]\n"
+    "fmla v31.4s, v3.4s, v9.4s\n"
+    "ld1 { v9.4s }, [x15]\n"
+    "fmla v30.4s, v3.4s, v13.4s\n"
+    "fmla v29.4s, v3.4s, v11.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "ldr q3, [x3, #0xd0]\n"
+    "fmla v31.4s, v4.4s, v13.4s\n"
+    "ldr q13, [x15, x4]\n"
+    "fmla v30.4s, v4.4s, v8.4s\n"
+    "ldr q8, [x15, x11]\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "fmla v28.4s, v4.4s, v14.4s\n"
+    "ldr q4, [x3, #0xe0]\n"
+    "fmla v31.4s, v0.4s, v5.4s\n"
+    "ldr q5, [x15, x13]\n"
+    "fmla v30.4s, v0.4s, v6.4s\n"
+    "fmla v29.4s, v0.4s, v9.4s\n"
+    "fmla v28.4s, v0.4s, v13.4s\n"
+    "ldr q0, [x3, #0xf0]\n"
+    "fmla v31.4s, v1.4s, v6.4s\n"
+    "ldr q6, [x15, x12]\n"
+    "fmla v30.4s, v1.4s, v10.4s\n"
+    "fmla v29.4s, v1.4s, v13.4s\n"
+    "fmla v28.4s, v1.4s, v5.4s\n"
+    "ldr q1, [x3, #0x100]\n"
+    "fmla v31.4s, v2.4s, v10.4s\n"
+    "ldr q10, [x15, x10]\n"
+    "add x15, x15, #0x10\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v5.4s\n"
+    "fmla v28.4s, v2.4s, v6.4s\n"
+    "ldr q2, [x3, #0x110]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ld1 { v11.4s }, [x14]\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v29.4s, v3.4s, v6.4s\n"
+    "fmla v28.4s, v3.4s, v8.4s\n"
+    "ldr q3, [x3, #0x120]\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "ldr q12, [x14, x4]\n"
+    "fmla v30.4s, v4.4s, v14.4s\n"
+    "fmla v29.4s, v4.4s, v8.4s\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "ldr q4, [x3, #0x130]\n"
+    "add x3, x3, #0x140\n"
+    "fmla v31.4s, v0.4s, v9.4s\n"
+    "ldr q9, [x14, x13]\n"
+    "fmla v30.4s, v0.4s, v13.4s\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x14, x12]\n"
+    "fmla v28.4s, v0.4s, v12.4s\n"
+    "fmla v31.4s, v1.4s, v13.4s\n"
+    "fmla v30.4s, v1.4s, v5.4s\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x14, x11]\n"
+    "fmla v28.4s, v1.4s, v9.4s\n"
+    "fmla v31.4s, v2.4s, v5.4s\n"
+    "fmla v30.4s, v2.4s, v6.4s\n"
+    "fmla v29.4s, v2.4s, v9.4s\n"
+    "ldr q9, [x14, x10]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "fmla v31.4s, v3.4s, v6.4s\n"
+    "fmla v30.4s, v3.4s, v8.4s\n"
+    "fmla v29.4s, v3.4s, v11.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "fmla v31.4s, v4.4s, v8.4s\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "fmla v28.4s, v4.4s, v9.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "st1 { v31.4s }, [x7]\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "str q30, [x7, x6]\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "add x7, x7, #0x10\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "st1 { v29.4s }, [x9]\n"
+    "str q28, [x9, x6]\n"
+    "add x9, x9, #0x10\n"
+    "4:"  // Tile loop: Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 61f\n"
+    "ldr q16, [x3, #0x0]\n"
+    "ldr q0, [x3, #0x10]\n"
+    "add x28, x5, XZR\n"
+    "ldr q1, [x3, #0x20]\n"
+    "add x27, x5, x4\n"
+    "ldr q2, [x3, #0x30]\n"
+    "add x26, x8, XZR\n"
+    "ldr q3, [x3, #0x40]\n"
+    "add x25, x8, x4\n"
+    "ldr q4, [x3, #0x50]\n"
+    "add x24, x5, x13\n"
+    "add x23, x8, x13\n"
+    "add x22, x5, x12\n"
+    "add x21, x5, x11\n"
+    "add x20, x8, x10\n"
+    "add x19, x17, XZR\n"
+    "add x3, x3, #0x60\n"
+    "tbz %x[n_channels], #1, 5f\n"
+    "ldr d5, [x28], #0x8\n"
+    "ldr d6, [x27], #0x8\n"
+    "ldr d7, [x26], #0x8\n"
+    "ldr d8, [x25], #0x8\n"
+    "ldr d9, [x24], #0x8\n"
+    "ldr d13, [x23], #0x8\n"
+    "ldr d11, [x22], #0x8\n"
+    "ldr d12, [x21], #0x8\n"
+    "ldr d10, [x20], #0x8\n"
+    "ldr d14, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 6f\n"
+    "ld1 { v5.s }[2], [x28]\n"
+    "ld1 { v6.s }[2], [x27]\n"
+    "ld1 { v7.s }[2], [x26]\n"
+    "ld1 { v8.s }[2], [x25]\n"
+    "ld1 { v9.s }[2], [x24]\n"
+    "ld1 { v13.s }[2], [x23]\n"
+    "ld1 { v11.s }[2], [x22]\n"
+    "ld1 { v12.s }[2], [x21]\n"
+    "ld1 { v10.s }[2], [x20]\n"
+    "ld1 { v14.s }[2], [x19]\n"
+    "b 6f\n"
+    "5:"  // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+    "ldr s5, [x28, #0x0]\n"
+    "ldr s6, [x27, #0x0]\n"
+    "ldr s7, [x26, #0x0]\n"
+    "ldr s8, [x25, #0x0]\n"
+    "ldr s9, [x24, #0x0]\n"
+    "ldr s13, [x23, #0x0]\n"
+    "ldr s11, [x22, #0x0]\n"
+    "ldr s12, [x21, #0x0]\n"
+    "ldr s10, [x20, #0x0]\n"
+    "ldr s14, [x19, #0x0]\n"
+    "6:"  // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+    "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+    "add x19, x8, x12\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+    "fmla v31.4s, v1.4s, v6.4s\n"
+    "fmla v30.4s, v1.4s, v9.4s\n"
+    "fmla v29.4s, v1.4s, v8.4s\n"
+    "fmla v28.4s, v1.4s, v13.4s\n"
+    "fmla v31.4s, v2.4s, v9.4s\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ldr d5, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v5.s }[2], [x19]\n"
+    "b 8f\n"
+    "7:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+    "ldr s5, [x19, #0x0]\n"
+    "8:"  // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+    "fmla v28.4s, v2.4s, v5.4s\n"
+    "add x19, x8, x11\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v29.4s, v3.4s, v5.4s\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr d6, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v6.s }[2], [x19]\n"
+    "b 10f\n"
+    "9:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+    "ldr s6, [x19, #0x0]\n"
+    "10:"  // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+    "fmla v28.4s, v3.4s, v6.4s\n"
+    "add x19, x5, x10\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ldr d9, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 12f\n"
+    "ld1 { v9.s }[2], [x19]\n"
+    "b 12f\n"
+    "11:"  // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: Unset
+    "ldr s9, [x19, #0x0]\n"
+    "12:"  // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: End
+    "fmla v30.4s, v4.4s, v9.4s\n"
+    "ldr s0, [x3, #0x18]\n"
+    "add x19, x17, x4\n"
+    "fmla v29.4s, v4.4s, v6.4s\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "fmla v31.4s, v0.4s, v7.4s\n"
+    "fmla v30.4s, v0.4s, v8.4s\n"
+    "fmla v29.4s, v0.4s, v14.4s\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 14f\n"
+    "13:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "14:"  // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+    "fmla v28.4s, v0.4s, v11.4s\n"
+    "ldr s1, [x3, #0x1c]\n"
+    "add x19, x17, x13\n"
+    "fmla v31.4s, v1.4s, v8.4s\n"
+    "fmla v30.4s, v1.4s, v13.4s\n"
+    "fmla v29.4s, v1.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 16f\n"
+    "15:"  // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "16:"  // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "ldr s2, [x3, #0x20]\n"
+    "add x19, x17, x12\n"
+    "fmla v31.4s, v2.4s, v13.4s\n"
+    "fmla v30.4s, v2.4s, v5.4s\n"
+    "fmla v29.4s, v2.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 17f\n"
+    "ldr d9, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v9.s }[2], [x19]\n"
+    "b 18f\n"
+    "17:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+    "ldr s9, [x19, #0x0]\n"
+    "18:"  // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+    "fmla v28.4s, v2.4s, v9.4s\n"
+    "ldr s3, [x3, #0x24]\n"
+    "add x19, x17, x11\n"
+    "fmla v31.4s, v3.4s, v5.4s\n"
+    "fmla v30.4s, v3.4s, v6.4s\n"
+    "fmla v29.4s, v3.4s, v9.4s\n"
+    "tbz %x[n_channels], #1, 19f\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 20f\n"
+    "19:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+    "ldr s13, [x19, #0x0]\n"
+    "20:"  // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+    "fmla v28.4s, v3.4s, v13.4s\n"
+    "ldr s4, [x3, #0x28]\n"
+    "add x19, x17, x10\n"
+    "fmla v31.4s, v4.4s, v6.4s\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v4.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 21f\n"
+    "ldr d8, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 22f\n"
+    "ld1 { v8.s }[2], [x19]\n"
+    "b 22f\n"
+    "21:"  // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+    "ldr s8, [x19, #0x0]\n"
+    "22:"  // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+    "fmla v28.4s, v4.4s, v8.4s\n"
+    "ldr s0, [x3, #0x2c]\n"
+    "add x19, x16, XZR\n"
+    "fmla v31.4s, v0.4s, v14.4s\n"
+    "fmla v30.4s, v0.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 23f\n"
+    "ldr d5, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 24f\n"
+    "ld1 { v5.s }[2], [x19]\n"
+    "b 24f\n"
+    "23:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+    "ldr s5, [x19, #0x0]\n"
+    "24:"  // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+    "fmla v29.4s, v0.4s, v5.4s\n"
+    "add x19, x16, x4\n"
+    "tbz %x[n_channels], #1, 25f\n"
+    "ldr d6, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 26f\n"
+    "ld1 { v6.s }[2], [x19]\n"
+    "b 26f\n"
+    "25:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+    "ldr s6, [x19, #0x0]\n"
+    "26:"  // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+    "fmla v28.4s, v0.4s, v6.4s\n"
+    "ldr s1, [x3, #0x30]\n"
+    "add x19, x16, x13\n"
+    "fmla v31.4s, v1.4s, v11.4s\n"
+    "fmla v30.4s, v1.4s, v12.4s\n"
+    "fmla v29.4s, v1.4s, v6.4s\n"
+    "tbz %x[n_channels], #1, 27f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 28f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 28f\n"
+    "27:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "28:"  // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+    "fmla v28.4s, v1.4s, v10.4s\n"
+    "ldr s2, [x3, #0x34]\n"
+    "add x19, x16, x12\n"
+    "fmla v31.4s, v2.4s, v12.4s\n"
+    "fmla v30.4s, v2.4s, v9.4s\n"
+    "fmla v29.4s, v2.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 29f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 30f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 30f\n"
+    "29:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "30:"  // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr s3, [x3, #0x38]\n"
+    "add x19, x16, x11\n"
+    "fmla v31.4s, v3.4s, v9.4s\n"
+    "fmla v30.4s, v3.4s, v13.4s\n"
+    "fmla v29.4s, v3.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 31f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 32f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 32f\n"
+    "31:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "32:"  // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "ldr s4, [x3, #0x3c]\n"
+    "add x19, x16, x10\n"
+    "fmla v31.4s, v4.4s, v13.4s\n"
+    "fmla v30.4s, v4.4s, v8.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 33f\n"
+    "ldr d14, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 34f\n"
+    "ld1 { v14.s }[2], [x19]\n"
+    "b 34f\n"
+    "33:"  // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+    "ldr s14, [x19, #0x0]\n"
+    "34:"  // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+    "fmla v28.4s, v4.4s, v14.4s\n"
+    "ldr s0, [x3, #0x40]\n"
+    "add x19, x15, XZR\n"
+    "fmla v31.4s, v0.4s, v5.4s\n"
+    "fmla v30.4s, v0.4s, v6.4s\n"
+    "tbz %x[n_channels], #1, 35f\n"
+    "ldr d9, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 36f\n"
+    "ld1 { v9.s }[2], [x19]\n"
+    "b 36f\n"
+    "35:"  // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+    "ldr s9, [x19, #0x0]\n"
+    "36:"  // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+    "fmla v29.4s, v0.4s, v9.4s\n"
+    "add x19, x15, x4\n"
+    "tbz %x[n_channels], #1, 37f\n"
+    "ldr d13, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 38f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 38f\n"
+    "37:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+    "ldr s13, [x19, #0x0]\n"
+    "38:"  // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+    "fmla v28.4s, v0.4s, v13.4s\n"
+    "ldr s1, [x3, #0x44]\n"
+    "add x19, x15, x13\n"
+    "fmla v31.4s, v1.4s, v6.4s\n"
+    "fmla v30.4s, v1.4s, v10.4s\n"
+    "fmla v29.4s, v1.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 39f\n"
+    "ldr d5, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 40f\n"
+    "ld1 { v5.s }[2], [x19]\n"
+    "b 40f\n"
+    "39:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+    "ldr s5, [x19, #0x0]\n"
+    "40:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+    "fmla v28.4s, v1.4s, v5.4s\n"
+    "ldr s2, [x3, #0x48]\n"
+    "add x19, x15, x12\n"
+    "fmla v31.4s, v2.4s, v10.4s\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v5.4s\n"
+    "tbz %x[n_channels], #1, 41f\n"
+    "ldr d6, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 42f\n"
+    "ld1 { v6.s }[2], [x19]\n"
+    "b 42f\n"
+    "41:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+    "ldr s6, [x19, #0x0]\n"
+    "42:"  // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+    "fmla v28.4s, v2.4s, v6.4s\n"
+    "ldr s3, [x3, #0x4c]\n"
+    "add x19, x15, x11\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v29.4s, v3.4s, v6.4s\n"
+    "tbz %x[n_channels], #1, 43f\n"
+    "ldr d8, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 44f\n"
+    "ld1 { v8.s }[2], [x19]\n"
+    "b 44f\n"
+    "43:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+    "ldr s8, [x19, #0x0]\n"
+    "44:"  // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+    "fmla v28.4s, v3.4s, v8.4s\n"
+    "ldr s4, [x3, #0x50]\n"
+    "add x19, x15, x10\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "fmla v30.4s, v4.4s, v14.4s\n"
+    "fmla v29.4s, v4.4s, v8.4s\n"
+    "tbz %x[n_channels], #1, 45f\n"
+    "ldr d10, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 46f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 46f\n"
+    "45:"  // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+    "ldr s10, [x19, #0x0]\n"
+    "46:"  // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "ldr s0, [x3, #0x54]\n"
+    "add x19, x14, XZR\n"
+    "fmla v31.4s, v0.4s, v9.4s\n"
+    "fmla v30.4s, v0.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 47f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 48f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 48f\n"
+    "47:"  // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "48:"  // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "add x19, x14, x4\n"
+    "tbz %x[n_channels], #1, 49f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 50f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 50f\n"
+    "49:"  // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "50:"  // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+    "fmla v28.4s, v0.4s, v12.4s\n"
+    "ldr s1, [x3, #0x58]\n"
+    "add x19, x14, x13\n"
+    "fmla v31.4s, v1.4s, v13.4s\n"
+    "fmla v30.4s, v1.4s, v5.4s\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 51f\n"
+    "ldr d9, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 52f\n"
+    "ld1 { v9.s }[2], [x19]\n"
+    "b 52f\n"
+    "51:"  // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+    "ldr s9, [x19, #0x0]\n"
+    "52:"  // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+    "fmla v28.4s, v1.4s, v9.4s\n"
+    "ldr s2, [x3, #0x5c]\n"
+    "add x19, x14, x12\n"
+    "fmla v31.4s, v2.4s, v5.4s\n"
+    "fmla v30.4s, v2.4s, v6.4s\n"
+    "fmla v29.4s, v2.4s, v9.4s\n"
+    "tbz %x[n_channels], #1, 53f\n"
+    "ldr d11, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 54f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 54f\n"
+    "53:"  // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+    "ldr s11, [x19, #0x0]\n"
+    "54:"  // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr s3, [x3, #0x60]\n"
+    "add x19, x14, x11\n"
+    "fmla v31.4s, v3.4s, v6.4s\n"
+    "fmla v30.4s, v3.4s, v8.4s\n"
+    "fmla v29.4s, v3.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 55f\n"
+    "ldr d12, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 56f\n"
+    "ld1 { v12.s }[2], [x19]\n"
+    "b 56f\n"
+    "55:"  // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+    "ldr s12, [x19, #0x0]\n"
+    "56:"  // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "ldr s4, [x3, #0x64]\n"
+    "add x19, x14, x10\n"
+    "fmla v31.4s, v4.4s, v8.4s\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 57f\n"
+    "ldr d9, [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 58f\n"
+    "ld1 { v9.s }[2], [x19]\n"
+    "b 58f\n"
+    "57:"  // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+    "ldr s9, [x19, #0x0]\n"
+    "58:"  // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+    "fmla v28.4s, v4.4s, v9.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "tbz %x[n_channels], #1, 59f\n"
+    "mov x19, x7\n"
+    "st1 { v31.d }[0], [x19], x6\n"
+    "add x7, x7, #0x8\n"
+    "st1 { v30.d }[0], [x19]\n"
+    "mov x19, x9\n"
+    "st1 { v29.d }[0], [x19], x6\n"
+    "add x9, x9, #0x8\n"
+    "st1 { v28.d }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 60f\n"
+    "mov x20, x7\n"
+    "st1 { v31.s }[2], [x20], x6\n"
+    "mov x19, x9\n"
+    "st1 { v30.s }[2], [x20]\n"
+    "st1 { v29.s }[2], [x19], x6\n"
+    "st1 { v28.s }[2], [x19]\n"
+    "b 60f\n"
+    "59:"  // Tile loop: Oddments: Store: Bit 1: Unset
+    "mov x20, x7\n"
+    "st1 { v31.s }[0], [x20], x6\n"
+    "mov x19, x9\n"
+    "st1 { v30.s }[0], [x20]\n"
+    "st1 { v29.s }[0], [x19], x6\n"
+    "st1 { v28.s }[0], [x19]\n"
+    "60:"  // Tile loop: Oddments: Store: Bit 1: End
+
+    "61:"  // Tile loop: End
+    "ldr x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x28, #0x1\n"
+    "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x27, x27, #0x1\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x27, x19\n"
+    "csel x27, x27, XZR, LT\n"
+    "csel x28, x28, x21, LT\n"
+    "cmp x28, x20\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..de66a8c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1018 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
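+// Rough scalar equivalent of this kernel, for orientation only (the packed
+// parameter layout -- one bias vector followed by 25 weight vectors -- is
+// inferred from the loads in the assembly below): each channel applies a
+// 5x5 multiply-accumulate window over a 6x6 grid of input pointers to
+// produce a 2x2 output tile, clamped to [activation_min, activation_max].
+//
+//   for (unsigned int c = 0; c < n_channels; c++)
+//     for (int oi = 0; oi < 2; oi++)
+//       for (int oj = 0; oj < 2; oj++)
+//       {
+//         float acc = bias[c];
+//         for (int ki = 0; ki < 5; ki++)
+//           for (int kj = 0; kj < 5; kj++)
+//             acc += input_ptrs[(oi + ki)*6 + (oj + kj)][c] * weight[ki][kj][c];
+//         outptrs[oi*2 + oj][c] = std::min(std::max(acc, activation_min), activation_max);
+//       }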
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[36];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
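+      // Entries 0-12 are a fixed permutation of the first 13 input pointers,
+      // presumably matching the order in which the assembly consumes its
+      // initial inputs; the remaining entries keep their row-major position.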
+      inptrs[0] = input_ptrs[0];
+      inptrs[1] = input_ptrs[1];
+      inptrs[2] = input_ptrs[6];
+      inptrs[3] = input_ptrs[7];
+      inptrs[4] = input_ptrs[2];
+      inptrs[5] = input_ptrs[8];
+      inptrs[6] = input_ptrs[3];
+      inptrs[7] = input_ptrs[4];
+      inptrs[8] = input_ptrs[11];
+      inptrs[9] = input_ptrs[12];
+      inptrs[10] = input_ptrs[9];
+      inptrs[11] = input_ptrs[10];
+      inptrs[12] = input_ptrs[5];
+      // Pointers 13..35 keep their original positions.
+      for (unsigned int i = 13; i < 36; i++)
+      {
+        inptrs[i] = input_ptrs[i];
+      }
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
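+  // Structure of the assembly: the main "Channel loop" processes four
+  // channels per iteration with full 128-bit vectors, the "Channel tail"
+  // handles the final full vector, and the "Oddments" path tests the low
+  // bits of n_channels to load and store the last one to three channels
+  // lane by lane.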
+  __asm__ __volatile__(
+    "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "add x19, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v18.4s }, [x20]\n"
+    "ld1r { v17.4s }, [x19]\n"
+    "mov x14, #0x0\n"
+    "ldp x13, x12, [x21, #0x0]\n"
+    "mov x11, #0x10\n" // cntb _, ALL, #1
+    "ldp x10, x9, [x21, #0x10]\n"
+    "sub x28, XZR, x11\n"
+    "lsr x27, %x[n_channels], #0x2\n"
+    "cbz x27, 3f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "cmp x11, x27, LSL #4\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "add x15, x15, #0x60\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    "ldp x22, x21, [x16, #0x20]\n"
+    "ldr q5, [x26, x14]\n"
+    "ldr q6, [x25, x14]\n"
+    "ldr q7, [x24, x14]\n"
+    "ldr q8, [x23, x14]\n"
+    "ldr q9, [x22, x14]\n"
+    "ldr q13, [x21, x14]\n"
+    "ldp x20, x19, [x16, #0x30]\n"
+    "ldp x26, x25, [x16, #0x40]\n"
+    "ldr q11, [x20, x14]\n"
+    "ldr q12, [x19, x14]\n"
+    "ldr q10, [x26, x14]\n"
+    "ldr q14, [x25, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+    "ldr x24, [x16, #0x50]\n"
+    "add x28, x28, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n"
+    "ldr x23, [x16, #0x58]\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+    "ldr x22, [x16, #0x60]\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+    "ldr q5, [x24, x14]\n"
+    "ldr q0, [x15, #0x0]\n"
+    "fmla v31.4s, v1.4s, v6.4s\n"
+    "ldr q6, [x23, x14]\n"
+    "fmla v30.4s, v1.4s, v9.4s\n"
+    "fmla v29.4s, v1.4s, v8.4s\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v28.4s, v1.4s, v13.4s\n"
+    "ldr q1, [x15, #0x10]\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v31.4s, v2.4s, v9.4s\n"
+    "ldr q9, [x22, x14]\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v13.4s\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v28.4s, v2.4s, v5.4s\n"
+    "ldr q2, [x15, #0x20]\n"
+    "ldr x26, [x16, #0x80]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x21, x14]\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v29.4s, v3.4s, v5.4s\n"
+    "ldr x25, [x16, #0x88]\n"
+    "fmla v28.4s, v3.4s, v6.4s\n"
+    "ldr q3, [x15, #0x30]\n"
+    "ldr x24, [x16, #0x90]\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "ldr q12, [x20, x14]\n"
+    "fmla v30.4s, v4.4s, v9.4s\n"
+    "fmla v29.4s, v4.4s, v6.4s\n"
+    "ldr q9, [x19, x14]\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "ldr q4, [x15, #0x40]\n"
+    "ldr x23, [x16, #0x98]\n"
+    "fmla v31.4s, v0.4s, v7.4s\n"
+    "ldr x22, [x16, #0xa0]\n"
+    "fmla v30.4s, v0.4s, v8.4s\n"
+    "fmla v29.4s, v0.4s, v14.4s\n"
+    "ldr x21, [x16, #0xa8]\n"
+    "fmla v28.4s, v0.4s, v11.4s\n"
+    "ldr q0, [x15, #0x50]\n"
+    "ldr x20, [x16, #0xb0]\n"
+    "fmla v31.4s, v1.4s, v8.4s\n"
+    "ldr q8, [x25, x14]\n"
+    "fmla v30.4s, v1.4s, v13.4s\n"
+    "fmla v29.4s, v1.4s, v11.4s\n"
+    "ldr x19, [x16, #0xb8]\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "ldr q1, [x15, #0x60]\n"
+    "ldr x25, [x16, #0xc8]\n"
+    "fmla v31.4s, v2.4s, v13.4s\n"
+    "ldr q13, [x26, x14]\n"
+    "fmla v30.4s, v2.4s, v5.4s\n"
+    "fmla v29.4s, v2.4s, v12.4s\n"
+    "ldr x26, [x16, #0xc0]\n"
+    "fmla v28.4s, v2.4s, v9.4s\n"
+    "ldr q2, [x15, #0x70]\n"
+    "ldr q16, [x15, #0x140]\n"
+    "fmla v31.4s, v3.4s, v5.4s\n"
+    "ldr q5, [x24, x14]\n"
+    "fmla v30.4s, v3.4s, v6.4s\n"
+    "fmla v29.4s, v3.4s, v9.4s\n"
+    "ldr x24, [x16, #0xd0]\n"
+    "fmla v28.4s, v3.4s, v13.4s\n"
+    "ldr q3, [x15, #0x80]\n"
+    "fmla v31.4s, v4.4s, v6.4s\n"
+    "ldr q6, [x23, x14]\n"
+    "ldr x23, [x16, #0xd8]\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v4.4s, v13.4s\n"
+    "ldr q10, [x22, x14]\n"
+    "fmla v28.4s, v4.4s, v8.4s\n"
+    "ldr q4, [x15, #0x90]\n"
+    "ldr x22, [x16, #0xe0]\n"
+    "fmla v31.4s, v0.4s, v14.4s\n"
+    "ldr q14, [x19, x14]\n"
+    "fmla v30.4s, v0.4s, v11.4s\n"
+    "fmla v29.4s, v0.4s, v5.4s\n"
+    "ldr x19, [x16, #0xf8]\n"
+    "fmla v28.4s, v0.4s, v6.4s\n"
+    "ldr q0, [x15, #0xa0]\n"
+    "fmla v31.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x21, x14]\n"
+    "ldr x21, [x16, #0xe8]\n"
+    "fmla v30.4s, v1.4s, v12.4s\n"
+    "fmla v29.4s, v1.4s, v6.4s\n"
+    "fmla v28.4s, v1.4s, v10.4s\n"
+    "ldr q1, [x15, #0xb0]\n"
+    "fmla v31.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x20, x14]\n"
+    "ldr x20, [x16, #0xf0]\n"
+    "fmla v30.4s, v2.4s, v9.4s\n"
+    "fmla v29.4s, v2.4s, v10.4s\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr q2, [x15, #0xc0]\n"
+    "fmla v31.4s, v3.4s, v9.4s\n"
+    "ldr q9, [x26, x14]\n"
+    "ldr x26, [x16, #0x100]\n"
+    "fmla v30.4s, v3.4s, v13.4s\n"
+    "fmla v29.4s, v3.4s, v11.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "ldr q3, [x15, #0xd0]\n"
+    "fmla v31.4s, v4.4s, v13.4s\n"
+    "ldr q13, [x25, x14]\n"
+    "ldr x25, [x16, #0x108]\n"
+    "fmla v30.4s, v4.4s, v8.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "ldr q8, [x22, x14]\n"
+    "fmla v28.4s, v4.4s, v14.4s\n"
+    "ldr q4, [x15, #0xe0]\n"
+    "fmla v31.4s, v0.4s, v5.4s\n"
+    "ldr q5, [x24, x14]\n"
+    "ldr x24, [x16, #0x110]\n"
+    "fmla v30.4s, v0.4s, v6.4s\n"
+    "fmla v29.4s, v0.4s, v9.4s\n"
+    "fmla v28.4s, v0.4s, v13.4s\n"
+    "ldr q0, [x15, #0xf0]\n"
+    "fmla v31.4s, v1.4s, v6.4s\n"
+    "ldr q6, [x23, x14]\n"
+    "ldr x23, [x16, #0x118]\n"
+    "fmla v30.4s, v1.4s, v10.4s\n"
+    "fmla v29.4s, v1.4s, v13.4s\n"
+    "fmla v28.4s, v1.4s, v5.4s\n"
+    "ldr q1, [x15, #0x100]\n"
+    "fmla v31.4s, v2.4s, v10.4s\n"
+    "ldr q10, [x21, x14]\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v5.4s\n"
+    "fmla v28.4s, v2.4s, v6.4s\n"
+    "ldr q2, [x15, #0x110]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x20, x14]\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v29.4s, v3.4s, v6.4s\n"
+    "fmla v28.4s, v3.4s, v8.4s\n"
+    "ldr q3, [x15, #0x120]\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "ldr q12, [x19, x14]\n"
+    "fmla v30.4s, v4.4s, v14.4s\n"
+    "fmla v29.4s, v4.4s, v8.4s\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "ldr q4, [x15, #0x130]\n"
+    "fmla v31.4s, v0.4s, v9.4s\n"
+    "ldr q9, [x26, x14]\n"
+    "fmla v30.4s, v0.4s, v13.4s\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x25, x14]\n"
+    "fmla v28.4s, v0.4s, v12.4s\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    "ldr q0, [x15, #0x150]\n"
+    "fmla v31.4s, v1.4s, v13.4s\n"
+    "fmla v30.4s, v1.4s, v5.4s\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x24, x14]\n"
+    "fmla v28.4s, v1.4s, v9.4s\n"
+    "ldr q1, [x15, #0x160]\n"
+    "fmla v31.4s, v2.4s, v5.4s\n"
+    "ldr q5, [x26, x11]\n"
+    "fmla v30.4s, v2.4s, v6.4s\n"
+    "fmla v29.4s, v2.4s, v9.4s\n"
+    "ldr q9, [x23, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    "ldp x22, x21, [x16, #0x20]\n"
+    "fmla v31.4s, v3.4s, v6.4s\n"
+    "ldr q6, [x25, x11]\n"
+    "fmla v30.4s, v3.4s, v8.4s\n"
+    "fmla v29.4s, v3.4s, v11.4s\n"
+    "ldr q7, [x24, x11]\n"
+    "ldr q13, [x21, x11]\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "ldp x20, x19, [x16, #0x30]\n"
+    "fmla v31.4s, v4.4s, v8.4s\n"
+    "ldr q8, [x23, x11]\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "ldr q11, [x20, x11]\n"
+    "ldr q12, [x19, x11]\n"
+    "fmla v28.4s, v4.4s, v9.4s\n"
+    "ldr q9, [x22, x11]\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "ldp x26, x25, [x16, #0x40]\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "ldr q2, [x15, #0x170]\n"
+    "ldr q3, [x15, #0x180]\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "ldr q10, [x26, x11]\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "ldr q14, [x25, x11]\n"
+    "add x11, x11, #0x10\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "str q31, [x13, x28]\n"
+    "cmp x11, x27, LSL #4\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "str q30, [x12, x28]\n"
+    "ldr q4, [x15, #0x190]\n"
+    "add x15, x15, #0x1a0\n"
+    "str q29, [x10, x28]\n"
+    "str q28, [x9, x28]\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+    "ldr x24, [x16, #0x50]\n"
+    "add x28, x28, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n"
+    "ldr x23, [x16, #0x58]\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+    "ldr x22, [x16, #0x60]\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+    "ldr q5, [x24, x14]\n"
+    "ldr q0, [x15, #0x0]\n"
+    "fmla v31.4s, v1.4s, v6.4s\n"
+    "ldr q6, [x23, x14]\n"
+    "fmla v30.4s, v1.4s, v9.4s\n"
+    "fmla v29.4s, v1.4s, v8.4s\n"
+    "ldr x21, [x16, #0x68]\n"
+    "fmla v28.4s, v1.4s, v13.4s\n"
+    "ldr q1, [x15, #0x10]\n"
+    "ldr x20, [x16, #0x70]\n"
+    "fmla v31.4s, v2.4s, v9.4s\n"
+    "ldr q9, [x22, x14]\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v13.4s\n"
+    "ldr x19, [x16, #0x78]\n"
+    "fmla v28.4s, v2.4s, v5.4s\n"
+    "ldr q2, [x15, #0x20]\n"
+    "ldr x26, [x16, #0x80]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x21, x14]\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v29.4s, v3.4s, v5.4s\n"
+    "ldr x25, [x16, #0x88]\n"
+    "fmla v28.4s, v3.4s, v6.4s\n"
+    "ldr q3, [x15, #0x30]\n"
+    "ldr x24, [x16, #0x90]\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "ldr q12, [x20, x14]\n"
+    "fmla v30.4s, v4.4s, v9.4s\n"
+    "fmla v29.4s, v4.4s, v6.4s\n"
+    "ldr q9, [x19, x14]\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "ldr q4, [x15, #0x40]\n"
+    "ldr x23, [x16, #0x98]\n"
+    "fmla v31.4s, v0.4s, v7.4s\n"
+    "ldr x22, [x16, #0xa0]\n"
+    "fmla v30.4s, v0.4s, v8.4s\n"
+    "fmla v29.4s, v0.4s, v14.4s\n"
+    "ldr x21, [x16, #0xa8]\n"
+    "fmla v28.4s, v0.4s, v11.4s\n"
+    "ldr q0, [x15, #0x50]\n"
+    "ldr x20, [x16, #0xb0]\n"
+    "fmla v31.4s, v1.4s, v8.4s\n"
+    "ldr q8, [x25, x14]\n"
+    "fmla v30.4s, v1.4s, v13.4s\n"
+    "fmla v29.4s, v1.4s, v11.4s\n"
+    "ldr x19, [x16, #0xb8]\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "ldr q1, [x15, #0x60]\n"
+    "ldr x25, [x16, #0xc8]\n"
+    "fmla v31.4s, v2.4s, v13.4s\n"
+    "ldr q13, [x26, x14]\n"
+    "fmla v30.4s, v2.4s, v5.4s\n"
+    "fmla v29.4s, v2.4s, v12.4s\n"
+    "ldr x26, [x16, #0xc0]\n"
+    "fmla v28.4s, v2.4s, v9.4s\n"
+    "ldr q2, [x15, #0x70]\n"
+    "fmla v31.4s, v3.4s, v5.4s\n"
+    "ldr q5, [x24, x14]\n"
+    "ldr x24, [x16, #0xd0]\n"
+    "fmla v30.4s, v3.4s, v6.4s\n"
+    "fmla v29.4s, v3.4s, v9.4s\n"
+    "fmla v28.4s, v3.4s, v13.4s\n"
+    "ldr q3, [x15, #0x80]\n"
+    "fmla v31.4s, v4.4s, v6.4s\n"
+    "ldr q6, [x23, x14]\n"
+    "ldr x23, [x16, #0xd8]\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v4.4s, v13.4s\n"
+    "ldr q10, [x22, x14]\n"
+    "fmla v28.4s, v4.4s, v8.4s\n"
+    "ldr q4, [x15, #0x90]\n"
+    "ldr x22, [x16, #0xe0]\n"
+    "fmla v31.4s, v0.4s, v14.4s\n"
+    "ldr q14, [x19, x14]\n"
+    "fmla v30.4s, v0.4s, v11.4s\n"
+    "fmla v29.4s, v0.4s, v5.4s\n"
+    "ldr x19, [x16, #0xf8]\n"
+    "fmla v28.4s, v0.4s, v6.4s\n"
+    "ldr q0, [x15, #0xa0]\n"
+    "fmla v31.4s, v1.4s, v11.4s\n"
+    "ldr q11, [x21, x14]\n"
+    "ldr x21, [x16, #0xe8]\n"
+    "fmla v30.4s, v1.4s, v12.4s\n"
+    "fmla v29.4s, v1.4s, v6.4s\n"
+    "fmla v28.4s, v1.4s, v10.4s\n"
+    "ldr q1, [x15, #0xb0]\n"
+    "fmla v31.4s, v2.4s, v12.4s\n"
+    "ldr q12, [x20, x14]\n"
+    "ldr x20, [x16, #0xf0]\n"
+    "fmla v30.4s, v2.4s, v9.4s\n"
+    "fmla v29.4s, v2.4s, v10.4s\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr q2, [x15, #0xc0]\n"
+    "fmla v31.4s, v3.4s, v9.4s\n"
+    "ldr q9, [x26, x14]\n"
+    "ldr x26, [x16, #0x100]\n"
+    "fmla v30.4s, v3.4s, v13.4s\n"
+    "fmla v29.4s, v3.4s, v11.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "ldr q3, [x15, #0xd0]\n"
+    "fmla v31.4s, v4.4s, v13.4s\n"
+    "ldr q13, [x25, x14]\n"
+    "ldr x25, [x16, #0x108]\n"
+    "fmla v30.4s, v4.4s, v8.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "ldr q8, [x22, x14]\n"
+    "fmla v28.4s, v4.4s, v14.4s\n"
+    "ldr q4, [x15, #0xe0]\n"
+    "fmla v31.4s, v0.4s, v5.4s\n"
+    "ldr q5, [x24, x14]\n"
+    "ldr x24, [x16, #0x110]\n"
+    "fmla v30.4s, v0.4s, v6.4s\n"
+    "fmla v29.4s, v0.4s, v9.4s\n"
+    "fmla v28.4s, v0.4s, v13.4s\n"
+    "ldr q0, [x15, #0xf0]\n"
+    "fmla v31.4s, v1.4s, v6.4s\n"
+    "ldr q6, [x23, x14]\n"
+    "ldr x23, [x16, #0x118]\n"
+    "fmla v30.4s, v1.4s, v10.4s\n"
+    "fmla v29.4s, v1.4s, v13.4s\n"
+    "fmla v28.4s, v1.4s, v5.4s\n"
+    "ldr q1, [x15, #0x100]\n"
+    "fmla v31.4s, v2.4s, v10.4s\n"
+    "ldr q10, [x21, x14]\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v5.4s\n"
+    "fmla v28.4s, v2.4s, v6.4s\n"
+    "ldr q2, [x15, #0x110]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr q11, [x20, x14]\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v29.4s, v3.4s, v6.4s\n"
+    "fmla v28.4s, v3.4s, v8.4s\n"
+    "ldr q3, [x15, #0x120]\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "ldr q12, [x19, x14]\n"
+    "fmla v30.4s, v4.4s, v14.4s\n"
+    "fmla v29.4s, v4.4s, v8.4s\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "ldr q4, [x15, #0x130]\n"
+    "add x15, x15, #0x140\n"
+    "fmla v31.4s, v0.4s, v9.4s\n"
+    "ldr q9, [x26, x14]\n"
+    "fmla v30.4s, v0.4s, v13.4s\n"
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "ldr q11, [x25, x14]\n"
+    "fmla v28.4s, v0.4s, v12.4s\n"
+    "fmla v31.4s, v1.4s, v13.4s\n"
+    "fmla v30.4s, v1.4s, v5.4s\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "ldr q12, [x24, x14]\n"
+    "fmla v28.4s, v1.4s, v9.4s\n"
+    "fmla v31.4s, v2.4s, v5.4s\n"
+    "fmla v30.4s, v2.4s, v6.4s\n"
+    "fmla v29.4s, v2.4s, v9.4s\n"
+    "ldr q9, [x23, x14]\n"
+    "add x14, x14, #0x10\n"
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "fmla v31.4s, v3.4s, v6.4s\n"
+    "fmla v30.4s, v3.4s, v8.4s\n"
+    "fmla v29.4s, v3.4s, v11.4s\n"
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "fmla v31.4s, v4.4s, v8.4s\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "fmla v28.4s, v4.4s, v9.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "str q31, [x13, x28]\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "str q30, [x12, x28]\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "str q29, [x10, x28]\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "str q28, [x9, x28]\n"
+    "3:"  // Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 60f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "mov x28, x14\n"
+    "ldr q1, [x15, #0x20]\n"
+    "add x13, x13, x28\n"
+    "ldr q2, [x15, #0x30]\n"
+    "add x12, x12, x28\n"
+    "ldr q3, [x15, #0x40]\n"
+    "add x10, x10, x28\n"
+    "ldr q4, [x15, #0x50]\n"
+    "add x9, x9, x28\n"
+    "ldr x24, [x16, #0x10]\n"
+    "ldr x23, [x16, #0x18]\n"
+    "ldr x22, [x16, #0x20]\n"
+    "add x24, x24, x14\n"
+    "ldr x21, [x16, #0x28]\n"
+    "add x23, x23, x14\n"
+    "ldr x20, [x16, #0x30]\n"
+    "add x22, x22, x14\n"
+    "ldr x19, [x16, #0x38]\n"
+    "add x21, x21, x14\n"
+    "ldr x26, [x16, #0x40]\n"
+    "add x20, x20, x14\n"
+    "ldr x25, [x16, #0x48]\n"
+    "add x19, x19, x14\n"
+    "add x26, x26, x14\n"
+    "add x25, x25, x14\n"
+    "add x15, x15, #0x60\n"
+    "tbz %x[n_channels], #1, 4f\n"
+    "ld1 { v5.d }[0], [x26], #0x8\n"
+    "ld1 { v6.d }[0], [x25], #0x8\n"
+    "ld1 { v7.d }[0], [x24], #0x8\n"
+    "ld1 { v8.d }[0], [x23], #0x8\n"
+    "ld1 { v9.d }[0], [x22], #0x8\n"
+    "ld1 { v13.d }[0], [x21], #0x8\n"
+    "ld1 { v11.d }[0], [x20], #0x8\n"
+    "ld1 { v12.d }[0], [x19], #0x8\n"
+    "ld1 { v10.d }[0], [x26], #0x8\n"
+    "ld1 { v14.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 5f\n"
+    "ld1 { v7.s }[2], [x24], #0x4\n"
+    "ld1 { v8.s }[2], [x23], #0x4\n"
+    "ld1 { v5.s }[2], [x26], #0x4\n"
+    "ld1 { v6.s }[2], [x25], #0x4\n"
+    "ld1 { v9.s }[2], [x22], #0x4\n"
+    "ld1 { v13.s }[2], [x21], #0x4\n"
+    "ld1 { v11.s }[2], [x20], #0x4\n"
+    "ld1 { v12.s }[2], [x19], #0x4\n"
+    "ld1 { v10.s }[2], [x26], #0x4\n"
+    "ld1 { v14.s }[2], [x25], #0x4\n"
+    "b 5f\n"
+    "4:"  // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+    "ld1 { v5.s }[0], [x26], #0x4\n"
+    "ld1 { v6.s }[0], [x25], #0x4\n"
+    "ld1 { v7.s }[0], [x24], #0x4\n"
+    "ld1 { v8.s }[0], [x23], #0x4\n"
+    "ld1 { v9.s }[0], [x22], #0x4\n"
+    "ld1 { v13.s }[0], [x21], #0x4\n"
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "ld1 { v12.s }[0], [x19], #0x4\n"
+    "ld1 { v10.s }[0], [x26], #0x4\n"
+    "ld1 { v14.s }[0], [x25], #0x4\n"
+    "5:"  // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+    "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+    "ldr x24, [x16, #0x50]\n"
+    "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n"
+    "add x24, x24, x14\n"
+    "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+    "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+    "fmla v31.4s, v1.4s, v6.4s\n"
+    "fmla v30.4s, v1.4s, v9.4s\n"
+    "fmla v29.4s, v1.4s, v8.4s\n"
+    "fmla v28.4s, v1.4s, v13.4s\n"
+    "fmla v31.4s, v2.4s, v9.4s\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v5.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v5.s }[2], [x24], #0x4\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load input (1, 3): Bit 1: Unset
+    "ld1 { v5.s }[0], [x24], #0x4\n"
+    "7:"  // Oddments: Load input (1, 3): Bit 1: End
+    "fmla v28.4s, v2.4s, v5.4s\n"
+    "ldr x23, [x16, #0x58]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "add x23, x23, x14\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v29.4s, v3.4s, v5.4s\n"
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v6.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v6.s }[2], [x23], #0x4\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load input (1, 4): Bit 1: Unset
+    "ld1 { v6.s }[0], [x23], #0x4\n"
+    "9:"  // Oddments: Load input (1, 4): Bit 1: End
+    "fmla v28.4s, v3.4s, v6.4s\n"
+    "ldr x22, [x16, #0x60]\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "add x22, x22, x14\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v9.d }[0], [x22], #0x8\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v9.s }[2], [x22], #0x4\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load input (0, 5): Bit 1: Unset
+    "ld1 { v9.s }[0], [x22], #0x4\n"
+    "11:"  // Oddments: Load input (0, 5): Bit 1: End
+    "fmla v30.4s, v4.4s, v9.4s\n"
+    "ldr s0, [x15, #0x18]\n"
+    "fmla v29.4s, v4.4s, v6.4s\n"
+    "ldr x21, [x16, #0x68]\n"
+    "add x21, x21, x14\n"
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "fmla v31.4s, v0.4s, v7.4s\n"
+    "fmla v30.4s, v0.4s, v8.4s\n"
+    "fmla v29.4s, v0.4s, v14.4s\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ld1 { v11.d }[0], [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v11.s }[2], [x21], #0x4\n"
+    "b 13f\n"
+    "12:"  // Oddments: Load input (2, 1): Bit 1: Unset
+    "ld1 { v11.s }[0], [x21], #0x4\n"
+    "13:"  // Oddments: Load input (2, 1): Bit 1: End
+    "fmla v28.4s, v0.4s, v11.4s\n"
+    "ldr s1, [x15, #0x1c]\n"
+    "fmla v31.4s, v1.4s, v8.4s\n"
+    "ldr x20, [x16, #0x70]\n"
+    "add x20, x20, x14\n"
+    "fmla v30.4s, v1.4s, v13.4s\n"
+    "fmla v29.4s, v1.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 14f\n"
+    "ld1 { v12.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 15f\n"
+    "ld1 { v12.s }[2], [x20], #0x4\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load input (2, 2): Bit 1: Unset
+    "ld1 { v12.s }[0], [x20], #0x4\n"
+    "15:"  // Oddments: Load input (2, 2): Bit 1: End
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "ldr s2, [x15, #0x20]\n"
+    "fmla v31.4s, v2.4s, v13.4s\n"
+    "ldr x19, [x16, #0x78]\n"
+    "add x19, x19, x14\n"
+    "fmla v30.4s, v2.4s, v5.4s\n"
+    "fmla v29.4s, v2.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v9.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 17f\n"
+    "ld1 { v9.s }[2], [x19], #0x4\n"
+    "b 17f\n"
+    "16:"  // Oddments: Load input (2, 3): Bit 1: Unset
+    "ld1 { v9.s }[0], [x19], #0x4\n"
+    "17:"  // Oddments: Load input (2, 3): Bit 1: End
+    "fmla v28.4s, v2.4s, v9.4s\n"
+    "ldr s3, [x15, #0x24]\n"
+    "fmla v31.4s, v3.4s, v5.4s\n"
+    "ldr x26, [x16, #0x80]\n"
+    "add x26, x26, x14\n"
+    "fmla v30.4s, v3.4s, v6.4s\n"
+    "fmla v29.4s, v3.4s, v9.4s\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v13.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v13.s }[2], [x26], #0x4\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load input (2, 4): Bit 1: Unset
+    "ld1 { v13.s }[0], [x26], #0x4\n"
+    "19:"  // Oddments: Load input (2, 4): Bit 1: End
+    "fmla v28.4s, v3.4s, v13.4s\n"
+    "ldr s4, [x15, #0x28]\n"
+    "fmla v31.4s, v4.4s, v6.4s\n"
+    "ldr x25, [x16, #0x88]\n"
+    "add x25, x25, x14\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v4.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 20f\n"
+    "ld1 { v8.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 21f\n"
+    "ld1 { v8.s }[2], [x25], #0x4\n"
+    "b 21f\n"
+    "20:"  // Oddments: Load input (2, 5): Bit 1: Unset
+    "ld1 { v8.s }[0], [x25], #0x4\n"
+    "21:"  // Oddments: Load input (2, 5): Bit 1: End
+    "fmla v28.4s, v4.4s, v8.4s\n"
+    "ldr s0, [x15, #0x2c]\n"
+    "fmla v31.4s, v0.4s, v14.4s\n"
+    "ldr x24, [x16, #0x90]\n"
+    "add x24, x24, x14\n"
+    "fmla v30.4s, v0.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "ld1 { v5.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "ld1 { v5.s }[2], [x24], #0x4\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load input (3, 0): Bit 1: Unset
+    "ld1 { v5.s }[0], [x24], #0x4\n"
+    "23:"  // Oddments: Load input (3, 0): Bit 1: End
+    "fmla v29.4s, v0.4s, v5.4s\n"
+    "ldr x23, [x16, #0x98]\n"
+    "add x23, x23, x14\n"
+    "tbz %x[n_channels], #1, 24f\n"
+    "ld1 { v6.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 25f\n"
+    "ld1 { v6.s }[2], [x23], #0x4\n"
+    "b 25f\n"
+    "24:"  // Oddments: Load input (3, 1): Bit 1: Unset
+    "ld1 { v6.s }[0], [x23], #0x4\n"
+    "25:"  // Oddments: Load input (3, 1): Bit 1: End
+    "fmla v28.4s, v0.4s, v6.4s\n"
+    "ldr s1, [x15, #0x30]\n"
+    "fmla v31.4s, v1.4s, v11.4s\n"
+    "ldr x22, [x16, #0xa0]\n"
+    "add x22, x22, x14\n"
+    "fmla v30.4s, v1.4s, v12.4s\n"
+    "fmla v29.4s, v1.4s, v6.4s\n"
+    "tbz %x[n_channels], #1, 26f\n"
+    "ld1 { v10.d }[0], [x22], #0x8\n"
+    "tbz %x[n_channels], #0, 27f\n"
+    "ld1 { v10.s }[2], [x22], #0x4\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load input (3, 2): Bit 1: Unset
+    "ld1 { v10.s }[0], [x22], #0x4\n"
+    "27:"  // Oddments: Load input (3, 2): Bit 1: End
+    "fmla v28.4s, v1.4s, v10.4s\n"
+    "ldr s2, [x15, #0x34]\n"
+    "fmla v31.4s, v2.4s, v12.4s\n"
+    "ldr x21, [x16, #0xa8]\n"
+    "add x21, x21, x14\n"
+    "fmla v30.4s, v2.4s, v9.4s\n"
+    "fmla v29.4s, v2.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 28f\n"
+    "ld1 { v11.d }[0], [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 29f\n"
+    "ld1 { v11.s }[2], [x21], #0x4\n"
+    "b 29f\n"
+    "28:"  // Oddments: Load input (3, 3): Bit 1: Unset
+    "ld1 { v11.s }[0], [x21], #0x4\n"
+    "29:"  // Oddments: Load input (3, 3): Bit 1: End
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr s3, [x15, #0x38]\n"
+    "fmla v31.4s, v3.4s, v9.4s\n"
+    "ldr x20, [x16, #0xb0]\n"
+    "add x20, x20, x14\n"
+    "fmla v30.4s, v3.4s, v13.4s\n"
+    "fmla v29.4s, v3.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 30f\n"
+    "ld1 { v12.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 31f\n"
+    "ld1 { v12.s }[2], [x20], #0x4\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load input (3, 4): Bit 1: Unset
+    "ld1 { v12.s }[0], [x20], #0x4\n"
+    "31:"  // Oddments: Load input (3, 4): Bit 1: End
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "ldr s4, [x15, #0x3c]\n"
+    "fmla v31.4s, v4.4s, v13.4s\n"
+    "ldr x19, [x16, #0xb8]\n"
+    "add x19, x19, x14\n"
+    "fmla v30.4s, v4.4s, v8.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 32f\n"
+    "ld1 { v14.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 33f\n"
+    "ld1 { v14.s }[2], [x19], #0x4\n"
+    "b 33f\n"
+    "32:"  // Oddments: Load input (3, 5): Bit 1: Unset
+    "ld1 { v14.s }[0], [x19], #0x4\n"
+    "33:"  // Oddments: Load input (3, 5): Bit 1: End
+    "fmla v28.4s, v4.4s, v14.4s\n"
+    "ldr s0, [x15, #0x40]\n"
+    "fmla v31.4s, v0.4s, v5.4s\n"
+    "ldr x26, [x16, #0xc0]\n"
+    "add x26, x26, x14\n"
+    "fmla v30.4s, v0.4s, v6.4s\n"
+    "tbz %x[n_channels], #1, 34f\n"
+    "ld1 { v9.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 35f\n"
+    "ld1 { v9.s }[2], [x26], #0x4\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load input (4, 0): Bit 1: Unset
+    "ld1 { v9.s }[0], [x26], #0x4\n"
+    "35:"  // Oddments: Load input (4, 0): Bit 1: End
+    "fmla v29.4s, v0.4s, v9.4s\n"
+    "ldr x25, [x16, #0xc8]\n"
+    "add x25, x25, x14\n"
+    "tbz %x[n_channels], #1, 36f\n"
+    "ld1 { v13.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 37f\n"
+    "ld1 { v13.s }[2], [x25], #0x4\n"
+    "b 37f\n"
+    "36:"  // Oddments: Load input (4, 1): Bit 1: Unset
+    "ld1 { v13.s }[0], [x25], #0x4\n"
+    "37:"  // Oddments: Load input (4, 1): Bit 1: End
+    "fmla v28.4s, v0.4s, v13.4s\n"
+    "ldr s1, [x15, #0x44]\n"
+    "fmla v31.4s, v1.4s, v6.4s\n"
+    "ldr x24, [x16, #0xd0]\n"
+    "add x24, x24, x14\n"
+    "fmla v30.4s, v1.4s, v10.4s\n"
+    "fmla v29.4s, v1.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 38f\n"
+    "ld1 { v5.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 39f\n"
+    "ld1 { v5.s }[2], [x24], #0x4\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load input (4, 2): Bit 1: Unset
+    "ld1 { v5.s }[0], [x24], #0x4\n"
+    "39:"  // Oddments: Load input (4, 2): Bit 1: End
+    "fmla v28.4s, v1.4s, v5.4s\n"
+    "ldr s2, [x15, #0x48]\n"
+    "fmla v31.4s, v2.4s, v10.4s\n"
+    "ldr x23, [x16, #0xd8]\n"
+    "add x23, x23, x14\n"
+    "fmla v30.4s, v2.4s, v11.4s\n"
+    "fmla v29.4s, v2.4s, v5.4s\n"
+    "tbz %x[n_channels], #1, 40f\n"
+    "ld1 { v6.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 41f\n"
+    "ld1 { v6.s }[2], [x23], #0x4\n"
+    "b 41f\n"
+    "40:"  // Oddments: Load input (4, 3): Bit 1: Unset
+    "ld1 { v6.s }[0], [x23], #0x4\n"
+    "41:"  // Oddments: Load input (4, 3): Bit 1: End
+    "fmla v28.4s, v2.4s, v6.4s\n"
+    "ldr s3, [x15, #0x4c]\n"
+    "fmla v31.4s, v3.4s, v11.4s\n"
+    "ldr x22, [x16, #0xe0]\n"
+    "add x22, x22, x14\n"
+    "fmla v30.4s, v3.4s, v12.4s\n"
+    "fmla v29.4s, v3.4s, v6.4s\n"
+    "tbz %x[n_channels], #1, 42f\n"
+    "ld1 { v8.d }[0], [x22], #0x8\n"
+    "tbz %x[n_channels], #0, 43f\n"
+    "ld1 { v8.s }[2], [x22], #0x4\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load input (4, 4): Bit 1: Unset
+    "ld1 { v8.s }[0], [x22], #0x4\n"
+    "43:"  // Oddments: Load input (4, 4): Bit 1: End
+    "fmla v28.4s, v3.4s, v8.4s\n"
+    "ldr s4, [x15, #0x50]\n"
+    "fmla v31.4s, v4.4s, v12.4s\n"
+    "ldr x21, [x16, #0xe8]\n"
+    "add x21, x21, x14\n"
+    "fmla v30.4s, v4.4s, v14.4s\n"
+    "fmla v29.4s, v4.4s, v8.4s\n"
+    "tbz %x[n_channels], #1, 44f\n"
+    "ld1 { v10.d }[0], [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 45f\n"
+    "ld1 { v10.s }[2], [x21], #0x4\n"
+    "b 45f\n"
+    "44:"  // Oddments: Load input (4, 5): Bit 1: Unset
+    "ld1 { v10.s }[0], [x21], #0x4\n"
+    "45:"  // Oddments: Load input (4, 5): Bit 1: End
+    "fmla v28.4s, v4.4s, v10.4s\n"
+    "ldr s0, [x15, #0x54]\n"
+    "fmla v31.4s, v0.4s, v9.4s\n"
+    "ldr x20, [x16, #0xf0]\n"
+    "add x20, x20, x14\n"
+    "fmla v30.4s, v0.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 46f\n"
+    "ld1 { v11.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 47f\n"
+    "ld1 { v11.s }[2], [x20], #0x4\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load input (5, 0): Bit 1: Unset
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "47:"  // Oddments: Load input (5, 0): Bit 1: End
+    "fmla v29.4s, v0.4s, v11.4s\n"
+    "ldr x19, [x16, #0xf8]\n"
+    "add x19, x19, x14\n"
+    "tbz %x[n_channels], #1, 48f\n"
+    "ld1 { v12.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 49f\n"
+    "ld1 { v12.s }[2], [x19], #0x4\n"
+    "b 49f\n"
+    "48:"  // Oddments: Load input (5, 1): Bit 1: Unset
+    "ld1 { v12.s }[0], [x19], #0x4\n"
+    "49:"  // Oddments: Load input (5, 1): Bit 1: End
+    "fmla v28.4s, v0.4s, v12.4s\n"
+    "ldr s1, [x15, #0x58]\n"
+    "fmla v31.4s, v1.4s, v13.4s\n"
+    "ldr x26, [x16, #0x100]\n"
+    "add x26, x26, x14\n"
+    "fmla v30.4s, v1.4s, v5.4s\n"
+    "fmla v29.4s, v1.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 50f\n"
+    "ld1 { v9.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #0, 51f\n"
+    "ld1 { v9.s }[2], [x26], #0x4\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load input (5, 2): Bit 1: Unset
+    "ld1 { v9.s }[0], [x26], #0x4\n"
+    "51:"  // Oddments: Load input (5, 2): Bit 1: End
+    "fmla v28.4s, v1.4s, v9.4s\n"
+    "ldr s2, [x15, #0x5c]\n"
+    "fmla v31.4s, v2.4s, v5.4s\n"
+    "ldr x25, [x16, #0x108]\n"
+    "add x25, x25, x14\n"
+    "fmla v30.4s, v2.4s, v6.4s\n"
+    "fmla v29.4s, v2.4s, v9.4s\n"
+    "tbz %x[n_channels], #1, 52f\n"
+    "ld1 { v11.d }[0], [x25], #0x8\n"
+    "tbz %x[n_channels], #0, 53f\n"
+    "ld1 { v11.s }[2], [x25], #0x4\n"
+    "b 53f\n"
+    "52:"  // Oddments: Load input (5, 3): Bit 1: Unset
+    "ld1 { v11.s }[0], [x25], #0x4\n"
+    "53:"  // Oddments: Load input (5, 3): Bit 1: End
+    "fmla v28.4s, v2.4s, v11.4s\n"
+    "ldr s3, [x15, #0x60]\n"
+    "fmla v31.4s, v3.4s, v6.4s\n"
+    "ldr x24, [x16, #0x110]\n"
+    "add x24, x24, x14\n"
+    "fmla v30.4s, v3.4s, v8.4s\n"
+    "fmla v29.4s, v3.4s, v11.4s\n"
+    "tbz %x[n_channels], #1, 54f\n"
+    "ld1 { v12.d }[0], [x24], #0x8\n"
+    "tbz %x[n_channels], #0, 55f\n"
+    "ld1 { v12.s }[2], [x24], #0x4\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load input (5, 4): Bit 1: Unset
+    "ld1 { v12.s }[0], [x24], #0x4\n"
+    "55:"  // Oddments: Load input (5, 4): Bit 1: End
+    "fmla v28.4s, v3.4s, v12.4s\n"
+    "ldr s4, [x15, #0x64]\n"
+    "fmla v31.4s, v4.4s, v8.4s\n"
+    "ldr x23, [x16, #0x118]\n"
+    "add x23, x23, x14\n"
+    "fmla v30.4s, v4.4s, v10.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 56f\n"
+    "ld1 { v9.d }[0], [x23], #0x8\n"
+    "tbz %x[n_channels], #0, 57f\n"
+    "ld1 { v9.s }[2], [x23], #0x4\n"
+    "b 57f\n"
+    "56:"  // Oddments: Load input (5, 5): Bit 1: Unset
+    "ld1 { v9.s }[0], [x23], #0x4\n"
+    "57:"  // Oddments: Load input (5, 5): Bit 1: End
+    "fmla v28.4s, v4.4s, v9.4s\n"
+    "fmax v31.4s, v31.4s, v18.4s\n"
+    "fmax v30.4s, v30.4s, v18.4s\n"
+    "fmax v29.4s, v29.4s, v18.4s\n"
+    "fmin v31.4s, v31.4s, v17.4s\n"
+    "fmin v30.4s, v30.4s, v17.4s\n"
+    "fmin v29.4s, v29.4s, v17.4s\n"
+    "fmax v28.4s, v28.4s, v18.4s\n"
+    "fmin v28.4s, v28.4s, v17.4s\n"
+    "tbz %x[n_channels], #1, 58f\n"
+    "st1 { v31.d }[0], [x13], #0x8\n"
+    "st1 { v30.d }[0], [x12], #0x8\n"
+    "st1 { v29.d }[0], [x10], #0x8\n"
+    "st1 { v28.d }[0], [x9], #0x8\n"
+    "tbz %x[n_channels], #0, 59f\n"
+    "st1 { v31.s }[2], [x13], #0x4\n"
+    "st1 { v30.s }[2], [x12], #0x4\n"
+    "st1 { v29.s }[2], [x10], #0x4\n"
+    "st1 { v28.s }[2], [x9], #0x4\n"
+    "b 59f\n"
+    "58:"  // Oddments: Store: Bit 1: Unset
+    "st1 { v31.s }[0], [x13], #0x4\n"
+    "st1 { v30.s }[0], [x12], #0x4\n"
+    "st1 { v29.s }[0], [x10], #0x4\n"
+    "st1 { v28.s }[0], [x9], #0x4\n"
+    "59:"  // Oddments: Store: Bit 1: End
+
+    "60:"  // End
+
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000..0f6cecd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+
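+// Strategy descriptor consumed by the depthwise depthfirst driver templates:
+// it names the kernel entry point, the scalar types involved, the
+// vector-length category (None, i.e. fixed 128-bit NEON) and the nine output
+// points the kernel produces per call.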
+struct a64_fp32_nhwc_generic_output9_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int n_output_points = 9;
+
+  kern_type kernel = a64_fp32_nhwc_generic_output9_mla_depthfirst_impl;
+
+  a64_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..e8e817e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
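+// Generic (any kernel size) depthwise routine: params is a stream of one
+// weight vector per kernel point, and each of the n_points iterations
+// multiply-accumulates that weight across nine output points at once, four
+// channels per pass.  A rough scalar equivalent, for orientation only (bias
+// may be null, in which case the accumulators start at zero):
+//
+//   for (unsigned int c = 0; c < n_channels; c++)
+//     for (int o = 0; o < 9; o++)
+//     {
+//       float acc = bias ? bias[c] : 0.0f;
+//       for (unsigned int p = 0; p < n_points; p++)
+//         acc += inptrs[p*9 + o][c] * weights[p][c];
+//       outptrs[o][c] = std::min(std::max(acc, activation_min), activation_max);
+//     }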
+void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
+  const float *const *const inptrs,
+  float *const *const outptrs,
+  const void *params,
+  const void *bias,
+  const unsigned int n_points,
+  const unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  const float minmax_vals[2] = { activation_min, activation_max };
+
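+  // v4 and v3 hold the broadcast clamp bounds for the whole routine; the
+  // main loop covers n_channels / 4 full vectors and the "Oddments" path
+  // picks up the remaining one to three channels lane by lane.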
+  __asm__ __volatile__(
+    "ld1r { v4.4s }, [%x[minmax_vals]]\n"
+    "add x19, %x[minmax_vals], #0x4\n"
+    "mov x11, #0x0\n"
+    "ld1r { v3.4s }, [x19]\n"
+    "lsr x10, %x[n_channels], #0x2\n"
+    "cbz x10, 5f\n"
+    "1:"  // Channel loop
+    "movi v25.16b, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "ldr q25, [%x[bias], x11]\n"
+    "2:"  // Channel loop: Load bias: Done
+    "mov v24.16b, v25.16b\n"
+    "ldr q23, [%x[params], #0x0]\n"
+    "mov x20, %x[inptrs]\n"
+    "mov v22.16b, v25.16b\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "subs x19, %x[n_points], #0x1\n"
+    "mov v21.16b, v25.16b\n"
+    "ldr q2, [x9, x11]\n"
+    "mov v20.16b, v25.16b\n"
+    "add %x[params], %x[params], #0x10\n"
+    "mov v19.16b, v25.16b\n"
+    "ldr q1, [x28, x11]\n"
+    "mov v18.16b, v25.16b\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "mov v17.16b, v25.16b\n"
+    "ldr q0, [x27, x11]\n"
+    "mov v16.16b, v25.16b\n"
+    "ldr q31, [x26, x11]\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "ldr q30, [x25, x11]\n"
+    "ldr q29, [x24, x11]\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "ldr q28, [x23, x11]\n"
+    "ldr q27, [x22, x11]\n"
+    "ldr x21, [x20], #0x8\n"
+    "ldr q26, [x21, x11]\n"
+    "ble 4f\n"
+    "3:"  // Channel loop: Planar loop
+    "fmla v25.4s, v2.4s, v23.4s\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "subs x19, x19, #0x1\n"
+    "fmla v24.4s, v1.4s, v23.4s\n"
+    "ldr q2, [x9, x11]\n"
+    "fmla v22.4s, v0.4s, v23.4s\n"
+    "fmla v21.4s, v31.4s, v23.4s\n"
+    "ldr q1, [x28, x11]\n"
+    "fmla v20.4s, v30.4s, v23.4s\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "fmla v19.4s, v29.4s, v23.4s\n"
+    "fmla v18.4s, v28.4s, v23.4s\n"
+    "ldr q0, [x27, x11]\n"
+    "fmla v17.4s, v27.4s, v23.4s\n"
+    "fmla v16.4s, v26.4s, v23.4s\n"
+    "ldr q23, [%x[params], #0x0]\n"
+    "add %x[params], %x[params], #0x10\n"
+    "ldr q31, [x26, x11]\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "ldr q30, [x25, x11]\n"
+    "ldr q29, [x24, x11]\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "ldr q28, [x23, x11]\n"
+    "ldr q27, [x22, x11]\n"
+    "ldr x21, [x20], #0x8\n"
+    "ldr q26, [x21, x11]\n"
+    "bgt 3b\n"
+    "4:"  // Channel loop: Planar tail
+    "fmla v25.4s, v2.4s, v23.4s\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    "fmla v24.4s, v1.4s, v23.4s\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "fmla v22.4s, v0.4s, v23.4s\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    "fmla v21.4s, v31.4s, v23.4s\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    "fmla v20.4s, v30.4s, v23.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "fmla v19.4s, v29.4s, v23.4s\n"
+    "fmla v18.4s, v28.4s, v23.4s\n"
+    "fmla v17.4s, v27.4s, v23.4s\n"
+    "fmla v16.4s, v26.4s, v23.4s\n"
+    "fmax v25.4s, v25.4s, v4.4s\n"
+    "fmax v24.4s, v24.4s, v4.4s\n"
+    "fmax v22.4s, v22.4s, v4.4s\n"
+    "fmin v25.4s, v25.4s, v3.4s\n"
+    "str q25, [x27, x11]\n"
+    "fmin v24.4s, v24.4s, v3.4s\n"
+    "fmin v22.4s, v22.4s, v3.4s\n"
+    "str q24, [x26, x11]\n"
+    "fmax v21.4s, v21.4s, v4.4s\n"
+    "fmax v20.4s, v20.4s, v4.4s\n"
+    "str q22, [x25, x11]\n"
+    "fmax v19.4s, v19.4s, v4.4s\n"
+    "fmax v18.4s, v18.4s, v4.4s\n"
+    "fmin v21.4s, v21.4s, v3.4s\n"
+    "str q21, [x24, x11]\n"
+    "fmin v20.4s, v20.4s, v3.4s\n"
+    "fmin v19.4s, v19.4s, v3.4s\n"
+    "str q20, [x23, x11]\n"
+    "fmin v18.4s, v18.4s, v3.4s\n"
+    "fmax v17.4s, v17.4s, v4.4s\n"
+    "str q19, [x22, x11]\n"
+    "fmax v16.4s, v16.4s, v4.4s\n"
+    "str q18, [x21, x11]\n"
+    "fmin v17.4s, v17.4s, v3.4s\n"
+    "fmin v16.4s, v16.4s, v3.4s\n"
+    "str q17, [x20, x11]\n"
+    "str q16, [x19, x11]\n"
+    "add x11, x11, #0x10\n"
+    "cmp x11, x10, LSL #4\n"
+    "blt 1b\n"
+    "5:"  // Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 17f\n"
+    "movi v25.16b, #0x0\n"
+    "cbz %x[bias], 8f\n"
+    "add x19, %x[bias], x11\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v25.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v25.s }[2], [x19], #0x4\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load bias: Bit 1: Unset
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v25.s }[0], [x19], #0x4\n"
+    "7:"  // Oddments: Load bias: Bit 1: End
+
+    "8:"  // Oddments: Load bias: Done
+    "mov v24.16b, v25.16b\n"
+    "ldr q23, [%x[params], #0x0]\n"
+    "mov x20, %x[inptrs]\n"
+    "mov v22.16b, v25.16b\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "add %x[params], %x[params], #0x10\n"
+    "mov v21.16b, v25.16b\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "mov v20.16b, v25.16b\n"
+    "add x9, x9, x11\n"
+    "mov v19.16b, v25.16b\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "mov v18.16b, v25.16b\n"
+    "add x28, x28, x11\n"
+    "mov v17.16b, v25.16b\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "mov v16.16b, v25.16b\n"
+    "add x27, x27, x11\n"
+    "ldr x21, [x20], #0x8\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr d2, [x9], #0x8\n"
+    "ldr d1, [x28], #0x8\n"
+    "ldr d0, [x27], #0x8\n"
+    "ldr d31, [x26], #0x8\n"
+    "ldr d30, [x25], #0x8\n"
+    "ldr d29, [x24], #0x8\n"
+    "ldr d28, [x23], #0x8\n"
+    "ldr d27, [x22], #0x8\n"
+    "ldr d26, [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v2.s }[2], [x9], #0x4\n"
+    "ld1 { v1.s }[2], [x28], #0x4\n"
+    "ld1 { v0.s }[2], [x27], #0x4\n"
+    "ld1 { v31.s }[2], [x26], #0x4\n"
+    "ld1 { v30.s }[2], [x25], #0x4\n"
+    "ld1 { v29.s }[2], [x24], #0x4\n"
+    "ld1 { v28.s }[2], [x23], #0x4\n"
+    "ld1 { v27.s }[2], [x22], #0x4\n"
+    "ld1 { v26.s }[2], [x21], #0x4\n"
+    "b 10f\n"
+    "9:"  // Oddments: Load: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ldr s2, [x9], #0x4\n"
+    "ldr s1, [x28], #0x4\n"
+    "ldr s0, [x27], #0x4\n"
+    "ldr s31, [x26], #0x4\n"
+    "ldr s30, [x25], #0x4\n"
+    "ldr s29, [x24], #0x4\n"
+    "ldr s28, [x23], #0x4\n"
+    "ldr s27, [x22], #0x4\n"
+    "ldr s26, [x21], #0x4\n"
+    "10:"  // Oddments: Load: Bit 1: End
+    "subs x19, %x[n_points], #0x1\n"
+    "ble 14f\n"
+    "11:"  // Oddments: Planar loop
+    "fmla v25.4s, v2.4s, v23.4s\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "add x9, x9, x11\n"
+    "fmla v24.4s, v1.4s, v23.4s\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "fmla v22.4s, v0.4s, v23.4s\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "fmla v21.4s, v31.4s, v23.4s\n"
+    "add x28, x28, x11\n"
+    "fmla v20.4s, v30.4s, v23.4s\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "fmla v19.4s, v29.4s, v23.4s\n"
+    "add x27, x27, x11\n"
+    "fmla v18.4s, v28.4s, v23.4s\n"
+    "ldr x21, [x20], #0x8\n"
+    "fmla v17.4s, v27.4s, v23.4s\n"
+    "add x26, x26, x11\n"
+    "fmla v16.4s, v26.4s, v23.4s\n"
+    "ldr q23, [%x[params], #0x0]\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "add %x[params], %x[params], #0x10\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ldr d2, [x9], #0x8\n"
+    "ldr d1, [x28], #0x8\n"
+    "ldr d0, [x27], #0x8\n"
+    "ldr d31, [x26], #0x8\n"
+    "ldr d30, [x25], #0x8\n"
+    "ldr d29, [x24], #0x8\n"
+    "ldr d28, [x23], #0x8\n"
+    "ldr d27, [x22], #0x8\n"
+    "ldr d26, [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v2.s }[2], [x9], #0x4\n"
+    "ld1 { v1.s }[2], [x28], #0x4\n"
+    "ld1 { v0.s }[2], [x27], #0x4\n"
+    "ld1 { v31.s }[2], [x26], #0x4\n"
+    "ld1 { v30.s }[2], [x25], #0x4\n"
+    "ld1 { v29.s }[2], [x24], #0x4\n"
+    "ld1 { v28.s }[2], [x23], #0x4\n"
+    "ld1 { v27.s }[2], [x22], #0x4\n"
+    "ld1 { v26.s }[2], [x21], #0x4\n"
+    "b 13f\n"
+    "12:"  // Oddments: Planar loop: Load: Bit 1: Unset
+    "tbz %x[n_channels], #0, 13f\n"
+    "ldr s2, [x9], #0x4\n"
+    "ldr s1, [x28], #0x4\n"
+    "ldr s0, [x27], #0x4\n"
+    "ldr s31, [x26], #0x4\n"
+    "ldr s30, [x25], #0x4\n"
+    "ldr s29, [x24], #0x4\n"
+    "ldr s28, [x23], #0x4\n"
+    "ldr s27, [x22], #0x4\n"
+    "ldr s26, [x21], #0x4\n"
+    "13:"  // Oddments: Planar loop: Load: Bit 1: End
+    "subs x19, x19, #0x1\n"
+    "bgt 11b\n"
+    "14:"  // Oddments: Planar tail
+    "fmla v25.4s, v2.4s, v23.4s\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    "add x27, x27, x11\n"
+    "fmla v24.4s, v1.4s, v23.4s\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "fmla v22.4s, v0.4s, v23.4s\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    "add x26, x26, x11\n"
+    "fmla v21.4s, v31.4s, v23.4s\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    "fmla v20.4s, v30.4s, v23.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x25, x25, x11\n"
+    "fmla v19.4s, v29.4s, v23.4s\n"
+    "add x24, x24, x11\n"
+    "fmla v18.4s, v28.4s, v23.4s\n"
+    "add x23, x23, x11\n"
+    "fmla v17.4s, v27.4s, v23.4s\n"
+    "add x22, x22, x11\n"
+    "fmla v16.4s, v26.4s, v23.4s\n"
+    "add x21, x21, x11\n"
+    "fmax v25.4s, v25.4s, v4.4s\n"
+    "add x20, x20, x11\n"
+    "fmax v24.4s, v24.4s, v4.4s\n"
+    "add x19, x19, x11\n"
+    "fmax v22.4s, v22.4s, v4.4s\n"
+    "fmin v25.4s, v25.4s, v3.4s\n"
+    "fmin v24.4s, v24.4s, v3.4s\n"
+    "fmin v22.4s, v22.4s, v3.4s\n"
+    "fmax v21.4s, v21.4s, v4.4s\n"
+    "fmax v20.4s, v20.4s, v4.4s\n"
+    "fmax v19.4s, v19.4s, v4.4s\n"
+    "fmin v21.4s, v21.4s, v3.4s\n"
+    "fmin v20.4s, v20.4s, v3.4s\n"
+    "fmin v19.4s, v19.4s, v3.4s\n"
+    "fmax v18.4s, v18.4s, v4.4s\n"
+    "fmax v17.4s, v17.4s, v4.4s\n"
+    "fmax v16.4s, v16.4s, v4.4s\n"
+    "fmin v18.4s, v18.4s, v3.4s\n"
+    "fmin v17.4s, v17.4s, v3.4s\n"
+    "fmin v16.4s, v16.4s, v3.4s\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "st1 { v25.d }[0], [x27], #0x8\n"
+    "st1 { v24.d }[0], [x26], #0x8\n"
+    "st1 { v22.d }[0], [x25], #0x8\n"
+    "st1 { v21.d }[0], [x24], #0x8\n"
+    "st1 { v20.d }[0], [x23], #0x8\n"
+    "st1 { v19.d }[0], [x22], #0x8\n"
+    "st1 { v18.d }[0], [x21], #0x8\n"
+    "st1 { v17.d }[0], [x20], #0x8\n"
+    "st1 { v16.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "st1 { v25.s }[2], [x27], #0x4\n"
+    "st1 { v24.s }[2], [x26], #0x4\n"
+    "st1 { v22.s }[2], [x25], #0x4\n"
+    "st1 { v21.s }[2], [x24], #0x4\n"
+    "st1 { v20.s }[2], [x23], #0x4\n"
+    "st1 { v19.s }[2], [x22], #0x4\n"
+    "st1 { v18.s }[2], [x21], #0x4\n"
+    "st1 { v17.s }[2], [x20], #0x4\n"
+    "st1 { v16.s }[2], [x19], #0x4\n"
+    "b 16f\n"
+    "15:"  // Oddments: Store: Bit 1: Unset
+    "tbz %x[n_channels], #0, 16f\n"
+    "st1 { v25.s }[0], [x27], #0x4\n"
+    "st1 { v24.s }[0], [x26], #0x4\n"
+    "st1 { v22.s }[0], [x25], #0x4\n"
+    "st1 { v21.s }[0], [x24], #0x4\n"
+    "st1 { v20.s }[0], [x23], #0x4\n"
+    "st1 { v19.s }[0], [x22], #0x4\n"
+    "st1 { v18.s }[0], [x21], #0x4\n"
+    "st1 { v17.s }[0], [x20], #0x4\n"
+    "st1 { v16.s }[0], [x19], #0x4\n"
+    "16:"  // Oddments: Store: Bit 1: End
+
+    "17:"  // End
+
+    : [params] "+&r" (params)
+    : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000..60f5ddd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 3;
+  constexpr static unsigned int output_cols = 3;
+
+  constexpr static unsigned int input_rows = 7;
+  constexpr static unsigned int input_cols = 7;
+  constexpr static unsigned int input_col_quads = 2;
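+
+  // Sanity check: the input tile must cover the receptive field of the
+  // output tile, i.e. input = (output - 1) * stride + kernel in each
+  // dimension ((3 - 1) * 2 + 3 = 7 here).
+  static_assert((output_rows - 1) * stride_rows + kernel_rows == input_rows,
+                "input rows must cover the output tile's receptive field");
+  static_assert((output_cols - 1) * stride_cols + kernel_cols == input_cols,
+                "input cols must cover the output tile's receptive field");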
+
+  kern_type kernel = a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+
+  a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..5e334ec
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(
+  const float *const *const inptrs,
+  float *const *const outptrs,
+  const void *params,
+  const unsigned int n_output_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  const float minmax_vals[2] = { activation_min, activation_max };
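+  // The activation bounds are packed as { min, max } so that the assembly
+  // can broadcast them with two ld1r loads: the minimum from [clamps] and
+  // the maximum from [clamps + 4].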
+
+  __asm__ __volatile__(
+    "ldp x14, x13, [%x[outptrs], #0x0]\n"
+    "add x12, %x[clamps], #0x4\n"
+    "ldp x11, x10, [%x[outptrs], #0x10]\n"
+    "mov x9, #0x0\n"
+    "ldp x28, x27, [%x[outptrs], #0x20]\n"
+    "mov x26, #0x0\n"
+    "ldp x25, x24, [%x[outptrs], #0x30]\n"
+    "lsr x23, %x[channel_multiplier], #0x2\n"
+    "ldr x22, [%x[outptrs], #0x40]\n"
+    "ldr x21, [%x[inptrs], #0x0]\n"
+    "ldr x20, [%x[inptrs], #0x8]\n"
+    "ldr x19, [%x[inptrs], #0x10]\n"
+    "ldr q0, [x21, #0x0]\n"
+    "ldr q1, [x21, #0x10]\n"
+    "ldr q2, [x20, #0x0]\n"
+    "ldr q3, [x20, #0x10]\n"
+    "ldr q4, [x19, #0x0]\n"
+    "ldr q5, [x19, #0x10]\n"
+    "ldr x21, [%x[inptrs], #0x18]\n"
+    "ldr x20, [%x[inptrs], #0x20]\n"
+    "ldr x19, [%x[inptrs], #0x28]\n"
+    "ldr q6, [x21, #0x0]\n"
+    "ldr q7, [x21, #0x10]\n"
+    "ldr q8, [x20, #0x0]\n"
+    "ldr q9, [x20, #0x10]\n"
+    "ldr q10, [x19, #0x0]\n"
+    "ldr q11, [x19, #0x10]\n"
+    "ldr x19, [%x[inptrs], #0x30]\n"
+    "ld1r { v24.4s }, [%x[clamps]]\n"
+    "ld1r { v23.4s }, [x12]\n"
+    "ldr q12, [x19, #0x0]\n"
+    "ldr q13, [x19, #0x10]\n"
+    "cbz x23, 3f\n"
+    "ldr q14, [%x[params], #0x0]\n"
+    "mov v15.16b, v14.16b\n"
+    "ldr q31, [%x[params], #0x10]\n"
+    "subs x23, x23, #0x1\n"
+    "mov v16.16b, v14.16b\n"
+    "ldr q30, [%x[params], #0x20]\n"
+    "mov v17.16b, v14.16b\n"
+    "ldr q29, [%x[params], #0x30]\n"
+    "add %x[params], %x[params], #0x40\n"
+    "mov v18.16b, v14.16b\n"
+    "mov v19.16b, v14.16b\n"
+    "mov v20.16b, v14.16b\n"
+    "mov v21.16b, v14.16b\n"
+    "mov v22.16b, v14.16b\n"
+    "beq 2f\n"
+    "1:"  // Output channel complete vector loop
+    "fmla v14.4s, v31.4s, v0.s[0]\n"
+    "add x9, x9, #0x4\n"
+    "fmla v15.4s, v31.4s, v0.s[2]\n"
+    "subs x23, x23, #0x1\n"
+    "fmla v16.4s, v31.4s, v1.s[0]\n"
+    "fmla v17.4s, v31.4s, v4.s[0]\n"
+    "fmla v18.4s, v31.4s, v4.s[2]\n"
+    "fmla v19.4s, v31.4s, v5.s[0]\n"
+    "fmla v20.4s, v31.4s, v8.s[0]\n"
+    "fmla v21.4s, v31.4s, v8.s[2]\n"
+    "fmla v22.4s, v31.4s, v9.s[0]\n"
+    "ldr q31, [%x[params], #0x0]\n"
+    "fmla v14.4s, v30.4s, v0.s[1]\n"
+    "fmla v15.4s, v30.4s, v0.s[3]\n"
+    "fmla v16.4s, v30.4s, v1.s[1]\n"
+    "fmla v17.4s, v30.4s, v4.s[1]\n"
+    "fmla v18.4s, v30.4s, v4.s[3]\n"
+    "fmla v19.4s, v30.4s, v5.s[1]\n"
+    "fmla v20.4s, v30.4s, v8.s[1]\n"
+    "fmla v21.4s, v30.4s, v8.s[3]\n"
+    "fmla v22.4s, v30.4s, v9.s[1]\n"
+    "ldr q30, [%x[params], #0x10]\n"
+    "fmla v14.4s, v29.4s, v0.s[2]\n"
+    "fmla v15.4s, v29.4s, v1.s[0]\n"
+    "fmla v16.4s, v29.4s, v1.s[2]\n"
+    "fmla v17.4s, v29.4s, v4.s[2]\n"
+    "fmla v18.4s, v29.4s, v5.s[0]\n"
+    "fmla v19.4s, v29.4s, v5.s[2]\n"
+    "fmla v20.4s, v29.4s, v8.s[2]\n"
+    "fmla v21.4s, v29.4s, v9.s[0]\n"
+    "fmla v22.4s, v29.4s, v9.s[2]\n"
+    "ldr q29, [%x[params], #0x20]\n"
+    "fmla v14.4s, v31.4s, v2.s[0]\n"
+    "fmla v15.4s, v31.4s, v2.s[2]\n"
+    "fmla v16.4s, v31.4s, v3.s[0]\n"
+    "fmla v17.4s, v31.4s, v6.s[0]\n"
+    "fmla v18.4s, v31.4s, v6.s[2]\n"
+    "fmla v19.4s, v31.4s, v7.s[0]\n"
+    "fmla v20.4s, v31.4s, v10.s[0]\n"
+    "fmla v21.4s, v31.4s, v10.s[2]\n"
+    "fmla v22.4s, v31.4s, v11.s[0]\n"
+    "ldr q31, [%x[params], #0x30]\n"
+    "fmla v14.4s, v30.4s, v2.s[1]\n"
+    "fmla v15.4s, v30.4s, v2.s[3]\n"
+    "fmla v16.4s, v30.4s, v3.s[1]\n"
+    "fmla v17.4s, v30.4s, v6.s[1]\n"
+    "fmla v18.4s, v30.4s, v6.s[3]\n"
+    "fmla v19.4s, v30.4s, v7.s[1]\n"
+    "fmla v20.4s, v30.4s, v10.s[1]\n"
+    "fmla v21.4s, v30.4s, v10.s[3]\n"
+    "fmla v22.4s, v30.4s, v11.s[1]\n"
+    "ldr q30, [%x[params], #0x40]\n"
+    "fmla v14.4s, v29.4s, v2.s[2]\n"
+    "fmla v15.4s, v29.4s, v3.s[0]\n"
+    "fmla v16.4s, v29.4s, v3.s[2]\n"
+    "fmla v17.4s, v29.4s, v6.s[2]\n"
+    "fmla v18.4s, v29.4s, v7.s[0]\n"
+    "fmla v19.4s, v29.4s, v7.s[2]\n"
+    "fmla v20.4s, v29.4s, v10.s[2]\n"
+    "fmla v21.4s, v29.4s, v11.s[0]\n"
+    "fmla v22.4s, v29.4s, v11.s[2]\n"
+    "ldr q29, [%x[params], #0x50]\n"
+    "fmla v14.4s, v31.4s, v4.s[0]\n"
+    "fmla v15.4s, v31.4s, v4.s[2]\n"
+    "fmla v16.4s, v31.4s, v5.s[0]\n"
+    "fmla v17.4s, v31.4s, v8.s[0]\n"
+    "fmla v18.4s, v31.4s, v8.s[2]\n"
+    "fmla v19.4s, v31.4s, v9.s[0]\n"
+    "fmla v20.4s, v31.4s, v12.s[0]\n"
+    "fmla v21.4s, v31.4s, v12.s[2]\n"
+    "fmla v22.4s, v31.4s, v13.s[0]\n"
+    "ldr q31, [%x[params], #0x70]\n"
+    "fmla v14.4s, v30.4s, v4.s[1]\n"
+    "fmla v15.4s, v30.4s, v4.s[3]\n"
+    "fmla v16.4s, v30.4s, v5.s[1]\n"
+    "fmla v17.4s, v30.4s, v8.s[1]\n"
+    "fmla v18.4s, v30.4s, v8.s[3]\n"
+    "fmla v19.4s, v30.4s, v9.s[1]\n"
+    "fmla v20.4s, v30.4s, v12.s[1]\n"
+    "fmla v21.4s, v30.4s, v12.s[3]\n"
+    "fmla v22.4s, v30.4s, v13.s[1]\n"
+    "ldr q30, [%x[params], #0x80]\n"
+    "fmla v14.4s, v29.4s, v4.s[2]\n"
+    "fmla v15.4s, v29.4s, v5.s[0]\n"
+    "fmla v16.4s, v29.4s, v5.s[2]\n"
+    "fmla v17.4s, v29.4s, v8.s[2]\n"
+    "fmla v18.4s, v29.4s, v9.s[0]\n"
+    "fmla v19.4s, v29.4s, v9.s[2]\n"
+    "fmla v20.4s, v29.4s, v12.s[2]\n"
+    "fmla v21.4s, v29.4s, v13.s[0]\n"
+    "fmla v22.4s, v29.4s, v13.s[2]\n"
+    "ldr q29, [%x[params], #0x90]\n"
+    "fmin v14.4s, v14.4s, v23.4s\n"
+    "fmin v15.4s, v15.4s, v23.4s\n"
+    "fmin v16.4s, v16.4s, v23.4s\n"
+    "fmax v14.4s, v14.4s, v24.4s\n"
+    "str q14, [x14, x26]\n"
+    "fmax v15.4s, v15.4s, v24.4s\n"
+    "fmax v16.4s, v16.4s, v24.4s\n"
+    "ldr q14, [%x[params], #0x60]\n"
+    "add %x[params], %x[params], #0xa0\n"
+    "fmin v17.4s, v17.4s, v23.4s\n"
+    "str q15, [x13, x26]\n"
+    "fmin v18.4s, v18.4s, v23.4s\n"
+    "fmin v19.4s, v19.4s, v23.4s\n"
+    "str q16, [x11, x26]\n"
+    "fmin v20.4s, v20.4s, v23.4s\n"
+    "fmax v17.4s, v17.4s, v24.4s\n"
+    "str q17, [x10, x26]\n"
+    "fmax v18.4s, v18.4s, v24.4s\n"
+    "fmax v19.4s, v19.4s, v24.4s\n"
+    "str q18, [x28, x26]\n"
+    "fmax v20.4s, v20.4s, v24.4s\n"
+    "fmin v21.4s, v21.4s, v23.4s\n"
+    "str q19, [x27, x26]\n"
+    "fmin v22.4s, v22.4s, v23.4s\n"
+    "str q20, [x25, x26]\n"
+    "fmax v21.4s, v21.4s, v24.4s\n"
+    "mov v15.16b, v14.16b\n"
+    "str q21, [x24, x26]\n"
+    "fmax v22.4s, v22.4s, v24.4s\n"
+    "mov v16.16b, v14.16b\n"
+    "str q22, [x22, x26]\n"
+    "mov v17.16b, v14.16b\n"
+    "add x26, x26, #0x10\n"
+    "mov v18.16b, v14.16b\n"
+    "mov v19.16b, v14.16b\n"
+    "mov v20.16b, v14.16b\n"
+    "mov v21.16b, v14.16b\n"
+    "mov v22.16b, v14.16b\n"
+    "bgt 1b\n"
+    "2:"  // Output channel complete vector tail
+    "fmla v14.4s, v31.4s, v0.s[0]\n"
+    "fmla v15.4s, v31.4s, v0.s[2]\n"
+    "fmla v16.4s, v31.4s, v1.s[0]\n"
+    "fmla v17.4s, v31.4s, v4.s[0]\n"
+    "fmla v18.4s, v31.4s, v4.s[2]\n"
+    "fmla v19.4s, v31.4s, v5.s[0]\n"
+    "fmla v20.4s, v31.4s, v8.s[0]\n"
+    "fmla v21.4s, v31.4s, v8.s[2]\n"
+    "fmla v22.4s, v31.4s, v9.s[0]\n"
+    "ldr q31, [%x[params], #0x0]\n"
+    "fmla v14.4s, v30.4s, v0.s[1]\n"
+    "fmla v15.4s, v30.4s, v0.s[3]\n"
+    "fmla v16.4s, v30.4s, v1.s[1]\n"
+    "fmla v17.4s, v30.4s, v4.s[1]\n"
+    "fmla v18.4s, v30.4s, v4.s[3]\n"
+    "fmla v19.4s, v30.4s, v5.s[1]\n"
+    "fmla v20.4s, v30.4s, v8.s[1]\n"
+    "fmla v21.4s, v30.4s, v8.s[3]\n"
+    "fmla v22.4s, v30.4s, v9.s[1]\n"
+    "ldr q30, [%x[params], #0x10]\n"
+    "fmla v14.4s, v29.4s, v0.s[2]\n"
+    "fmla v15.4s, v29.4s, v1.s[0]\n"
+    "fmla v16.4s, v29.4s, v1.s[2]\n"
+    "fmla v17.4s, v29.4s, v4.s[2]\n"
+    "fmla v18.4s, v29.4s, v5.s[0]\n"
+    "fmla v19.4s, v29.4s, v5.s[2]\n"
+    "fmla v20.4s, v29.4s, v8.s[2]\n"
+    "fmla v21.4s, v29.4s, v9.s[0]\n"
+    "fmla v22.4s, v29.4s, v9.s[2]\n"
+    "ldr q29, [%x[params], #0x20]\n"
+    "fmla v14.4s, v31.4s, v2.s[0]\n"
+    "fmla v15.4s, v31.4s, v2.s[2]\n"
+    "fmla v16.4s, v31.4s, v3.s[0]\n"
+    "fmla v17.4s, v31.4s, v6.s[0]\n"
+    "fmla v18.4s, v31.4s, v6.s[2]\n"
+    "fmla v19.4s, v31.4s, v7.s[0]\n"
+    "fmla v20.4s, v31.4s, v10.s[0]\n"
+    "fmla v21.4s, v31.4s, v10.s[2]\n"
+    "fmla v22.4s, v31.4s, v11.s[0]\n"
+    "ldr q31, [%x[params], #0x30]\n"
+    "fmla v14.4s, v30.4s, v2.s[1]\n"
+    "fmla v15.4s, v30.4s, v2.s[3]\n"
+    "fmla v16.4s, v30.4s, v3.s[1]\n"
+    "fmla v17.4s, v30.4s, v6.s[1]\n"
+    "fmla v18.4s, v30.4s, v6.s[3]\n"
+    "fmla v19.4s, v30.4s, v7.s[1]\n"
+    "fmla v20.4s, v30.4s, v10.s[1]\n"
+    "fmla v21.4s, v30.4s, v10.s[3]\n"
+    "fmla v22.4s, v30.4s, v11.s[1]\n"
+    "ldr q30, [%x[params], #0x40]\n"
+    "fmla v14.4s, v29.4s, v2.s[2]\n"
+    "fmla v15.4s, v29.4s, v3.s[0]\n"
+    "fmla v16.4s, v29.4s, v3.s[2]\n"
+    "fmla v17.4s, v29.4s, v6.s[2]\n"
+    "fmla v18.4s, v29.4s, v7.s[0]\n"
+    "fmla v19.4s, v29.4s, v7.s[2]\n"
+    "fmla v20.4s, v29.4s, v10.s[2]\n"
+    "fmla v21.4s, v29.4s, v11.s[0]\n"
+    "fmla v22.4s, v29.4s, v11.s[2]\n"
+    "ldr q29, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    "fmla v14.4s, v31.4s, v4.s[0]\n"
+    "fmla v15.4s, v31.4s, v4.s[2]\n"
+    "fmla v16.4s, v31.4s, v5.s[0]\n"
+    "fmla v17.4s, v31.4s, v8.s[0]\n"
+    "fmla v18.4s, v31.4s, v8.s[2]\n"
+    "fmla v19.4s, v31.4s, v9.s[0]\n"
+    "fmla v20.4s, v31.4s, v12.s[0]\n"
+    "fmla v21.4s, v31.4s, v12.s[2]\n"
+    "fmla v22.4s, v31.4s, v13.s[0]\n"
+    "fmla v14.4s, v30.4s, v4.s[1]\n"
+    "fmla v15.4s, v30.4s, v4.s[3]\n"
+    "fmla v16.4s, v30.4s, v5.s[1]\n"
+    "fmla v17.4s, v30.4s, v8.s[1]\n"
+    "fmla v18.4s, v30.4s, v8.s[3]\n"
+    "fmla v19.4s, v30.4s, v9.s[1]\n"
+    "fmla v20.4s, v30.4s, v12.s[1]\n"
+    "fmla v21.4s, v30.4s, v12.s[3]\n"
+    "fmla v22.4s, v30.4s, v13.s[1]\n"
+    "fmla v14.4s, v29.4s, v4.s[2]\n"
+    "fmla v15.4s, v29.4s, v5.s[0]\n"
+    "fmla v16.4s, v29.4s, v5.s[2]\n"
+    "fmla v17.4s, v29.4s, v8.s[2]\n"
+    "fmla v18.4s, v29.4s, v9.s[0]\n"
+    "fmla v19.4s, v29.4s, v9.s[2]\n"
+    "fmla v20.4s, v29.4s, v12.s[2]\n"
+    "fmla v21.4s, v29.4s, v13.s[0]\n"
+    "fmla v22.4s, v29.4s, v13.s[2]\n"
+    "fmin v14.4s, v14.4s, v23.4s\n"
+    "fmin v15.4s, v15.4s, v23.4s\n"
+    "fmin v16.4s, v16.4s, v23.4s\n"
+    "fmax v14.4s, v14.4s, v24.4s\n"
+    "str q14, [x14, x26]\n"
+    "fmax v15.4s, v15.4s, v24.4s\n"
+    "fmax v16.4s, v16.4s, v24.4s\n"
+    "str q15, [x13, x26]\n"
+    "fmin v17.4s, v17.4s, v23.4s\n"
+    "fmin v18.4s, v18.4s, v23.4s\n"
+    "str q16, [x11, x26]\n"
+    "fmin v19.4s, v19.4s, v23.4s\n"
+    "fmin v20.4s, v20.4s, v23.4s\n"
+    "fmax v17.4s, v17.4s, v24.4s\n"
+    "str q17, [x10, x26]\n"
+    "fmax v18.4s, v18.4s, v24.4s\n"
+    "fmax v19.4s, v19.4s, v24.4s\n"
+    "str q18, [x28, x26]\n"
+    "fmax v20.4s, v20.4s, v24.4s\n"
+    "fmin v21.4s, v21.4s, v23.4s\n"
+    "str q19, [x27, x26]\n"
+    "fmin v22.4s, v22.4s, v23.4s\n"
+    "str q20, [x25, x26]\n"
+    "fmax v21.4s, v21.4s, v24.4s\n"
+    "fmax v22.4s, v22.4s, v24.4s\n"
+    "str q21, [x24, x26]\n"
+    "str q22, [x22, x26]\n"
+    "add x26, x26, #0x10\n"
+    "3:"  // Output channel oddments
+    "tst %x[channel_multiplier], #0x3\n"
+    "beq 6f\n"
+    "ldr q14, [%x[params], #0x0]\n"
+    "mov v15.16b, v14.16b\n"
+    "ldr q31, [%x[params], #0x10]\n"
+    "mov v16.16b, v14.16b\n"
+    "ldr q30, [%x[params], #0x20]\n"
+    "mov v17.16b, v14.16b\n"
+    "ldr q29, [%x[params], #0x30]\n"
+    "mov v18.16b, v14.16b\n"
+    "mov v19.16b, v14.16b\n"
+    "mov v20.16b, v14.16b\n"
+    "mov v21.16b, v14.16b\n"
+    "mov v22.16b, v14.16b\n"
+    "fmla v14.4s, v31.4s, v0.s[0]\n"
+    "fmla v15.4s, v31.4s, v0.s[2]\n"
+    "fmla v16.4s, v31.4s, v1.s[0]\n"
+    "fmla v17.4s, v31.4s, v4.s[0]\n"
+    "fmla v18.4s, v31.4s, v4.s[2]\n"
+    "fmla v19.4s, v31.4s, v5.s[0]\n"
+    "fmla v20.4s, v31.4s, v8.s[0]\n"
+    "fmla v21.4s, v31.4s, v8.s[2]\n"
+    "fmla v22.4s, v31.4s, v9.s[0]\n"
+    "ldr q31, [%x[params], #0x40]\n"
+    "fmla v14.4s, v30.4s, v0.s[1]\n"
+    "fmla v15.4s, v30.4s, v0.s[3]\n"
+    "fmla v16.4s, v30.4s, v1.s[1]\n"
+    "fmla v17.4s, v30.4s, v4.s[1]\n"
+    "fmla v18.4s, v30.4s, v4.s[3]\n"
+    "fmla v19.4s, v30.4s, v5.s[1]\n"
+    "fmla v20.4s, v30.4s, v8.s[1]\n"
+    "fmla v21.4s, v30.4s, v8.s[3]\n"
+    "fmla v22.4s, v30.4s, v9.s[1]\n"
+    "ldr q30, [%x[params], #0x50]\n"
+    "fmla v14.4s, v29.4s, v0.s[2]\n"
+    "fmla v15.4s, v29.4s, v1.s[0]\n"
+    "fmla v16.4s, v29.4s, v1.s[2]\n"
+    "fmla v17.4s, v29.4s, v4.s[2]\n"
+    "fmla v18.4s, v29.4s, v5.s[0]\n"
+    "fmla v19.4s, v29.4s, v5.s[2]\n"
+    "fmla v20.4s, v29.4s, v8.s[2]\n"
+    "fmla v21.4s, v29.4s, v9.s[0]\n"
+    "fmla v22.4s, v29.4s, v9.s[2]\n"
+    "ldr q29, [%x[params], #0x60]\n"
+    "fmla v14.4s, v31.4s, v2.s[0]\n"
+    "fmla v15.4s, v31.4s, v2.s[2]\n"
+    "fmla v16.4s, v31.4s, v3.s[0]\n"
+    "fmla v17.4s, v31.4s, v6.s[0]\n"
+    "fmla v18.4s, v31.4s, v6.s[2]\n"
+    "fmla v19.4s, v31.4s, v7.s[0]\n"
+    "fmla v20.4s, v31.4s, v10.s[0]\n"
+    "fmla v21.4s, v31.4s, v10.s[2]\n"
+    "fmla v22.4s, v31.4s, v11.s[0]\n"
+    "ldr q31, [%x[params], #0x70]\n"
+    "fmla v14.4s, v30.4s, v2.s[1]\n"
+    "fmla v15.4s, v30.4s, v2.s[3]\n"
+    "fmla v16.4s, v30.4s, v3.s[1]\n"
+    "fmla v17.4s, v30.4s, v6.s[1]\n"
+    "fmla v18.4s, v30.4s, v6.s[3]\n"
+    "fmla v19.4s, v30.4s, v7.s[1]\n"
+    "fmla v20.4s, v30.4s, v10.s[1]\n"
+    "fmla v21.4s, v30.4s, v10.s[3]\n"
+    "fmla v22.4s, v30.4s, v11.s[1]\n"
+    "ldr q30, [%x[params], #0x80]\n"
+    "fmla v14.4s, v29.4s, v2.s[2]\n"
+    "fmla v15.4s, v29.4s, v3.s[0]\n"
+    "fmla v16.4s, v29.4s, v3.s[2]\n"
+    "fmla v17.4s, v29.4s, v6.s[2]\n"
+    "fmla v18.4s, v29.4s, v7.s[0]\n"
+    "fmla v19.4s, v29.4s, v7.s[2]\n"
+    "fmla v20.4s, v29.4s, v10.s[2]\n"
+    "fmla v21.4s, v29.4s, v11.s[0]\n"
+    "fmla v22.4s, v29.4s, v11.s[2]\n"
+    "ldr q29, [%x[params], #0x90]\n"
+    "add %x[params], %x[params], #0xa0\n"
+    "fmla v14.4s, v31.4s, v4.s[0]\n"
+    "fmla v15.4s, v31.4s, v4.s[2]\n"
+    "fmla v16.4s, v31.4s, v5.s[0]\n"
+    "fmla v17.4s, v31.4s, v8.s[0]\n"
+    "fmla v18.4s, v31.4s, v8.s[2]\n"
+    "fmla v19.4s, v31.4s, v9.s[0]\n"
+    "fmla v20.4s, v31.4s, v12.s[0]\n"
+    "fmla v21.4s, v31.4s, v12.s[2]\n"
+    "fmla v22.4s, v31.4s, v13.s[0]\n"
+    "fmla v14.4s, v30.4s, v4.s[1]\n"
+    "fmla v15.4s, v30.4s, v4.s[3]\n"
+    "fmla v16.4s, v30.4s, v5.s[1]\n"
+    "fmla v17.4s, v30.4s, v8.s[1]\n"
+    "fmla v18.4s, v30.4s, v8.s[3]\n"
+    "fmla v19.4s, v30.4s, v9.s[1]\n"
+    "fmla v20.4s, v30.4s, v12.s[1]\n"
+    "fmla v21.4s, v30.4s, v12.s[3]\n"
+    "fmla v22.4s, v30.4s, v13.s[1]\n"
+    "fmla v14.4s, v29.4s, v4.s[2]\n"
+    "fmla v15.4s, v29.4s, v5.s[0]\n"
+    "fmla v16.4s, v29.4s, v5.s[2]\n"
+    "fmla v17.4s, v29.4s, v8.s[2]\n"
+    "fmla v18.4s, v29.4s, v9.s[0]\n"
+    "fmla v19.4s, v29.4s, v9.s[2]\n"
+    "fmla v20.4s, v29.4s, v12.s[2]\n"
+    "fmla v21.4s, v29.4s, v13.s[0]\n"
+    "fmla v22.4s, v29.4s, v13.s[2]\n"
+    "fmin v14.4s, v14.4s, v23.4s\n"
+    "fmin v15.4s, v15.4s, v23.4s\n"
+    "fmin v16.4s, v16.4s, v23.4s\n"
+    "fmax v14.4s, v14.4s, v24.4s\n"
+    "fmax v15.4s, v15.4s, v24.4s\n"
+    "fmax v16.4s, v16.4s, v24.4s\n"
+    "fmin v17.4s, v17.4s, v23.4s\n"
+    "fmin v18.4s, v18.4s, v23.4s\n"
+    "fmin v19.4s, v19.4s, v23.4s\n"
+    "fmax v17.4s, v17.4s, v24.4s\n"
+    "fmax v18.4s, v18.4s, v24.4s\n"
+    "fmax v19.4s, v19.4s, v24.4s\n"
+    "fmin v20.4s, v20.4s, v23.4s\n"
+    "fmin v21.4s, v21.4s, v23.4s\n"
+    "fmin v22.4s, v22.4s, v23.4s\n"
+    "fmax v20.4s, v20.4s, v24.4s\n"
+    "fmax v21.4s, v21.4s, v24.4s\n"
+    "fmax v22.4s, v22.4s, v24.4s\n"
+    "tbz %x[channel_multiplier], #1, 4f\n"
+    "add x19, x14, x26\n"
+    "st1 { v14.d }[0], [x19]\n"
+    "add x19, x13, x26\n"
+    "st1 { v15.d }[0], [x19]\n"
+    "add x19, x11, x26\n"
+    "st1 { v16.d }[0], [x19]\n"
+    "add x19, x10, x26\n"
+    "st1 { v17.d }[0], [x19]\n"
+    "add x19, x28, x26\n"
+    "st1 { v18.d }[0], [x19]\n"
+    "add x19, x27, x26\n"
+    "st1 { v19.d }[0], [x19]\n"
+    "add x19, x25, x26\n"
+    "st1 { v20.d }[0], [x19]\n"
+    "add x19, x24, x26\n"
+    "st1 { v21.d }[0], [x19]\n"
+    "add x19, x22, x26\n"
+    "st1 { v22.d }[0], [x19]\n"
+    "add x26, x26, #0x8\n"
+    "tbz %x[channel_multiplier], #0, 5f\n"
+    "add x19, x14, x26\n"
+    "st1 { v14.s }[2], [x19]\n"
+    "add x19, x13, x26\n"
+    "st1 { v15.s }[2], [x19]\n"
+    "add x19, x11, x26\n"
+    "st1 { v16.s }[2], [x19]\n"
+    "add x19, x10, x26\n"
+    "st1 { v17.s }[2], [x19]\n"
+    "add x19, x28, x26\n"
+    "st1 { v18.s }[2], [x19]\n"
+    "add x19, x27, x26\n"
+    "st1 { v19.s }[2], [x19]\n"
+    "add x19, x25, x26\n"
+    "st1 { v20.s }[2], [x19]\n"
+    "add x19, x24, x26\n"
+    "st1 { v21.s }[2], [x19]\n"
+    "add x19, x22, x26\n"
+    "st1 { v22.s }[2], [x19]\n"
+    "b 5f\n"
+    "4:"  // Output channel oddments: Store: Bit 1: Unset
+    "tbz %x[channel_multiplier], #0, 5f\n"
+    "add x19, x14, x26\n"
+    "st1 { v14.s }[0], [x19]\n"
+    "add x19, x13, x26\n"
+    "st1 { v15.s }[0], [x19]\n"
+    "add x19, x11, x26\n"
+    "st1 { v16.s }[0], [x19]\n"
+    "add x19, x10, x26\n"
+    "st1 { v17.s }[0], [x19]\n"
+    "add x19, x28, x26\n"
+    "st1 { v18.s }[0], [x19]\n"
+    "add x19, x27, x26\n"
+    "st1 { v19.s }[0], [x19]\n"
+    "add x19, x25, x26\n"
+    "st1 { v20.s }[0], [x19]\n"
+    "add x19, x24, x26\n"
+    "st1 { v21.s }[0], [x19]\n"
+    "add x19, x22, x26\n"
+    "st1 { v22.s }[0], [x19]\n"
+    "5:"  // Output channel oddments: Store: Bit 1: End
+
+    "6:"  // End
+
+    : [params] "+&r" (params)
+    : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
new file mode 100644
index 0000000..92d6a75
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 4;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 8;
+  constexpr static unsigned int input_col_quads = 2;
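+
+  // Sanity check: input = (output - 1) * stride + kernel in each dimension
+  // ((2 - 1) * 1 + 5 = 6 rows and (4 - 1) * 1 + 5 = 8 cols here).
+  static_assert((output_rows - 1) * stride_rows + kernel_rows == input_rows,
+                "input rows must cover the output tile's receptive field");
+  static_assert((output_cols - 1) * stride_cols + kernel_cols == input_cols,
+                "input cols must cover the output tile's receptive field");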
+
+  kern_type kernel = a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+
+  a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..6e9e97f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -0,0 +1,916 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(
+  const float *const *const inptrs,
+  float *const *const outptrs,
+  const void *params,
+  const unsigned int n_output_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  const float minmax_vals[2] = { activation_min, activation_max };
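+  // As in the 3x3/s2 kernel, { min, max } are broadcast into vector
+  // registers by ld1r loads from [clamps] and [clamps + 4] (v21 and v20
+  // here).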
+
+  __asm__ __volatile__(
+    "ldp x13, x12, [%x[outptrs], #0x0]\n"
+    "add x11, %x[clamps], #0x4\n"
+    "ldp x10, x9, [%x[outptrs], #0x10]\n"
+    "mov x28, #0x0\n"
+    "ldp x27, x26, [%x[outptrs], #0x20]\n"
+    "mov x25, #0x0\n"
+    "ldp x24, x23, [%x[outptrs], #0x30]\n"
+    "lsr x22, %x[channel_multiplier], #0x2\n"
+    "ldr x21, [%x[inptrs], #0x0]\n"
+    "ldr x20, [%x[inptrs], #0x8]\n"
+    "ldr x19, [%x[inptrs], #0x10]\n"
+    "ldr q0, [x21, #0x0]\n"
+    "ldr q1, [x21, #0x10]\n"
+    "ldr q2, [x20, #0x0]\n"
+    "ldr q3, [x20, #0x10]\n"
+    "ldr q4, [x19, #0x0]\n"
+    "ldr q5, [x19, #0x10]\n"
+    "ldr x21, [%x[inptrs], #0x18]\n"
+    "ldr x20, [%x[inptrs], #0x20]\n"
+    "ldr x19, [%x[inptrs], #0x28]\n"
+    "ldr q6, [x21, #0x0]\n"
+    "ldr q7, [x21, #0x10]\n"
+    "ldr q8, [x20, #0x0]\n"
+    "ldr q9, [x20, #0x10]\n"
+    "ldr q10, [x19, #0x0]\n"
+    "ldr q11, [x19, #0x10]\n"
+    "ld1r { v21.4s }, [%x[clamps]]\n"
+    "ld1r { v20.4s }, [x11]\n"
+    "cbz x22, 3f\n"
+    "ldr q12, [%x[params], #0x0]\n"
+    "mov v13.16b, v12.16b\n"
+    "ldr q31, [%x[params], #0x10]\n"
+    "subs x22, x22, #0x1\n"
+    "mov v14.16b, v12.16b\n"
+    "ldr q30, [%x[params], #0x20]\n"
+    "mov v15.16b, v12.16b\n"
+    "ldr q29, [%x[params], #0x30]\n"
+    "mov v16.16b, v12.16b\n"
+    "ldr q28, [%x[params], #0x40]\n"
+    "mov v17.16b, v12.16b\n"
+    "ldr q27, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    "mov v18.16b, v12.16b\n"
+    "mov v19.16b, v12.16b\n"
+    "beq 2f\n"
+    "1:"  // Output channel complete vector loop
+    "fmla v12.4s, v31.4s, v0.s[0]\n"
+    "add x28, x28, #0x4\n"
+    "fmla v13.4s, v31.4s, v0.s[1]\n"
+    "subs x22, x22, #0x1\n"
+    "fmla v14.4s, v31.4s, v0.s[2]\n"
+    "fmla v15.4s, v31.4s, v0.s[3]\n"
+    "fmla v16.4s, v31.4s, v2.s[0]\n"
+    "fmla v17.4s, v31.4s, v2.s[1]\n"
+    "fmla v18.4s, v31.4s, v2.s[2]\n"
+    "fmla v19.4s, v31.4s, v2.s[3]\n"
+    "ldr q31, [%x[params], #0x0]\n"
+    "fmla v12.4s, v30.4s, v0.s[1]\n"
+    "fmla v13.4s, v30.4s, v0.s[2]\n"
+    "fmla v14.4s, v30.4s, v0.s[3]\n"
+    "fmla v15.4s, v30.4s, v1.s[0]\n"
+    "fmla v16.4s, v30.4s, v2.s[1]\n"
+    "fmla v17.4s, v30.4s, v2.s[2]\n"
+    "fmla v18.4s, v30.4s, v2.s[3]\n"
+    "fmla v19.4s, v30.4s, v3.s[0]\n"
+    "ldr q30, [%x[params], #0x10]\n"
+    "fmla v12.4s, v29.4s, v0.s[2]\n"
+    "fmla v13.4s, v29.4s, v0.s[3]\n"
+    "fmla v14.4s, v29.4s, v1.s[0]\n"
+    "fmla v15.4s, v29.4s, v1.s[1]\n"
+    "fmla v16.4s, v29.4s, v2.s[2]\n"
+    "fmla v17.4s, v29.4s, v2.s[3]\n"
+    "fmla v18.4s, v29.4s, v3.s[0]\n"
+    "fmla v19.4s, v29.4s, v3.s[1]\n"
+    "ldr q29, [%x[params], #0x20]\n"
+    "fmla v12.4s, v28.4s, v0.s[3]\n"
+    "fmla v13.4s, v28.4s, v1.s[0]\n"
+    "fmla v14.4s, v28.4s, v1.s[1]\n"
+    "fmla v15.4s, v28.4s, v1.s[2]\n"
+    "fmla v16.4s, v28.4s, v2.s[3]\n"
+    "fmla v17.4s, v28.4s, v3.s[0]\n"
+    "fmla v18.4s, v28.4s, v3.s[1]\n"
+    "fmla v19.4s, v28.4s, v3.s[2]\n"
+    "ldr q28, [%x[params], #0x30]\n"
+    "fmla v12.4s, v27.4s, v1.s[0]\n"
+    "fmla v13.4s, v27.4s, v1.s[1]\n"
+    "fmla v14.4s, v27.4s, v1.s[2]\n"
+    "fmla v15.4s, v27.4s, v1.s[3]\n"
+    "fmla v16.4s, v27.4s, v3.s[0]\n"
+    "fmla v17.4s, v27.4s, v3.s[1]\n"
+    "fmla v18.4s, v27.4s, v3.s[2]\n"
+    "fmla v19.4s, v27.4s, v3.s[3]\n"
+    "ldr q27, [%x[params], #0x40]\n"
+    "fmla v12.4s, v31.4s, v2.s[0]\n"
+    "fmla v13.4s, v31.4s, v2.s[1]\n"
+    "fmla v14.4s, v31.4s, v2.s[2]\n"
+    "fmla v15.4s, v31.4s, v2.s[3]\n"
+    "fmla v16.4s, v31.4s, v4.s[0]\n"
+    "fmla v17.4s, v31.4s, v4.s[1]\n"
+    "fmla v18.4s, v31.4s, v4.s[2]\n"
+    "fmla v19.4s, v31.4s, v4.s[3]\n"
+    "ldr q31, [%x[params], #0x50]\n"
+    "fmla v12.4s, v30.4s, v2.s[1]\n"
+    "fmla v13.4s, v30.4s, v2.s[2]\n"
+    "fmla v14.4s, v30.4s, v2.s[3]\n"
+    "fmla v15.4s, v30.4s, v3.s[0]\n"
+    "fmla v16.4s, v30.4s, v4.s[1]\n"
+    "fmla v17.4s, v30.4s, v4.s[2]\n"
+    "fmla v18.4s, v30.4s, v4.s[3]\n"
+    "fmla v19.4s, v30.4s, v5.s[0]\n"
+    "ldr q30, [%x[params], #0x60]\n"
+    "fmla v12.4s, v29.4s, v2.s[2]\n"
+    "fmla v13.4s, v29.4s, v2.s[3]\n"
+    "fmla v14.4s, v29.4s, v3.s[0]\n"
+    "fmla v15.4s, v29.4s, v3.s[1]\n"
+    "fmla v16.4s, v29.4s, v4.s[2]\n"
+    "fmla v17.4s, v29.4s, v4.s[3]\n"
+    "fmla v18.4s, v29.4s, v5.s[0]\n"
+    "fmla v19.4s, v29.4s, v5.s[1]\n"
+    "ldr q29, [%x[params], #0x70]\n"
+    "fmla v12.4s, v28.4s, v2.s[3]\n"
+    "fmla v13.4s, v28.4s, v3.s[0]\n"
+    "fmla v14.4s, v28.4s, v3.s[1]\n"
+    "fmla v15.4s, v28.4s, v3.s[2]\n"
+    "fmla v16.4s, v28.4s, v4.s[3]\n"
+    "fmla v17.4s, v28.4s, v5.s[0]\n"
+    "fmla v18.4s, v28.4s, v5.s[1]\n"
+    "fmla v19.4s, v28.4s, v5.s[2]\n"
+    "ldr q28, [%x[params], #0x80]\n"
+    "fmla v12.4s, v27.4s, v3.s[0]\n"
+    "fmla v13.4s, v27.4s, v3.s[1]\n"
+    "fmla v14.4s, v27.4s, v3.s[2]\n"
+    "fmla v15.4s, v27.4s, v3.s[3]\n"
+    "fmla v16.4s, v27.4s, v5.s[0]\n"
+    "fmla v17.4s, v27.4s, v5.s[1]\n"
+    "fmla v18.4s, v27.4s, v5.s[2]\n"
+    "fmla v19.4s, v27.4s, v5.s[3]\n"
+    "ldr q27, [%x[params], #0x90]\n"
+    "fmla v12.4s, v31.4s, v4.s[0]\n"
+    "fmla v13.4s, v31.4s, v4.s[1]\n"
+    "fmla v14.4s, v31.4s, v4.s[2]\n"
+    "fmla v15.4s, v31.4s, v4.s[3]\n"
+    "fmla v16.4s, v31.4s, v6.s[0]\n"
+    "fmla v17.4s, v31.4s, v6.s[1]\n"
+    "fmla v18.4s, v31.4s, v6.s[2]\n"
+    "fmla v19.4s, v31.4s, v6.s[3]\n"
+    "ldr q31, [%x[params], #0xa0]\n"
+    "fmla v12.4s, v30.4s, v4.s[1]\n"
+    "fmla v13.4s, v30.4s, v4.s[2]\n"
+    "fmla v14.4s, v30.4s, v4.s[3]\n"
+    "fmla v15.4s, v30.4s, v5.s[0]\n"
+    "fmla v16.4s, v30.4s, v6.s[1]\n"
+    "fmla v17.4s, v30.4s, v6.s[2]\n"
+    "fmla v18.4s, v30.4s, v6.s[3]\n"
+    "fmla v19.4s, v30.4s, v7.s[0]\n"
+    "ldr q30, [%x[params], #0xb0]\n"
+    "fmla v12.4s, v29.4s, v4.s[2]\n"
+    "fmla v13.4s, v29.4s, v4.s[3]\n"
+    "fmla v14.4s, v29.4s, v5.s[0]\n"
+    "fmla v15.4s, v29.4s, v5.s[1]\n"
+    "fmla v16.4s, v29.4s, v6.s[2]\n"
+    "fmla v17.4s, v29.4s, v6.s[3]\n"
+    "fmla v18.4s, v29.4s, v7.s[0]\n"
+    "fmla v19.4s, v29.4s, v7.s[1]\n"
+    "ldr q29, [%x[params], #0xc0]\n"
+    "fmla v12.4s, v28.4s, v4.s[3]\n"
+    "fmla v13.4s, v28.4s, v5.s[0]\n"
+    "fmla v14.4s, v28.4s, v5.s[1]\n"
+    "fmla v15.4s, v28.4s, v5.s[2]\n"
+    "fmla v16.4s, v28.4s, v6.s[3]\n"
+    "fmla v17.4s, v28.4s, v7.s[0]\n"
+    "fmla v18.4s, v28.4s, v7.s[1]\n"
+    "fmla v19.4s, v28.4s, v7.s[2]\n"
+    "ldr q28, [%x[params], #0xd0]\n"
+    "fmla v12.4s, v27.4s, v5.s[0]\n"
+    "fmla v13.4s, v27.4s, v5.s[1]\n"
+    "fmla v14.4s, v27.4s, v5.s[2]\n"
+    "fmla v15.4s, v27.4s, v5.s[3]\n"
+    "fmla v16.4s, v27.4s, v7.s[0]\n"
+    "fmla v17.4s, v27.4s, v7.s[1]\n"
+    "fmla v18.4s, v27.4s, v7.s[2]\n"
+    "fmla v19.4s, v27.4s, v7.s[3]\n"
+    "ldr q27, [%x[params], #0xe0]\n"
+    "fmla v12.4s, v31.4s, v6.s[0]\n"
+    "fmla v13.4s, v31.4s, v6.s[1]\n"
+    "fmla v14.4s, v31.4s, v6.s[2]\n"
+    "fmla v15.4s, v31.4s, v6.s[3]\n"
+    "fmla v16.4s, v31.4s, v8.s[0]\n"
+    "fmla v17.4s, v31.4s, v8.s[1]\n"
+    "fmla v18.4s, v31.4s, v8.s[2]\n"
+    "fmla v19.4s, v31.4s, v8.s[3]\n"
+    "ldr q31, [%x[params], #0xf0]\n"
+    "fmla v12.4s, v30.4s, v6.s[1]\n"
+    "fmla v13.4s, v30.4s, v6.s[2]\n"
+    "fmla v14.4s, v30.4s, v6.s[3]\n"
+    "fmla v15.4s, v30.4s, v7.s[0]\n"
+    "fmla v16.4s, v30.4s, v8.s[1]\n"
+    "fmla v17.4s, v30.4s, v8.s[2]\n"
+    "fmla v18.4s, v30.4s, v8.s[3]\n"
+    "fmla v19.4s, v30.4s, v9.s[0]\n"
+    "ldr q30, [%x[params], #0x100]\n"
+    "fmla v12.4s, v29.4s, v6.s[2]\n"
+    "fmla v13.4s, v29.4s, v6.s[3]\n"
+    "fmla v14.4s, v29.4s, v7.s[0]\n"
+    "fmla v15.4s, v29.4s, v7.s[1]\n"
+    "fmla v16.4s, v29.4s, v8.s[2]\n"
+    "fmla v17.4s, v29.4s, v8.s[3]\n"
+    "fmla v18.4s, v29.4s, v9.s[0]\n"
+    "fmla v19.4s, v29.4s, v9.s[1]\n"
+    "ldr q29, [%x[params], #0x110]\n"
+    "fmla v12.4s, v28.4s, v6.s[3]\n"
+    "fmla v13.4s, v28.4s, v7.s[0]\n"
+    "fmla v14.4s, v28.4s, v7.s[1]\n"
+    "fmla v15.4s, v28.4s, v7.s[2]\n"
+    "fmla v16.4s, v28.4s, v8.s[3]\n"
+    "fmla v17.4s, v28.4s, v9.s[0]\n"
+    "fmla v18.4s, v28.4s, v9.s[1]\n"
+    "fmla v19.4s, v28.4s, v9.s[2]\n"
+    "ldr q28, [%x[params], #0x120]\n"
+    "fmla v12.4s, v27.4s, v7.s[0]\n"
+    "fmla v13.4s, v27.4s, v7.s[1]\n"
+    "fmla v14.4s, v27.4s, v7.s[2]\n"
+    "fmla v15.4s, v27.4s, v7.s[3]\n"
+    "fmla v16.4s, v27.4s, v9.s[0]\n"
+    "fmla v17.4s, v27.4s, v9.s[1]\n"
+    "fmla v18.4s, v27.4s, v9.s[2]\n"
+    "fmla v19.4s, v27.4s, v9.s[3]\n"
+    "ldr q27, [%x[params], #0x130]\n"
+    "fmla v12.4s, v31.4s, v8.s[0]\n"
+    "fmla v13.4s, v31.4s, v8.s[1]\n"
+    "fmla v14.4s, v31.4s, v8.s[2]\n"
+    "fmla v15.4s, v31.4s, v8.s[3]\n"
+    "fmla v16.4s, v31.4s, v10.s[0]\n"
+    "fmla v17.4s, v31.4s, v10.s[1]\n"
+    "fmla v18.4s, v31.4s, v10.s[2]\n"
+    "fmla v19.4s, v31.4s, v10.s[3]\n"
+    "ldr q31, [%x[params], #0x150]\n"
+    "fmla v12.4s, v30.4s, v8.s[1]\n"
+    "fmla v13.4s, v30.4s, v8.s[2]\n"
+    "fmla v14.4s, v30.4s, v8.s[3]\n"
+    "fmla v15.4s, v30.4s, v9.s[0]\n"
+    "fmla v16.4s, v30.4s, v10.s[1]\n"
+    "fmla v17.4s, v30.4s, v10.s[2]\n"
+    "fmla v18.4s, v30.4s, v10.s[3]\n"
+    "fmla v19.4s, v30.4s, v11.s[0]\n"
+    "ldr q30, [%x[params], #0x160]\n"
+    "fmla v12.4s, v29.4s, v8.s[2]\n"
+    "fmla v13.4s, v29.4s, v8.s[3]\n"
+    "fmla v14.4s, v29.4s, v9.s[0]\n"
+    "fmla v15.4s, v29.4s, v9.s[1]\n"
+    "fmla v16.4s, v29.4s, v10.s[2]\n"
+    "fmla v17.4s, v29.4s, v10.s[3]\n"
+    "fmla v18.4s, v29.4s, v11.s[0]\n"
+    "fmla v19.4s, v29.4s, v11.s[1]\n"
+    "ldr q29, [%x[params], #0x170]\n"
+    "fmla v12.4s, v28.4s, v8.s[3]\n"
+    "fmla v13.4s, v28.4s, v9.s[0]\n"
+    "fmla v14.4s, v28.4s, v9.s[1]\n"
+    "fmla v15.4s, v28.4s, v9.s[2]\n"
+    "fmla v16.4s, v28.4s, v10.s[3]\n"
+    "fmla v17.4s, v28.4s, v11.s[0]\n"
+    "fmla v18.4s, v28.4s, v11.s[1]\n"
+    "fmla v19.4s, v28.4s, v11.s[2]\n"
+    "ldr q28, [%x[params], #0x180]\n"
+    "fmla v12.4s, v27.4s, v9.s[0]\n"
+    "fmla v13.4s, v27.4s, v9.s[1]\n"
+    "fmla v14.4s, v27.4s, v9.s[2]\n"
+    "fmla v15.4s, v27.4s, v9.s[3]\n"
+    "fmla v16.4s, v27.4s, v11.s[0]\n"
+    "fmla v17.4s, v27.4s, v11.s[1]\n"
+    "fmla v18.4s, v27.4s, v11.s[2]\n"
+    "fmla v19.4s, v27.4s, v11.s[3]\n"
+    "ldr q27, [%x[params], #0x190]\n"
+    "fmin v12.4s, v12.4s, v20.4s\n"
+    "fmin v13.4s, v13.4s, v20.4s\n"
+    "fmin v14.4s, v14.4s, v20.4s\n"
+    "fmax v12.4s, v12.4s, v21.4s\n"
+    "str q12, [x13, x25]\n"
+    "fmax v13.4s, v13.4s, v21.4s\n"
+    "fmax v14.4s, v14.4s, v21.4s\n"
+    "ldr q12, [%x[params], #0x140]\n"
+    "add %x[params], %x[params], #0x1a0\n"
+    "fmin v15.4s, v15.4s, v20.4s\n"
+    "str q13, [x12, x25]\n"
+    "fmin v16.4s, v16.4s, v20.4s\n"
+    "fmin v17.4s, v17.4s, v20.4s\n"
+    "str q14, [x10, x25]\n"
+    "fmin v18.4s, v18.4s, v20.4s\n"
+    "fmax v15.4s, v15.4s, v21.4s\n"
+    "str q15, [x9, x25]\n"
+    "fmax v16.4s, v16.4s, v21.4s\n"
+    "fmax v17.4s, v17.4s, v21.4s\n"
+    "str q16, [x27, x25]\n"
+    "fmax v18.4s, v18.4s, v21.4s\n"
+    "fmin v19.4s, v19.4s, v20.4s\n"
+    "str q17, [x26, x25]\n"
+    "mov v13.16b, v12.16b\n"
+    "str q18, [x24, x25]\n"
+    "fmax v19.4s, v19.4s, v21.4s\n"
+    "mov v14.16b, v12.16b\n"
+    "str q19, [x23, x25]\n"
+    "mov v15.16b, v12.16b\n"
+    "add x25, x25, #0x10\n"
+    "mov v16.16b, v12.16b\n"
+    "mov v17.16b, v12.16b\n"
+    "mov v18.16b, v12.16b\n"
+    "mov v19.16b, v12.16b\n"
+    "bgt 1b\n"
+    "2:"  // Output channel complete vector tail
+    "fmla v12.4s, v31.4s, v0.s[0]\n"
+    "fmla v13.4s, v31.4s, v0.s[1]\n"
+    "fmla v14.4s, v31.4s, v0.s[2]\n"
+    "fmla v15.4s, v31.4s, v0.s[3]\n"
+    "fmla v16.4s, v31.4s, v2.s[0]\n"
+    "fmla v17.4s, v31.4s, v2.s[1]\n"
+    "fmla v18.4s, v31.4s, v2.s[2]\n"
+    "fmla v19.4s, v31.4s, v2.s[3]\n"
+    "ldr q31, [%x[params], #0x0]\n"
+    "fmla v12.4s, v30.4s, v0.s[1]\n"
+    "fmla v13.4s, v30.4s, v0.s[2]\n"
+    "fmla v14.4s, v30.4s, v0.s[3]\n"
+    "fmla v15.4s, v30.4s, v1.s[0]\n"
+    "fmla v16.4s, v30.4s, v2.s[1]\n"
+    "fmla v17.4s, v30.4s, v2.s[2]\n"
+    "fmla v18.4s, v30.4s, v2.s[3]\n"
+    "fmla v19.4s, v30.4s, v3.s[0]\n"
+    "ldr q30, [%x[params], #0x10]\n"
+    "fmla v12.4s, v29.4s, v0.s[2]\n"
+    "fmla v13.4s, v29.4s, v0.s[3]\n"
+    "fmla v14.4s, v29.4s, v1.s[0]\n"
+    "fmla v15.4s, v29.4s, v1.s[1]\n"
+    "fmla v16.4s, v29.4s, v2.s[2]\n"
+    "fmla v17.4s, v29.4s, v2.s[3]\n"
+    "fmla v18.4s, v29.4s, v3.s[0]\n"
+    "fmla v19.4s, v29.4s, v3.s[1]\n"
+    "ldr q29, [%x[params], #0x20]\n"
+    "fmla v12.4s, v28.4s, v0.s[3]\n"
+    "fmla v13.4s, v28.4s, v1.s[0]\n"
+    "fmla v14.4s, v28.4s, v1.s[1]\n"
+    "fmla v15.4s, v28.4s, v1.s[2]\n"
+    "fmla v16.4s, v28.4s, v2.s[3]\n"
+    "fmla v17.4s, v28.4s, v3.s[0]\n"
+    "fmla v18.4s, v28.4s, v3.s[1]\n"
+    "fmla v19.4s, v28.4s, v3.s[2]\n"
+    "ldr q28, [%x[params], #0x30]\n"
+    "fmla v12.4s, v27.4s, v1.s[0]\n"
+    "fmla v13.4s, v27.4s, v1.s[1]\n"
+    "fmla v14.4s, v27.4s, v1.s[2]\n"
+    "fmla v15.4s, v27.4s, v1.s[3]\n"
+    "fmla v16.4s, v27.4s, v3.s[0]\n"
+    "fmla v17.4s, v27.4s, v3.s[1]\n"
+    "fmla v18.4s, v27.4s, v3.s[2]\n"
+    "fmla v19.4s, v27.4s, v3.s[3]\n"
+    "ldr q27, [%x[params], #0x40]\n"
+    "fmla v12.4s, v31.4s, v2.s[0]\n"
+    "fmla v13.4s, v31.4s, v2.s[1]\n"
+    "fmla v14.4s, v31.4s, v2.s[2]\n"
+    "fmla v15.4s, v31.4s, v2.s[3]\n"
+    "fmla v16.4s, v31.4s, v4.s[0]\n"
+    "fmla v17.4s, v31.4s, v4.s[1]\n"
+    "fmla v18.4s, v31.4s, v4.s[2]\n"
+    "fmla v19.4s, v31.4s, v4.s[3]\n"
+    "ldr q31, [%x[params], #0x50]\n"
+    "fmla v12.4s, v30.4s, v2.s[1]\n"
+    "fmla v13.4s, v30.4s, v2.s[2]\n"
+    "fmla v14.4s, v30.4s, v2.s[3]\n"
+    "fmla v15.4s, v30.4s, v3.s[0]\n"
+    "fmla v16.4s, v30.4s, v4.s[1]\n"
+    "fmla v17.4s, v30.4s, v4.s[2]\n"
+    "fmla v18.4s, v30.4s, v4.s[3]\n"
+    "fmla v19.4s, v30.4s, v5.s[0]\n"
+    "ldr q30, [%x[params], #0x60]\n"
+    "fmla v12.4s, v29.4s, v2.s[2]\n"
+    "fmla v13.4s, v29.4s, v2.s[3]\n"
+    "fmla v14.4s, v29.4s, v3.s[0]\n"
+    "fmla v15.4s, v29.4s, v3.s[1]\n"
+    "fmla v16.4s, v29.4s, v4.s[2]\n"
+    "fmla v17.4s, v29.4s, v4.s[3]\n"
+    "fmla v18.4s, v29.4s, v5.s[0]\n"
+    "fmla v19.4s, v29.4s, v5.s[1]\n"
+    "ldr q29, [%x[params], #0x70]\n"
+    "fmla v12.4s, v28.4s, v2.s[3]\n"
+    "fmla v13.4s, v28.4s, v3.s[0]\n"
+    "fmla v14.4s, v28.4s, v3.s[1]\n"
+    "fmla v15.4s, v28.4s, v3.s[2]\n"
+    "fmla v16.4s, v28.4s, v4.s[3]\n"
+    "fmla v17.4s, v28.4s, v5.s[0]\n"
+    "fmla v18.4s, v28.4s, v5.s[1]\n"
+    "fmla v19.4s, v28.4s, v5.s[2]\n"
+    "ldr q28, [%x[params], #0x80]\n"
+    "fmla v12.4s, v27.4s, v3.s[0]\n"
+    "fmla v13.4s, v27.4s, v3.s[1]\n"
+    "fmla v14.4s, v27.4s, v3.s[2]\n"
+    "fmla v15.4s, v27.4s, v3.s[3]\n"
+    "fmla v16.4s, v27.4s, v5.s[0]\n"
+    "fmla v17.4s, v27.4s, v5.s[1]\n"
+    "fmla v18.4s, v27.4s, v5.s[2]\n"
+    "fmla v19.4s, v27.4s, v5.s[3]\n"
+    "ldr q27, [%x[params], #0x90]\n"
+    "fmla v12.4s, v31.4s, v4.s[0]\n"
+    "fmla v13.4s, v31.4s, v4.s[1]\n"
+    "fmla v14.4s, v31.4s, v4.s[2]\n"
+    "fmla v15.4s, v31.4s, v4.s[3]\n"
+    "fmla v16.4s, v31.4s, v6.s[0]\n"
+    "fmla v17.4s, v31.4s, v6.s[1]\n"
+    "fmla v18.4s, v31.4s, v6.s[2]\n"
+    "fmla v19.4s, v31.4s, v6.s[3]\n"
+    "ldr q31, [%x[params], #0xa0]\n"
+    "fmla v12.4s, v30.4s, v4.s[1]\n"
+    "fmla v13.4s, v30.4s, v4.s[2]\n"
+    "fmla v14.4s, v30.4s, v4.s[3]\n"
+    "fmla v15.4s, v30.4s, v5.s[0]\n"
+    "fmla v16.4s, v30.4s, v6.s[1]\n"
+    "fmla v17.4s, v30.4s, v6.s[2]\n"
+    "fmla v18.4s, v30.4s, v6.s[3]\n"
+    "fmla v19.4s, v30.4s, v7.s[0]\n"
+    "ldr q30, [%x[params], #0xb0]\n"
+    "fmla v12.4s, v29.4s, v4.s[2]\n"
+    "fmla v13.4s, v29.4s, v4.s[3]\n"
+    "fmla v14.4s, v29.4s, v5.s[0]\n"
+    "fmla v15.4s, v29.4s, v5.s[1]\n"
+    "fmla v16.4s, v29.4s, v6.s[2]\n"
+    "fmla v17.4s, v29.4s, v6.s[3]\n"
+    "fmla v18.4s, v29.4s, v7.s[0]\n"
+    "fmla v19.4s, v29.4s, v7.s[1]\n"
+    "ldr q29, [%x[params], #0xc0]\n"
+    "fmla v12.4s, v28.4s, v4.s[3]\n"
+    "fmla v13.4s, v28.4s, v5.s[0]\n"
+    "fmla v14.4s, v28.4s, v5.s[1]\n"
+    "fmla v15.4s, v28.4s, v5.s[2]\n"
+    "fmla v16.4s, v28.4s, v6.s[3]\n"
+    "fmla v17.4s, v28.4s, v7.s[0]\n"
+    "fmla v18.4s, v28.4s, v7.s[1]\n"
+    "fmla v19.4s, v28.4s, v7.s[2]\n"
+    "ldr q28, [%x[params], #0xd0]\n"
+    "fmla v12.4s, v27.4s, v5.s[0]\n"
+    "fmla v13.4s, v27.4s, v5.s[1]\n"
+    "fmla v14.4s, v27.4s, v5.s[2]\n"
+    "fmla v15.4s, v27.4s, v5.s[3]\n"
+    "fmla v16.4s, v27.4s, v7.s[0]\n"
+    "fmla v17.4s, v27.4s, v7.s[1]\n"
+    "fmla v18.4s, v27.4s, v7.s[2]\n"
+    "fmla v19.4s, v27.4s, v7.s[3]\n"
+    "ldr q27, [%x[params], #0xe0]\n"
+    "fmla v12.4s, v31.4s, v6.s[0]\n"
+    "fmla v13.4s, v31.4s, v6.s[1]\n"
+    "fmla v14.4s, v31.4s, v6.s[2]\n"
+    "fmla v15.4s, v31.4s, v6.s[3]\n"
+    "fmla v16.4s, v31.4s, v8.s[0]\n"
+    "fmla v17.4s, v31.4s, v8.s[1]\n"
+    "fmla v18.4s, v31.4s, v8.s[2]\n"
+    "fmla v19.4s, v31.4s, v8.s[3]\n"
+    "ldr q31, [%x[params], #0xf0]\n"
+    "fmla v12.4s, v30.4s, v6.s[1]\n"
+    "fmla v13.4s, v30.4s, v6.s[2]\n"
+    "fmla v14.4s, v30.4s, v6.s[3]\n"
+    "fmla v15.4s, v30.4s, v7.s[0]\n"
+    "fmla v16.4s, v30.4s, v8.s[1]\n"
+    "fmla v17.4s, v30.4s, v8.s[2]\n"
+    "fmla v18.4s, v30.4s, v8.s[3]\n"
+    "fmla v19.4s, v30.4s, v9.s[0]\n"
+    "ldr q30, [%x[params], #0x100]\n"
+    "fmla v12.4s, v29.4s, v6.s[2]\n"
+    "fmla v13.4s, v29.4s, v6.s[3]\n"
+    "fmla v14.4s, v29.4s, v7.s[0]\n"
+    "fmla v15.4s, v29.4s, v7.s[1]\n"
+    "fmla v16.4s, v29.4s, v8.s[2]\n"
+    "fmla v17.4s, v29.4s, v8.s[3]\n"
+    "fmla v18.4s, v29.4s, v9.s[0]\n"
+    "fmla v19.4s, v29.4s, v9.s[1]\n"
+    "ldr q29, [%x[params], #0x110]\n"
+    "fmla v12.4s, v28.4s, v6.s[3]\n"
+    "fmla v13.4s, v28.4s, v7.s[0]\n"
+    "fmla v14.4s, v28.4s, v7.s[1]\n"
+    "fmla v15.4s, v28.4s, v7.s[2]\n"
+    "fmla v16.4s, v28.4s, v8.s[3]\n"
+    "fmla v17.4s, v28.4s, v9.s[0]\n"
+    "fmla v18.4s, v28.4s, v9.s[1]\n"
+    "fmla v19.4s, v28.4s, v9.s[2]\n"
+    "ldr q28, [%x[params], #0x120]\n"
+    "fmla v12.4s, v27.4s, v7.s[0]\n"
+    "fmla v13.4s, v27.4s, v7.s[1]\n"
+    "fmla v14.4s, v27.4s, v7.s[2]\n"
+    "fmla v15.4s, v27.4s, v7.s[3]\n"
+    "fmla v16.4s, v27.4s, v9.s[0]\n"
+    "fmla v17.4s, v27.4s, v9.s[1]\n"
+    "fmla v18.4s, v27.4s, v9.s[2]\n"
+    "fmla v19.4s, v27.4s, v9.s[3]\n"
+    "ldr q27, [%x[params], #0x130]\n"
+    "add %x[params], %x[params], #0x140\n"
+    "fmla v12.4s, v31.4s, v8.s[0]\n"
+    "fmla v13.4s, v31.4s, v8.s[1]\n"
+    "fmla v14.4s, v31.4s, v8.s[2]\n"
+    "fmla v15.4s, v31.4s, v8.s[3]\n"
+    "fmla v16.4s, v31.4s, v10.s[0]\n"
+    "fmla v17.4s, v31.4s, v10.s[1]\n"
+    "fmla v18.4s, v31.4s, v10.s[2]\n"
+    "fmla v19.4s, v31.4s, v10.s[3]\n"
+    "fmla v12.4s, v30.4s, v8.s[1]\n"
+    "fmla v13.4s, v30.4s, v8.s[2]\n"
+    "fmla v14.4s, v30.4s, v8.s[3]\n"
+    "fmla v15.4s, v30.4s, v9.s[0]\n"
+    "fmla v16.4s, v30.4s, v10.s[1]\n"
+    "fmla v17.4s, v30.4s, v10.s[2]\n"
+    "fmla v18.4s, v30.4s, v10.s[3]\n"
+    "fmla v19.4s, v30.4s, v11.s[0]\n"
+    "fmla v12.4s, v29.4s, v8.s[2]\n"
+    "fmla v13.4s, v29.4s, v8.s[3]\n"
+    "fmla v14.4s, v29.4s, v9.s[0]\n"
+    "fmla v15.4s, v29.4s, v9.s[1]\n"
+    "fmla v16.4s, v29.4s, v10.s[2]\n"
+    "fmla v17.4s, v29.4s, v10.s[3]\n"
+    "fmla v18.4s, v29.4s, v11.s[0]\n"
+    "fmla v19.4s, v29.4s, v11.s[1]\n"
+    "fmla v12.4s, v28.4s, v8.s[3]\n"
+    "fmla v13.4s, v28.4s, v9.s[0]\n"
+    "fmla v14.4s, v28.4s, v9.s[1]\n"
+    "fmla v15.4s, v28.4s, v9.s[2]\n"
+    "fmla v16.4s, v28.4s, v10.s[3]\n"
+    "fmla v17.4s, v28.4s, v11.s[0]\n"
+    "fmla v18.4s, v28.4s, v11.s[1]\n"
+    "fmla v19.4s, v28.4s, v11.s[2]\n"
+    "fmla v12.4s, v27.4s, v9.s[0]\n"
+    "fmla v13.4s, v27.4s, v9.s[1]\n"
+    "fmla v14.4s, v27.4s, v9.s[2]\n"
+    "fmla v15.4s, v27.4s, v9.s[3]\n"
+    "fmla v16.4s, v27.4s, v11.s[0]\n"
+    "fmla v17.4s, v27.4s, v11.s[1]\n"
+    "fmla v18.4s, v27.4s, v11.s[2]\n"
+    "fmla v19.4s, v27.4s, v11.s[3]\n"
+    "fmin v12.4s, v12.4s, v20.4s\n"
+    "fmin v13.4s, v13.4s, v20.4s\n"
+    "fmin v14.4s, v14.4s, v20.4s\n"
+    "fmax v12.4s, v12.4s, v21.4s\n"
+    "str q12, [x13, x25]\n"
+    "fmax v13.4s, v13.4s, v21.4s\n"
+    "fmax v14.4s, v14.4s, v21.4s\n"
+    "str q13, [x12, x25]\n"
+    "fmin v15.4s, v15.4s, v20.4s\n"
+    "fmin v16.4s, v16.4s, v20.4s\n"
+    "str q14, [x10, x25]\n"
+    "fmin v17.4s, v17.4s, v20.4s\n"
+    "fmin v18.4s, v18.4s, v20.4s\n"
+    "fmax v15.4s, v15.4s, v21.4s\n"
+    "str q15, [x9, x25]\n"
+    "fmax v16.4s, v16.4s, v21.4s\n"
+    "fmax v17.4s, v17.4s, v21.4s\n"
+    "str q16, [x27, x25]\n"
+    "fmax v18.4s, v18.4s, v21.4s\n"
+    "fmin v19.4s, v19.4s, v20.4s\n"
+    "str q17, [x26, x25]\n"
+    "fmax v19.4s, v19.4s, v21.4s\n"
+    "str q18, [x24, x25]\n"
+    "str q19, [x23, x25]\n"
+    "add x25, x25, #0x10\n"
+    "3:"  // Output channel oddments
+    "tst %x[channel_multiplier], #0x3\n"
+    "beq 6f\n"
+    "ldr q12, [%x[params], #0x0]\n"
+    "mov v13.16b, v12.16b\n"
+    "ldr q31, [%x[params], #0x10]\n"
+    "mov v14.16b, v12.16b\n"
+    "ldr q30, [%x[params], #0x20]\n"
+    "mov v15.16b, v12.16b\n"
+    "ldr q29, [%x[params], #0x30]\n"
+    "mov v16.16b, v12.16b\n"
+    "ldr q28, [%x[params], #0x40]\n"
+    "mov v17.16b, v12.16b\n"
+    "ldr q27, [%x[params], #0x50]\n"
+    "mov v18.16b, v12.16b\n"
+    "mov v19.16b, v12.16b\n"
+    "fmla v12.4s, v31.4s, v0.s[0]\n"
+    "fmla v13.4s, v31.4s, v0.s[1]\n"
+    "fmla v14.4s, v31.4s, v0.s[2]\n"
+    "fmla v15.4s, v31.4s, v0.s[3]\n"
+    "fmla v16.4s, v31.4s, v2.s[0]\n"
+    "fmla v17.4s, v31.4s, v2.s[1]\n"
+    "fmla v18.4s, v31.4s, v2.s[2]\n"
+    "fmla v19.4s, v31.4s, v2.s[3]\n"
+    "ldr q31, [%x[params], #0x60]\n"
+    "fmla v12.4s, v30.4s, v0.s[1]\n"
+    "fmla v13.4s, v30.4s, v0.s[2]\n"
+    "fmla v14.4s, v30.4s, v0.s[3]\n"
+    "fmla v15.4s, v30.4s, v1.s[0]\n"
+    "fmla v16.4s, v30.4s, v2.s[1]\n"
+    "fmla v17.4s, v30.4s, v2.s[2]\n"
+    "fmla v18.4s, v30.4s, v2.s[3]\n"
+    "fmla v19.4s, v30.4s, v3.s[0]\n"
+    "ldr q30, [%x[params], #0x70]\n"
+    "fmla v12.4s, v29.4s, v0.s[2]\n"
+    "fmla v13.4s, v29.4s, v0.s[3]\n"
+    "fmla v14.4s, v29.4s, v1.s[0]\n"
+    "fmla v15.4s, v29.4s, v1.s[1]\n"
+    "fmla v16.4s, v29.4s, v2.s[2]\n"
+    "fmla v17.4s, v29.4s, v2.s[3]\n"
+    "fmla v18.4s, v29.4s, v3.s[0]\n"
+    "fmla v19.4s, v29.4s, v3.s[1]\n"
+    "ldr q29, [%x[params], #0x80]\n"
+    "fmla v12.4s, v28.4s, v0.s[3]\n"
+    "fmla v13.4s, v28.4s, v1.s[0]\n"
+    "fmla v14.4s, v28.4s, v1.s[1]\n"
+    "fmla v15.4s, v28.4s, v1.s[2]\n"
+    "fmla v16.4s, v28.4s, v2.s[3]\n"
+    "fmla v17.4s, v28.4s, v3.s[0]\n"
+    "fmla v18.4s, v28.4s, v3.s[1]\n"
+    "fmla v19.4s, v28.4s, v3.s[2]\n"
+    "ldr q28, [%x[params], #0x90]\n"
+    "fmla v12.4s, v27.4s, v1.s[0]\n"
+    "fmla v13.4s, v27.4s, v1.s[1]\n"
+    "fmla v14.4s, v27.4s, v1.s[2]\n"
+    "fmla v15.4s, v27.4s, v1.s[3]\n"
+    "fmla v16.4s, v27.4s, v3.s[0]\n"
+    "fmla v17.4s, v27.4s, v3.s[1]\n"
+    "fmla v18.4s, v27.4s, v3.s[2]\n"
+    "fmla v19.4s, v27.4s, v3.s[3]\n"
+    "ldr q27, [%x[params], #0xa0]\n"
+    "fmla v12.4s, v31.4s, v2.s[0]\n"
+    "fmla v13.4s, v31.4s, v2.s[1]\n"
+    "fmla v14.4s, v31.4s, v2.s[2]\n"
+    "fmla v15.4s, v31.4s, v2.s[3]\n"
+    "fmla v16.4s, v31.4s, v4.s[0]\n"
+    "fmla v17.4s, v31.4s, v4.s[1]\n"
+    "fmla v18.4s, v31.4s, v4.s[2]\n"
+    "fmla v19.4s, v31.4s, v4.s[3]\n"
+    "ldr q31, [%x[params], #0xb0]\n"
+    "fmla v12.4s, v30.4s, v2.s[1]\n"
+    "fmla v13.4s, v30.4s, v2.s[2]\n"
+    "fmla v14.4s, v30.4s, v2.s[3]\n"
+    "fmla v15.4s, v30.4s, v3.s[0]\n"
+    "fmla v16.4s, v30.4s, v4.s[1]\n"
+    "fmla v17.4s, v30.4s, v4.s[2]\n"
+    "fmla v18.4s, v30.4s, v4.s[3]\n"
+    "fmla v19.4s, v30.4s, v5.s[0]\n"
+    "ldr q30, [%x[params], #0xc0]\n"
+    "fmla v12.4s, v29.4s, v2.s[2]\n"
+    "fmla v13.4s, v29.4s, v2.s[3]\n"
+    "fmla v14.4s, v29.4s, v3.s[0]\n"
+    "fmla v15.4s, v29.4s, v3.s[1]\n"
+    "fmla v16.4s, v29.4s, v4.s[2]\n"
+    "fmla v17.4s, v29.4s, v4.s[3]\n"
+    "fmla v18.4s, v29.4s, v5.s[0]\n"
+    "fmla v19.4s, v29.4s, v5.s[1]\n"
+    "ldr q29, [%x[params], #0xd0]\n"
+    "fmla v12.4s, v28.4s, v2.s[3]\n"
+    "fmla v13.4s, v28.4s, v3.s[0]\n"
+    "fmla v14.4s, v28.4s, v3.s[1]\n"
+    "fmla v15.4s, v28.4s, v3.s[2]\n"
+    "fmla v16.4s, v28.4s, v4.s[3]\n"
+    "fmla v17.4s, v28.4s, v5.s[0]\n"
+    "fmla v18.4s, v28.4s, v5.s[1]\n"
+    "fmla v19.4s, v28.4s, v5.s[2]\n"
+    "ldr q28, [%x[params], #0xe0]\n"
+    "fmla v12.4s, v27.4s, v3.s[0]\n"
+    "fmla v13.4s, v27.4s, v3.s[1]\n"
+    "fmla v14.4s, v27.4s, v3.s[2]\n"
+    "fmla v15.4s, v27.4s, v3.s[3]\n"
+    "fmla v16.4s, v27.4s, v5.s[0]\n"
+    "fmla v17.4s, v27.4s, v5.s[1]\n"
+    "fmla v18.4s, v27.4s, v5.s[2]\n"
+    "fmla v19.4s, v27.4s, v5.s[3]\n"
+    "ldr q27, [%x[params], #0xf0]\n"
+    "fmla v12.4s, v31.4s, v4.s[0]\n"
+    "fmla v13.4s, v31.4s, v4.s[1]\n"
+    "fmla v14.4s, v31.4s, v4.s[2]\n"
+    "fmla v15.4s, v31.4s, v4.s[3]\n"
+    "fmla v16.4s, v31.4s, v6.s[0]\n"
+    "fmla v17.4s, v31.4s, v6.s[1]\n"
+    "fmla v18.4s, v31.4s, v6.s[2]\n"
+    "fmla v19.4s, v31.4s, v6.s[3]\n"
+    "ldr q31, [%x[params], #0x100]\n"
+    "fmla v12.4s, v30.4s, v4.s[1]\n"
+    "fmla v13.4s, v30.4s, v4.s[2]\n"
+    "fmla v14.4s, v30.4s, v4.s[3]\n"
+    "fmla v15.4s, v30.4s, v5.s[0]\n"
+    "fmla v16.4s, v30.4s, v6.s[1]\n"
+    "fmla v17.4s, v30.4s, v6.s[2]\n"
+    "fmla v18.4s, v30.4s, v6.s[3]\n"
+    "fmla v19.4s, v30.4s, v7.s[0]\n"
+    "ldr q30, [%x[params], #0x110]\n"
+    "fmla v12.4s, v29.4s, v4.s[2]\n"
+    "fmla v13.4s, v29.4s, v4.s[3]\n"
+    "fmla v14.4s, v29.4s, v5.s[0]\n"
+    "fmla v15.4s, v29.4s, v5.s[1]\n"
+    "fmla v16.4s, v29.4s, v6.s[2]\n"
+    "fmla v17.4s, v29.4s, v6.s[3]\n"
+    "fmla v18.4s, v29.4s, v7.s[0]\n"
+    "fmla v19.4s, v29.4s, v7.s[1]\n"
+    "ldr q29, [%x[params], #0x120]\n"
+    "fmla v12.4s, v28.4s, v4.s[3]\n"
+    "fmla v13.4s, v28.4s, v5.s[0]\n"
+    "fmla v14.4s, v28.4s, v5.s[1]\n"
+    "fmla v15.4s, v28.4s, v5.s[2]\n"
+    "fmla v16.4s, v28.4s, v6.s[3]\n"
+    "fmla v17.4s, v28.4s, v7.s[0]\n"
+    "fmla v18.4s, v28.4s, v7.s[1]\n"
+    "fmla v19.4s, v28.4s, v7.s[2]\n"
+    "ldr q28, [%x[params], #0x130]\n"
+    "fmla v12.4s, v27.4s, v5.s[0]\n"
+    "fmla v13.4s, v27.4s, v5.s[1]\n"
+    "fmla v14.4s, v27.4s, v5.s[2]\n"
+    "fmla v15.4s, v27.4s, v5.s[3]\n"
+    "fmla v16.4s, v27.4s, v7.s[0]\n"
+    "fmla v17.4s, v27.4s, v7.s[1]\n"
+    "fmla v18.4s, v27.4s, v7.s[2]\n"
+    "fmla v19.4s, v27.4s, v7.s[3]\n"
+    "ldr q27, [%x[params], #0x140]\n"
+    "fmla v12.4s, v31.4s, v6.s[0]\n"
+    "fmla v13.4s, v31.4s, v6.s[1]\n"
+    "fmla v14.4s, v31.4s, v6.s[2]\n"
+    "fmla v15.4s, v31.4s, v6.s[3]\n"
+    "fmla v16.4s, v31.4s, v8.s[0]\n"
+    "fmla v17.4s, v31.4s, v8.s[1]\n"
+    "fmla v18.4s, v31.4s, v8.s[2]\n"
+    "fmla v19.4s, v31.4s, v8.s[3]\n"
+    "ldr q31, [%x[params], #0x150]\n"
+    "fmla v12.4s, v30.4s, v6.s[1]\n"
+    "fmla v13.4s, v30.4s, v6.s[2]\n"
+    "fmla v14.4s, v30.4s, v6.s[3]\n"
+    "fmla v15.4s, v30.4s, v7.s[0]\n"
+    "fmla v16.4s, v30.4s, v8.s[1]\n"
+    "fmla v17.4s, v30.4s, v8.s[2]\n"
+    "fmla v18.4s, v30.4s, v8.s[3]\n"
+    "fmla v19.4s, v30.4s, v9.s[0]\n"
+    "ldr q30, [%x[params], #0x160]\n"
+    "fmla v12.4s, v29.4s, v6.s[2]\n"
+    "fmla v13.4s, v29.4s, v6.s[3]\n"
+    "fmla v14.4s, v29.4s, v7.s[0]\n"
+    "fmla v15.4s, v29.4s, v7.s[1]\n"
+    "fmla v16.4s, v29.4s, v8.s[2]\n"
+    "fmla v17.4s, v29.4s, v8.s[3]\n"
+    "fmla v18.4s, v29.4s, v9.s[0]\n"
+    "fmla v19.4s, v29.4s, v9.s[1]\n"
+    "ldr q29, [%x[params], #0x170]\n"
+    "fmla v12.4s, v28.4s, v6.s[3]\n"
+    "fmla v13.4s, v28.4s, v7.s[0]\n"
+    "fmla v14.4s, v28.4s, v7.s[1]\n"
+    "fmla v15.4s, v28.4s, v7.s[2]\n"
+    "fmla v16.4s, v28.4s, v8.s[3]\n"
+    "fmla v17.4s, v28.4s, v9.s[0]\n"
+    "fmla v18.4s, v28.4s, v9.s[1]\n"
+    "fmla v19.4s, v28.4s, v9.s[2]\n"
+    "ldr q28, [%x[params], #0x180]\n"
+    "fmla v12.4s, v27.4s, v7.s[0]\n"
+    "fmla v13.4s, v27.4s, v7.s[1]\n"
+    "fmla v14.4s, v27.4s, v7.s[2]\n"
+    "fmla v15.4s, v27.4s, v7.s[3]\n"
+    "fmla v16.4s, v27.4s, v9.s[0]\n"
+    "fmla v17.4s, v27.4s, v9.s[1]\n"
+    "fmla v18.4s, v27.4s, v9.s[2]\n"
+    "fmla v19.4s, v27.4s, v9.s[3]\n"
+    "ldr q27, [%x[params], #0x190]\n"
+    "add %x[params], %x[params], #0x1a0\n"
+    "fmla v12.4s, v31.4s, v8.s[0]\n"
+    "fmla v13.4s, v31.4s, v8.s[1]\n"
+    "fmla v14.4s, v31.4s, v8.s[2]\n"
+    "fmla v15.4s, v31.4s, v8.s[3]\n"
+    "fmla v16.4s, v31.4s, v10.s[0]\n"
+    "fmla v17.4s, v31.4s, v10.s[1]\n"
+    "fmla v18.4s, v31.4s, v10.s[2]\n"
+    "fmla v19.4s, v31.4s, v10.s[3]\n"
+    "fmla v12.4s, v30.4s, v8.s[1]\n"
+    "fmla v13.4s, v30.4s, v8.s[2]\n"
+    "fmla v14.4s, v30.4s, v8.s[3]\n"
+    "fmla v15.4s, v30.4s, v9.s[0]\n"
+    "fmla v16.4s, v30.4s, v10.s[1]\n"
+    "fmla v17.4s, v30.4s, v10.s[2]\n"
+    "fmla v18.4s, v30.4s, v10.s[3]\n"
+    "fmla v19.4s, v30.4s, v11.s[0]\n"
+    "fmla v12.4s, v29.4s, v8.s[2]\n"
+    "fmla v13.4s, v29.4s, v8.s[3]\n"
+    "fmla v14.4s, v29.4s, v9.s[0]\n"
+    "fmla v15.4s, v29.4s, v9.s[1]\n"
+    "fmla v16.4s, v29.4s, v10.s[2]\n"
+    "fmla v17.4s, v29.4s, v10.s[3]\n"
+    "fmla v18.4s, v29.4s, v11.s[0]\n"
+    "fmla v19.4s, v29.4s, v11.s[1]\n"
+    "fmla v12.4s, v28.4s, v8.s[3]\n"
+    "fmla v13.4s, v28.4s, v9.s[0]\n"
+    "fmla v14.4s, v28.4s, v9.s[1]\n"
+    "fmla v15.4s, v28.4s, v9.s[2]\n"
+    "fmla v16.4s, v28.4s, v10.s[3]\n"
+    "fmla v17.4s, v28.4s, v11.s[0]\n"
+    "fmla v18.4s, v28.4s, v11.s[1]\n"
+    "fmla v19.4s, v28.4s, v11.s[2]\n"
+    "fmla v12.4s, v27.4s, v9.s[0]\n"
+    "fmla v13.4s, v27.4s, v9.s[1]\n"
+    "fmla v14.4s, v27.4s, v9.s[2]\n"
+    "fmla v15.4s, v27.4s, v9.s[3]\n"
+    "fmla v16.4s, v27.4s, v11.s[0]\n"
+    "fmla v17.4s, v27.4s, v11.s[1]\n"
+    "fmla v18.4s, v27.4s, v11.s[2]\n"
+    "fmla v19.4s, v27.4s, v11.s[3]\n"
+    "fmin v12.4s, v12.4s, v20.4s\n"
+    "fmin v13.4s, v13.4s, v20.4s\n"
+    "fmin v14.4s, v14.4s, v20.4s\n"
+    "fmax v12.4s, v12.4s, v21.4s\n"
+    "fmax v13.4s, v13.4s, v21.4s\n"
+    "fmax v14.4s, v14.4s, v21.4s\n"
+    "fmin v15.4s, v15.4s, v20.4s\n"
+    "fmin v16.4s, v16.4s, v20.4s\n"
+    "fmin v17.4s, v17.4s, v20.4s\n"
+    "fmax v15.4s, v15.4s, v21.4s\n"
+    "fmax v16.4s, v16.4s, v21.4s\n"
+    "fmax v17.4s, v17.4s, v21.4s\n"
+    "fmin v18.4s, v18.4s, v20.4s\n"
+    "fmin v19.4s, v19.4s, v20.4s\n"
+    "fmax v18.4s, v18.4s, v21.4s\n"
+    "fmax v19.4s, v19.4s, v21.4s\n"
+    "tbz %x[channel_multiplier], #1, 4f\n"
+    "add x19, x13, x25\n"
+    "st1 { v12.d }[0], [x19]\n"
+    "add x19, x12, x25\n"
+    "st1 { v13.d }[0], [x19]\n"
+    "add x19, x10, x25\n"
+    "st1 { v14.d }[0], [x19]\n"
+    "add x19, x9, x25\n"
+    "st1 { v15.d }[0], [x19]\n"
+    "add x19, x27, x25\n"
+    "st1 { v16.d }[0], [x19]\n"
+    "add x19, x26, x25\n"
+    "st1 { v17.d }[0], [x19]\n"
+    "add x19, x24, x25\n"
+    "st1 { v18.d }[0], [x19]\n"
+    "add x19, x23, x25\n"
+    "st1 { v19.d }[0], [x19]\n"
+    "add x25, x25, #0x8\n"
+    "tbz %x[channel_multiplier], #0, 5f\n"
+    "add x19, x13, x25\n"
+    "st1 { v12.s }[2], [x19]\n"
+    "add x19, x12, x25\n"
+    "st1 { v13.s }[2], [x19]\n"
+    "add x19, x10, x25\n"
+    "st1 { v14.s }[2], [x19]\n"
+    "add x19, x9, x25\n"
+    "st1 { v15.s }[2], [x19]\n"
+    "add x19, x27, x25\n"
+    "st1 { v16.s }[2], [x19]\n"
+    "add x19, x26, x25\n"
+    "st1 { v17.s }[2], [x19]\n"
+    "add x19, x24, x25\n"
+    "st1 { v18.s }[2], [x19]\n"
+    "add x19, x23, x25\n"
+    "st1 { v19.s }[2], [x19]\n"
+    "b 5f\n"
+    "4:"  // Output channel oddments: Store: Bit 1: Unset
+    "tbz %x[channel_multiplier], #0, 5f\n"
+    "add x19, x13, x25\n"
+    "st1 { v12.s }[0], [x19]\n"
+    "add x19, x12, x25\n"
+    "st1 { v13.s }[0], [x19]\n"
+    "add x19, x10, x25\n"
+    "st1 { v14.s }[0], [x19]\n"
+    "add x19, x9, x25\n"
+    "st1 { v15.s }[0], [x19]\n"
+    "add x19, x27, x25\n"
+    "st1 { v16.s }[0], [x19]\n"
+    "add x19, x26, x25\n"
+    "st1 { v17.s }[0], [x19]\n"
+    "add x19, x24, x25\n"
+    "st1 { v18.s }[0], [x19]\n"
+    "add x19, x23, x25\n"
+    "st1 { v19.s }[0], [x19]\n"
+    "5:"  // Output channel oddments: Store: Bit 1: End
+
+    "6:"  // End
+
+    : [params] "+&r" (params)
+    : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
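
Editor's note: the oddment store path at the end of the kernel above handles the final one-to-three leftover channels by testing bits of the remaining count: bit 1 selects a two-lane 64-bit store (`st1 { vN.d }[0]`), bit 0 a single-lane store (`st1 { vN.s }[i]`). A minimal scalar C++ sketch of the same tail logic follows; `store_oddments` and `lanes` are hypothetical names standing in for one accumulator register (e.g. v12), not library API.

```cpp
// Sketch of the oddment store pattern: for n leftover channels (0 < n < 4),
// bit 1 of n triggers a two-lane store and bit 0 a one-lane store, mirroring
// the "tbz ..., #1" / "tbz ..., #0" branches in the assembly above.
#include <cstddef>

static void store_oddments(float *out, const float lanes[4], std::size_t n)
{
  std::size_t i = 0;
  if (n & 2)   // "st1 { vN.d }[0]": write two lanes at once
  {
    out[0] = lanes[0];
    out[1] = lanes[1];
    i = 2;     // the asm advances the offset ("add x25, x25, #0x8")
  }
  if (n & 1)   // "st1 { vN.s }[i]": write the final lane
  {
    out[i] = lanes[i];
  }
}
```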
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000..2cc2f7c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+struct a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*kern_type)(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int output_rows(void) { return 2; };
+  constexpr static unsigned int output_cols(void) { return 8; };
+
+  constexpr static unsigned int output_col_regs(void) { return 2; };
+
+  kern_type kernel = a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+  a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
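
Editor's note: the header above is a strategy descriptor, not executable logic: the typedefs name the element types, `output_rows()`/`output_cols()` give the 2x8 tile shape, and `kernel` points at the assembly implementation. A hedged sketch (assumed, not the library's actual driver code) of how a depthfirst driver could consume such a strategy:

```cpp
// Minimal sketch, assuming a strategy shaped like the struct above: read the
// tile geometry from the static members, then invoke the kernel pointer with
// the argument list declared by kern_type.
template <class Strategy>
void run_tile_sketch(const Strategy &strat,
                     const float *const *inptrs, float *const *outptrs,
                     const float *weights, const float *bias,
                     unsigned int kernel_points, unsigned int n_channels,
                     float act_min, float act_max)
{
  static_assert(Strategy::output_rows() == 2 && Strategy::output_cols() == 8,
                "this sketch assumes the 2x8 tile declared above");
  strat.kernel(inptrs, outptrs, weights, bias,
               kernel_points, n_channels, act_min, act_max);
}
```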
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..c93037d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+  const float *const *const inptrs,
+  float *const *const outptrs,
+  const float *weights,
+  const float *bias,
+  const unsigned int kernel_points,
+  const unsigned int n_output_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  const float minmax_vals[2] = { activation_min, activation_max };
+
+  __asm__ __volatile__(
+    "ld1r { v11.4s }, [%x[minmax_vals]]\n"
+    "mov x10, #0x0\n"
+    "add x19, %x[minmax_vals], #0x4\n"
+    "ld1r { v10.4s }, [x19]\n"
+    "lsr x9, %x[n_output_channels], #0x2\n"
+    "cbz x9, 8f\n"
+    "1:"  // Output channel loop
+    "movi v16.16b, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "lsl x19, x10, #0x2\n"
+    "ldr q16, [%x[bias], x19]\n"
+    "2:"  // Output channel loop: Load bias: Done
+    "mov v9.16b, v16.16b\n"
+    "ldr q8, [%x[weights], #0x0]\n"
+    "mov x19, %x[inptrs]\n"
+    "mov v7.16b, v16.16b\n"
+    "ldp x24, x28, [x19], #0x10\n"
+    "lsr x20, %x[kernel_points], #0x1\n"
+    "mov v6.16b, v16.16b\n"
+    "ldr q5, [x24, #0x0]\n"
+    "mov v4.16b, v16.16b\n"
+    "add %x[weights], %x[weights], #0x10\n"
+    "mov v3.16b, v16.16b\n"
+    "ldr q2, [x24, #0x10]\n"
+    "mov v1.16b, v16.16b\n"
+    "ldr q0, [x28, #0x0]\n"
+    "mov v31.16b, v16.16b\n"
+    "ldr q30, [x28, #0x10]\n"
+    "mov v29.16b, v16.16b\n"
+    "mov v28.16b, v16.16b\n"
+    "mov v27.16b, v16.16b\n"
+    "mov v26.16b, v16.16b\n"
+    "mov v25.16b, v16.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "mov v23.16b, v16.16b\n"
+    "mov v22.16b, v16.16b\n"
+    "mov v21.16b, v16.16b\n"
+    "cbz x20, 6f\n"
+    "ldp x24, x28, [x19], #0x10\n"
+    "ldr q20, [%x[weights], #0x0]\n"
+    "subs x20, x20, #0x1\n"
+    "add %x[weights], %x[weights], #0x10\n"
+    "ldr q19, [x24, #0x0]\n"
+    "ldr q18, [x24, #0x10]\n"
+    "ldr q17, [x28, #0x0]\n"
+    "ldr q16, [x28, #0x10]\n"
+    "beq 4f\n"
+    "3:"  // Output channel loop: Kernel loop
+    "fmla v9.4s, v8.4s, v5.s[0]\n"
+    "ldp x24, x28, [x19], #0x10\n"
+    "subs x20, x20, #0x1\n"
+    "fmla v7.4s, v8.4s, v5.s[1]\n"
+    "fmla v6.4s, v8.4s, v5.s[2]\n"
+    "fmla v4.4s, v8.4s, v5.s[3]\n"
+    "ldr q5, [x24, #0x0]\n"
+    "fmla v3.4s, v8.4s, v2.s[0]\n"
+    "fmla v1.4s, v8.4s, v2.s[1]\n"
+    "fmla v31.4s, v8.4s, v2.s[2]\n"
+    "fmla v29.4s, v8.4s, v2.s[3]\n"
+    "ldr q2, [x24, #0x10]\n"
+    "fmla v28.4s, v8.4s, v0.s[0]\n"
+    "fmla v27.4s, v8.4s, v0.s[1]\n"
+    "fmla v26.4s, v8.4s, v0.s[2]\n"
+    "fmla v25.4s, v8.4s, v0.s[3]\n"
+    "ldr q0, [x28, #0x0]\n"
+    "fmla v24.4s, v8.4s, v30.s[0]\n"
+    "fmla v23.4s, v8.4s, v30.s[1]\n"
+    "fmla v22.4s, v8.4s, v30.s[2]\n"
+    "fmla v21.4s, v8.4s, v30.s[3]\n"
+    "ldr q30, [x28, #0x10]\n"
+    "fmla v9.4s, v20.4s, v19.s[0]\n"
+    "ldr q8, [%x[weights], #0x0]\n"
+    "fmla v7.4s, v20.4s, v19.s[1]\n"
+    "ldp x24, x28, [x19], #0x10\n"
+    "fmla v6.4s, v20.4s, v19.s[2]\n"
+    "fmla v4.4s, v20.4s, v19.s[3]\n"
+    "ldr q19, [x24, #0x0]\n"
+    "fmla v3.4s, v20.4s, v18.s[0]\n"
+    "fmla v1.4s, v20.4s, v18.s[1]\n"
+    "fmla v31.4s, v20.4s, v18.s[2]\n"
+    "fmla v29.4s, v20.4s, v18.s[3]\n"
+    "ldr q18, [x24, #0x10]\n"
+    "fmla v28.4s, v20.4s, v17.s[0]\n"
+    "fmla v27.4s, v20.4s, v17.s[1]\n"
+    "fmla v26.4s, v20.4s, v17.s[2]\n"
+    "fmla v25.4s, v20.4s, v17.s[3]\n"
+    "ldr q17, [x28, #0x0]\n"
+    "fmla v24.4s, v20.4s, v16.s[0]\n"
+    "fmla v23.4s, v20.4s, v16.s[1]\n"
+    "fmla v22.4s, v20.4s, v16.s[2]\n"
+    "fmla v21.4s, v20.4s, v16.s[3]\n"
+    "ldr q16, [x28, #0x10]\n"
+    "ldr q20, [%x[weights], #0x10]\n"
+    "add %x[weights], %x[weights], #0x20\n"
+    "bgt 3b\n"
+    "4:"  // Output channel loop: Kernel loop tail
+    "tbnz %x[kernel_points], #0, 5f\n"
+    "fmla v9.4s, v8.4s, v5.s[0]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "fmla v7.4s, v8.4s, v5.s[1]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "lsl x27, x10, #0x2\n"
+    "fmla v6.4s, v8.4s, v5.s[2]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "fmla v4.4s, v8.4s, v5.s[3]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "fmla v3.4s, v8.4s, v2.s[0]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "fmla v1.4s, v8.4s, v2.s[1]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "fmla v31.4s, v8.4s, v2.s[2]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "fmla v29.4s, v8.4s, v2.s[3]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "fmla v28.4s, v8.4s, v0.s[0]\n"
+    "fmla v27.4s, v8.4s, v0.s[1]\n"
+    "fmla v26.4s, v8.4s, v0.s[2]\n"
+    "fmla v25.4s, v8.4s, v0.s[3]\n"
+    "fmla v24.4s, v8.4s, v30.s[0]\n"
+    "fmla v23.4s, v8.4s, v30.s[1]\n"
+    "fmla v22.4s, v8.4s, v30.s[2]\n"
+    "fmla v21.4s, v8.4s, v30.s[3]\n"
+    "fmla v9.4s, v20.4s, v19.s[0]\n"
+    "fmla v7.4s, v20.4s, v19.s[1]\n"
+    "fmla v6.4s, v20.4s, v19.s[2]\n"
+    "fmla v4.4s, v20.4s, v19.s[3]\n"
+    "fmla v3.4s, v20.4s, v18.s[0]\n"
+    "fmla v1.4s, v20.4s, v18.s[1]\n"
+    "fmla v31.4s, v20.4s, v18.s[2]\n"
+    "fmla v29.4s, v20.4s, v18.s[3]\n"
+    "fmla v28.4s, v20.4s, v17.s[0]\n"
+    "fmla v27.4s, v20.4s, v17.s[1]\n"
+    "fmla v26.4s, v20.4s, v17.s[2]\n"
+    "fmla v25.4s, v20.4s, v17.s[3]\n"
+    "fmla v24.4s, v20.4s, v16.s[0]\n"
+    "fmla v23.4s, v20.4s, v16.s[1]\n"
+    "fmla v22.4s, v20.4s, v16.s[2]\n"
+    "fmla v21.4s, v20.4s, v16.s[3]\n"
+    "fmin v9.4s, v9.4s, v10.4s\n"
+    "fmin v7.4s, v7.4s, v10.4s\n"
+    "fmin v6.4s, v6.4s, v10.4s\n"
+    "fmax v9.4s, v9.4s, v11.4s\n"
+    "str q9, [x19, x27]\n"
+    "fmax v7.4s, v7.4s, v11.4s\n"
+    "fmax v6.4s, v6.4s, v11.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "fmin v4.4s, v4.4s, v10.4s\n"
+    "str q7, [x20, x27]\n"
+    "fmin v3.4s, v3.4s, v10.4s\n"
+    "fmin v1.4s, v1.4s, v10.4s\n"
+    "str q6, [x21, x27]\n"
+    "fmax v4.4s, v4.4s, v11.4s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "fmin v31.4s, v31.4s, v10.4s\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "fmax v3.4s, v3.4s, v11.4s\n"
+    "str q4, [x22, x27]\n"
+    "fmax v1.4s, v1.4s, v11.4s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "fmax v31.4s, v31.4s, v11.4s\n"
+    "str q3, [x23, x27]\n"
+    "fmin v29.4s, v29.4s, v10.4s\n"
+    "str q1, [x24, x27]\n"
+    "fmin v28.4s, v28.4s, v10.4s\n"
+    "str q31, [x25, x27]\n"
+    "fmin v27.4s, v27.4s, v10.4s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "fmax v29.4s, v29.4s, v11.4s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "fmax v28.4s, v28.4s, v11.4s\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "fmax v27.4s, v27.4s, v11.4s\n"
+    "str q29, [x26, x27]\n"
+    "fmin v26.4s, v26.4s, v10.4s\n"
+    "str q28, [x19, x27]\n"
+    "fmin v25.4s, v25.4s, v10.4s\n"
+    "str q27, [x20, x27]\n"
+    "fmin v24.4s, v24.4s, v10.4s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "fmax v26.4s, v26.4s, v11.4s\n"
+    "str q26, [x21, x27]\n"
+    "fmax v25.4s, v25.4s, v11.4s\n"
+    "fmax v24.4s, v24.4s, v11.4s\n"
+    "str q25, [x22, x27]\n"
+    "fmin v23.4s, v23.4s, v10.4s\n"
+    "fmin v22.4s, v22.4s, v10.4s\n"
+    "str q24, [x23, x27]\n"
+    "fmin v21.4s, v21.4s, v10.4s\n"
+    "fmax v23.4s, v23.4s, v11.4s\n"
+    "str q23, [x24, x27]\n"
+    "fmax v22.4s, v22.4s, v11.4s\n"
+    "fmax v21.4s, v21.4s, v11.4s\n"
+    "str q22, [x25, x27]\n"
+    "str q21, [x26, x27]\n"
+    "b 7f\n"
+    "5:"  // Output channel loop: Odd tail
+    "fmla v9.4s, v8.4s, v5.s[0]\n"
+    "ldp x24, x28, [x19], #0x10\n"
+    "lsl x27, x10, #0x2\n"
+    "fmla v7.4s, v8.4s, v5.s[1]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "fmla v6.4s, v8.4s, v5.s[2]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "fmla v4.4s, v8.4s, v5.s[3]\n"
+    "ldr q5, [x24, #0x0]\n"
+    "fmla v3.4s, v8.4s, v2.s[0]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "fmla v1.4s, v8.4s, v2.s[1]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "fmla v31.4s, v8.4s, v2.s[2]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "fmla v29.4s, v8.4s, v2.s[3]\n"
+    "ldr q2, [x24, #0x10]\n"
+    "fmla v28.4s, v8.4s, v0.s[0]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "fmla v27.4s, v8.4s, v0.s[1]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "fmla v26.4s, v8.4s, v0.s[2]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "fmla v25.4s, v8.4s, v0.s[3]\n"
+    "ldr q0, [x28, #0x0]\n"
+    "fmla v24.4s, v8.4s, v30.s[0]\n"
+    "fmla v23.4s, v8.4s, v30.s[1]\n"
+    "fmla v22.4s, v8.4s, v30.s[2]\n"
+    "fmla v21.4s, v8.4s, v30.s[3]\n"
+    "ldr q30, [x28, #0x10]\n"
+    "fmla v9.4s, v20.4s, v19.s[0]\n"
+    "ldr q8, [%x[weights], #0x0]\n"
+    "add %x[weights], %x[weights], #0x10\n"
+    "fmla v7.4s, v20.4s, v19.s[1]\n"
+    "fmla v6.4s, v20.4s, v19.s[2]\n"
+    "fmla v4.4s, v20.4s, v19.s[3]\n"
+    "fmla v3.4s, v20.4s, v18.s[0]\n"
+    "fmla v1.4s, v20.4s, v18.s[1]\n"
+    "fmla v31.4s, v20.4s, v18.s[2]\n"
+    "fmla v29.4s, v20.4s, v18.s[3]\n"
+    "fmla v28.4s, v20.4s, v17.s[0]\n"
+    "fmla v27.4s, v20.4s, v17.s[1]\n"
+    "fmla v26.4s, v20.4s, v17.s[2]\n"
+    "fmla v25.4s, v20.4s, v17.s[3]\n"
+    "fmla v24.4s, v20.4s, v16.s[0]\n"
+    "fmla v23.4s, v20.4s, v16.s[1]\n"
+    "fmla v22.4s, v20.4s, v16.s[2]\n"
+    "fmla v21.4s, v20.4s, v16.s[3]\n"
+    "fmla v9.4s, v8.4s, v5.s[0]\n"
+    "fmla v7.4s, v8.4s, v5.s[1]\n"
+    "fmla v6.4s, v8.4s, v5.s[2]\n"
+    "fmla v4.4s, v8.4s, v5.s[3]\n"
+    "fmla v3.4s, v8.4s, v2.s[0]\n"
+    "fmla v1.4s, v8.4s, v2.s[1]\n"
+    "fmla v31.4s, v8.4s, v2.s[2]\n"
+    "fmla v29.4s, v8.4s, v2.s[3]\n"
+    "fmla v28.4s, v8.4s, v0.s[0]\n"
+    "fmla v27.4s, v8.4s, v0.s[1]\n"
+    "fmla v26.4s, v8.4s, v0.s[2]\n"
+    "fmla v25.4s, v8.4s, v0.s[3]\n"
+    "fmla v24.4s, v8.4s, v30.s[0]\n"
+    "fmla v23.4s, v8.4s, v30.s[1]\n"
+    "fmla v22.4s, v8.4s, v30.s[2]\n"
+    "fmla v21.4s, v8.4s, v30.s[3]\n"
+    "fmin v9.4s, v9.4s, v10.4s\n"
+    "fmin v7.4s, v7.4s, v10.4s\n"
+    "fmin v6.4s, v6.4s, v10.4s\n"
+    "fmax v9.4s, v9.4s, v11.4s\n"
+    "str q9, [x19, x27]\n"
+    "fmax v7.4s, v7.4s, v11.4s\n"
+    "fmax v6.4s, v6.4s, v11.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "fmin v4.4s, v4.4s, v10.4s\n"
+    "str q7, [x20, x27]\n"
+    "fmin v3.4s, v3.4s, v10.4s\n"
+    "fmin v1.4s, v1.4s, v10.4s\n"
+    "str q6, [x21, x27]\n"
+    "fmax v4.4s, v4.4s, v11.4s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "fmin v31.4s, v31.4s, v10.4s\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "fmax v3.4s, v3.4s, v11.4s\n"
+    "str q4, [x22, x27]\n"
+    "fmax v1.4s, v1.4s, v11.4s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "fmax v31.4s, v31.4s, v11.4s\n"
+    "str q3, [x23, x27]\n"
+    "fmin v29.4s, v29.4s, v10.4s\n"
+    "str q1, [x24, x27]\n"
+    "fmin v28.4s, v28.4s, v10.4s\n"
+    "str q31, [x25, x27]\n"
+    "fmin v27.4s, v27.4s, v10.4s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "fmax v29.4s, v29.4s, v11.4s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "fmax v28.4s, v28.4s, v11.4s\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "fmax v27.4s, v27.4s, v11.4s\n"
+    "str q29, [x26, x27]\n"
+    "fmin v26.4s, v26.4s, v10.4s\n"
+    "str q28, [x19, x27]\n"
+    "fmin v25.4s, v25.4s, v10.4s\n"
+    "str q27, [x20, x27]\n"
+    "fmin v24.4s, v24.4s, v10.4s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "fmax v26.4s, v26.4s, v11.4s\n"
+    "str q26, [x21, x27]\n"
+    "fmax v25.4s, v25.4s, v11.4s\n"
+    "fmax v24.4s, v24.4s, v11.4s\n"
+    "str q25, [x22, x27]\n"
+    "fmin v23.4s, v23.4s, v10.4s\n"
+    "fmin v22.4s, v22.4s, v10.4s\n"
+    "str q24, [x23, x27]\n"
+    "fmin v21.4s, v21.4s, v10.4s\n"
+    "fmax v23.4s, v23.4s, v11.4s\n"
+    "str q23, [x24, x27]\n"
+    "fmax v22.4s, v22.4s, v11.4s\n"
+    "fmax v21.4s, v21.4s, v11.4s\n"
+    "str q22, [x25, x27]\n"
+    "str q21, [x26, x27]\n"
+    "b 7f\n"
+    "6:"  // Output channel loop: Single kernel point
+    "fmla v9.4s, v8.4s, v5.s[0]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "lsl x27, x10, #0x2\n"
+    "fmla v7.4s, v8.4s, v5.s[1]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "fmla v6.4s, v8.4s, v5.s[2]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "fmla v4.4s, v8.4s, v5.s[3]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "fmla v3.4s, v8.4s, v2.s[0]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "fmla v1.4s, v8.4s, v2.s[1]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "fmla v31.4s, v8.4s, v2.s[2]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "fmla v29.4s, v8.4s, v2.s[3]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "fmla v28.4s, v8.4s, v0.s[0]\n"
+    "fmla v27.4s, v8.4s, v0.s[1]\n"
+    "fmla v26.4s, v8.4s, v0.s[2]\n"
+    "fmla v25.4s, v8.4s, v0.s[3]\n"
+    "fmla v24.4s, v8.4s, v30.s[0]\n"
+    "fmla v23.4s, v8.4s, v30.s[1]\n"
+    "fmla v22.4s, v8.4s, v30.s[2]\n"
+    "fmla v21.4s, v8.4s, v30.s[3]\n"
+    "fmin v9.4s, v9.4s, v10.4s\n"
+    "fmin v7.4s, v7.4s, v10.4s\n"
+    "fmin v6.4s, v6.4s, v10.4s\n"
+    "fmax v9.4s, v9.4s, v11.4s\n"
+    "str q9, [x19, x27]\n"
+    "fmax v7.4s, v7.4s, v11.4s\n"
+    "fmax v6.4s, v6.4s, v11.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "fmin v4.4s, v4.4s, v10.4s\n"
+    "str q7, [x20, x27]\n"
+    "fmin v3.4s, v3.4s, v10.4s\n"
+    "fmin v1.4s, v1.4s, v10.4s\n"
+    "str q6, [x21, x27]\n"
+    "fmax v4.4s, v4.4s, v11.4s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "fmin v31.4s, v31.4s, v10.4s\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "fmax v3.4s, v3.4s, v11.4s\n"
+    "str q4, [x22, x27]\n"
+    "fmax v1.4s, v1.4s, v11.4s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "fmax v31.4s, v31.4s, v11.4s\n"
+    "str q3, [x23, x27]\n"
+    "fmin v29.4s, v29.4s, v10.4s\n"
+    "str q1, [x24, x27]\n"
+    "fmin v28.4s, v28.4s, v10.4s\n"
+    "str q31, [x25, x27]\n"
+    "fmin v27.4s, v27.4s, v10.4s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "fmax v29.4s, v29.4s, v11.4s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "fmax v28.4s, v28.4s, v11.4s\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "fmax v27.4s, v27.4s, v11.4s\n"
+    "str q29, [x26, x27]\n"
+    "fmin v26.4s, v26.4s, v10.4s\n"
+    "str q28, [x19, x27]\n"
+    "fmin v25.4s, v25.4s, v10.4s\n"
+    "str q27, [x20, x27]\n"
+    "fmin v24.4s, v24.4s, v10.4s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "fmax v26.4s, v26.4s, v11.4s\n"
+    "str q26, [x21, x27]\n"
+    "fmax v25.4s, v25.4s, v11.4s\n"
+    "fmax v24.4s, v24.4s, v11.4s\n"
+    "str q25, [x22, x27]\n"
+    "fmin v23.4s, v23.4s, v10.4s\n"
+    "fmin v22.4s, v22.4s, v10.4s\n"
+    "str q24, [x23, x27]\n"
+    "fmin v21.4s, v21.4s, v10.4s\n"
+    "fmax v23.4s, v23.4s, v11.4s\n"
+    "str q23, [x24, x27]\n"
+    "fmax v22.4s, v22.4s, v11.4s\n"
+    "fmax v21.4s, v21.4s, v11.4s\n"
+    "str q22, [x25, x27]\n"
+    "str q21, [x26, x27]\n"
+    "7:"  // Output channel loop: Done
+    "add x10, x10, #0x4\n"
+    "cmp x10, x9, LSL #2\n"
+    "blt 1b\n"
+    "tst %x[n_output_channels], #0x3\n"
+    "beq 19f\n"
+    "8:"  // Output channel oddments
+    "movi v16.16b, #0x0\n"
+    "cbz %x[bias], 11f\n"
+    "add x19, %x[bias], x10, LSL #2\n"
+    "tbz %x[n_output_channels], #1, 9f\n"
+    "ld1 { v16.d }[0], [x19], #0x8\n"
+    "tbz %x[n_output_channels], #0, 10f\n"
+    "ld1 { v16.s }[2], [x19]\n"
+    "b 10f\n"
+    "9:"  // Output channel oddments: Load bias: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 10f\n"
+    "ld1 { v16.s }[0], [x19]\n"
+    "10:"  // Output channel oddments: Load bias: Bit 1: End
+
+    "11:"  // Output channel oddments: Load bias: Done
+    "mov v9.16b, v16.16b\n"
+    "ldr q8, [%x[weights], #0x0]\n"
+    "mov x19, %x[inptrs]\n"
+    "mov v7.16b, v16.16b\n"
+    "ldp x24, x28, [x19], #0x10\n"
+    "lsr x20, %x[kernel_points], #0x1\n"
+    "mov v6.16b, v16.16b\n"
+    "ldr q5, [x24, #0x0]\n"
+    "mov v4.16b, v16.16b\n"
+    "add %x[weights], %x[weights], #0x10\n"
+    "mov v3.16b, v16.16b\n"
+    "ldr q2, [x24, #0x10]\n"
+    "mov v1.16b, v16.16b\n"
+    "ldr q0, [x28, #0x0]\n"
+    "mov v31.16b, v16.16b\n"
+    "ldr q30, [x28, #0x10]\n"
+    "mov v29.16b, v16.16b\n"
+    "mov v28.16b, v16.16b\n"
+    "mov v27.16b, v16.16b\n"
+    "mov v26.16b, v16.16b\n"
+    "mov v25.16b, v16.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "mov v23.16b, v16.16b\n"
+    "mov v22.16b, v16.16b\n"
+    "mov v21.16b, v16.16b\n"
+    "cbz x20, 15f\n"
+    "ldp x24, x28, [x19], #0x10\n"
+    "ldr q20, [%x[weights], #0x0]\n"
+    "subs x20, x20, #0x1\n"
+    "add %x[weights], %x[weights], #0x10\n"
+    "ldr q19, [x24, #0x0]\n"
+    "ldr q18, [x24, #0x10]\n"
+    "ldr q17, [x28, #0x0]\n"
+    "ldr q16, [x28, #0x10]\n"
+    "beq 13f\n"
+    "12:"  // Output channel oddments: Kernel loop
+    "fmla v9.4s, v8.4s, v5.s[0]\n"
+    "ldp x24, x28, [x19], #0x10\n"
+    "subs x20, x20, #0x1\n"
+    "fmla v7.4s, v8.4s, v5.s[1]\n"
+    "fmla v6.4s, v8.4s, v5.s[2]\n"
+    "fmla v4.4s, v8.4s, v5.s[3]\n"
+    "ldr q5, [x24, #0x0]\n"
+    "fmla v3.4s, v8.4s, v2.s[0]\n"
+    "fmla v1.4s, v8.4s, v2.s[1]\n"
+    "fmla v31.4s, v8.4s, v2.s[2]\n"
+    "fmla v29.4s, v8.4s, v2.s[3]\n"
+    "ldr q2, [x24, #0x10]\n"
+    "fmla v28.4s, v8.4s, v0.s[0]\n"
+    "fmla v27.4s, v8.4s, v0.s[1]\n"
+    "fmla v26.4s, v8.4s, v0.s[2]\n"
+    "fmla v25.4s, v8.4s, v0.s[3]\n"
+    "ldr q0, [x28, #0x0]\n"
+    "fmla v24.4s, v8.4s, v30.s[0]\n"
+    "fmla v23.4s, v8.4s, v30.s[1]\n"
+    "fmla v22.4s, v8.4s, v30.s[2]\n"
+    "fmla v21.4s, v8.4s, v30.s[3]\n"
+    "ldr q30, [x28, #0x10]\n"
+    "fmla v9.4s, v20.4s, v19.s[0]\n"
+    "ldr q8, [%x[weights], #0x0]\n"
+    "fmla v7.4s, v20.4s, v19.s[1]\n"
+    "ldp x24, x28, [x19], #0x10\n"
+    "fmla v6.4s, v20.4s, v19.s[2]\n"
+    "fmla v4.4s, v20.4s, v19.s[3]\n"
+    "ldr q19, [x24, #0x0]\n"
+    "fmla v3.4s, v20.4s, v18.s[0]\n"
+    "fmla v1.4s, v20.4s, v18.s[1]\n"
+    "fmla v31.4s, v20.4s, v18.s[2]\n"
+    "fmla v29.4s, v20.4s, v18.s[3]\n"
+    "ldr q18, [x24, #0x10]\n"
+    "fmla v28.4s, v20.4s, v17.s[0]\n"
+    "fmla v27.4s, v20.4s, v17.s[1]\n"
+    "fmla v26.4s, v20.4s, v17.s[2]\n"
+    "fmla v25.4s, v20.4s, v17.s[3]\n"
+    "ldr q17, [x28, #0x0]\n"
+    "fmla v24.4s, v20.4s, v16.s[0]\n"
+    "fmla v23.4s, v20.4s, v16.s[1]\n"
+    "fmla v22.4s, v20.4s, v16.s[2]\n"
+    "fmla v21.4s, v20.4s, v16.s[3]\n"
+    "ldr q16, [x28, #0x10]\n"
+    "ldr q20, [%x[weights], #0x10]\n"
+    "add %x[weights], %x[weights], #0x20\n"
+    "bgt 12b\n"
+    "13:"  // Output channel oddments: Kernel loop tail
+    "tbnz %x[kernel_points], #0, 14f\n"
+    "fmla v9.4s, v8.4s, v5.s[0]\n"
+    "fmla v7.4s, v8.4s, v5.s[1]\n"
+    "fmla v6.4s, v8.4s, v5.s[2]\n"
+    "fmla v4.4s, v8.4s, v5.s[3]\n"
+    "fmla v3.4s, v8.4s, v2.s[0]\n"
+    "fmla v1.4s, v8.4s, v2.s[1]\n"
+    "fmla v31.4s, v8.4s, v2.s[2]\n"
+    "fmla v29.4s, v8.4s, v2.s[3]\n"
+    "fmla v28.4s, v8.4s, v0.s[0]\n"
+    "fmla v27.4s, v8.4s, v0.s[1]\n"
+    "fmla v26.4s, v8.4s, v0.s[2]\n"
+    "fmla v25.4s, v8.4s, v0.s[3]\n"
+    "fmla v24.4s, v8.4s, v30.s[0]\n"
+    "fmla v23.4s, v8.4s, v30.s[1]\n"
+    "fmla v22.4s, v8.4s, v30.s[2]\n"
+    "fmla v21.4s, v8.4s, v30.s[3]\n"
+    "fmla v9.4s, v20.4s, v19.s[0]\n"
+    "fmla v7.4s, v20.4s, v19.s[1]\n"
+    "fmla v6.4s, v20.4s, v19.s[2]\n"
+    "fmla v4.4s, v20.4s, v19.s[3]\n"
+    "fmla v3.4s, v20.4s, v18.s[0]\n"
+    "fmla v1.4s, v20.4s, v18.s[1]\n"
+    "fmla v31.4s, v20.4s, v18.s[2]\n"
+    "fmla v29.4s, v20.4s, v18.s[3]\n"
+    "fmla v28.4s, v20.4s, v17.s[0]\n"
+    "fmla v27.4s, v20.4s, v17.s[1]\n"
+    "fmla v26.4s, v20.4s, v17.s[2]\n"
+    "fmla v25.4s, v20.4s, v17.s[3]\n"
+    "fmla v24.4s, v20.4s, v16.s[0]\n"
+    "fmla v23.4s, v20.4s, v16.s[1]\n"
+    "fmla v22.4s, v20.4s, v16.s[2]\n"
+    "fmla v21.4s, v20.4s, v16.s[3]\n"
+    "b 16f\n"
+    "14:"  // Output channel oddments: Odd tail
+    "fmla v9.4s, v8.4s, v5.s[0]\n"
+    "ldp x24, x28, [x19], #0x10\n"
+    "fmla v7.4s, v8.4s, v5.s[1]\n"
+    "fmla v6.4s, v8.4s, v5.s[2]\n"
+    "fmla v4.4s, v8.4s, v5.s[3]\n"
+    "ldr q5, [x24, #0x0]\n"
+    "fmla v3.4s, v8.4s, v2.s[0]\n"
+    "fmla v1.4s, v8.4s, v2.s[1]\n"
+    "fmla v31.4s, v8.4s, v2.s[2]\n"
+    "fmla v29.4s, v8.4s, v2.s[3]\n"
+    "ldr q2, [x24, #0x10]\n"
+    "fmla v28.4s, v8.4s, v0.s[0]\n"
+    "fmla v27.4s, v8.4s, v0.s[1]\n"
+    "fmla v26.4s, v8.4s, v0.s[2]\n"
+    "fmla v25.4s, v8.4s, v0.s[3]\n"
+    "ldr q0, [x28, #0x0]\n"
+    "fmla v24.4s, v8.4s, v30.s[0]\n"
+    "fmla v23.4s, v8.4s, v30.s[1]\n"
+    "fmla v22.4s, v8.4s, v30.s[2]\n"
+    "fmla v21.4s, v8.4s, v30.s[3]\n"
+    "ldr q30, [x28, #0x10]\n"
+    "fmla v9.4s, v20.4s, v19.s[0]\n"
+    "ldr q8, [%x[weights], #0x0]\n"
+    "add %x[weights], %x[weights], #0x10\n"
+    "fmla v7.4s, v20.4s, v19.s[1]\n"
+    "fmla v6.4s, v20.4s, v19.s[2]\n"
+    "fmla v4.4s, v20.4s, v19.s[3]\n"
+    "fmla v3.4s, v20.4s, v18.s[0]\n"
+    "fmla v1.4s, v20.4s, v18.s[1]\n"
+    "fmla v31.4s, v20.4s, v18.s[2]\n"
+    "fmla v29.4s, v20.4s, v18.s[3]\n"
+    "fmla v28.4s, v20.4s, v17.s[0]\n"
+    "fmla v27.4s, v20.4s, v17.s[1]\n"
+    "fmla v26.4s, v20.4s, v17.s[2]\n"
+    "fmla v25.4s, v20.4s, v17.s[3]\n"
+    "fmla v24.4s, v20.4s, v16.s[0]\n"
+    "fmla v23.4s, v20.4s, v16.s[1]\n"
+    "fmla v22.4s, v20.4s, v16.s[2]\n"
+    "fmla v21.4s, v20.4s, v16.s[3]\n"
+    "fmla v9.4s, v8.4s, v5.s[0]\n"
+    "fmla v7.4s, v8.4s, v5.s[1]\n"
+    "fmla v6.4s, v8.4s, v5.s[2]\n"
+    "fmla v4.4s, v8.4s, v5.s[3]\n"
+    "fmla v3.4s, v8.4s, v2.s[0]\n"
+    "fmla v1.4s, v8.4s, v2.s[1]\n"
+    "fmla v31.4s, v8.4s, v2.s[2]\n"
+    "fmla v29.4s, v8.4s, v2.s[3]\n"
+    "fmla v28.4s, v8.4s, v0.s[0]\n"
+    "fmla v27.4s, v8.4s, v0.s[1]\n"
+    "fmla v26.4s, v8.4s, v0.s[2]\n"
+    "fmla v25.4s, v8.4s, v0.s[3]\n"
+    "fmla v24.4s, v8.4s, v30.s[0]\n"
+    "fmla v23.4s, v8.4s, v30.s[1]\n"
+    "fmla v22.4s, v8.4s, v30.s[2]\n"
+    "fmla v21.4s, v8.4s, v30.s[3]\n"
+    "b 16f\n"
+    "15:"  // Output channel oddments: Single kernel point
+    "fmla v9.4s, v8.4s, v5.s[0]\n"
+    "fmla v7.4s, v8.4s, v5.s[1]\n"
+    "fmla v6.4s, v8.4s, v5.s[2]\n"
+    "fmla v4.4s, v8.4s, v5.s[3]\n"
+    "fmla v3.4s, v8.4s, v2.s[0]\n"
+    "fmla v1.4s, v8.4s, v2.s[1]\n"
+    "fmla v31.4s, v8.4s, v2.s[2]\n"
+    "fmla v29.4s, v8.4s, v2.s[3]\n"
+    "fmla v28.4s, v8.4s, v0.s[0]\n"
+    "fmla v27.4s, v8.4s, v0.s[1]\n"
+    "fmla v26.4s, v8.4s, v0.s[2]\n"
+    "fmla v25.4s, v8.4s, v0.s[3]\n"
+    "fmla v24.4s, v8.4s, v30.s[0]\n"
+    "fmla v23.4s, v8.4s, v30.s[1]\n"
+    "fmla v22.4s, v8.4s, v30.s[2]\n"
+    "fmla v21.4s, v8.4s, v30.s[3]\n"
+    "16:"  // Output channel oddments: Done
+    "fmin v9.4s, v9.4s, v10.4s\n"
+    "fmin v7.4s, v7.4s, v10.4s\n"
+    "fmin v6.4s, v6.4s, v10.4s\n"
+    "fmin v4.4s, v4.4s, v10.4s\n"
+    "fmax v9.4s, v9.4s, v11.4s\n"
+    "fmax v7.4s, v7.4s, v11.4s\n"
+    "fmax v6.4s, v6.4s, v11.4s\n"
+    "fmax v4.4s, v4.4s, v11.4s\n"
+    "fmin v3.4s, v3.4s, v10.4s\n"
+    "fmin v1.4s, v1.4s, v10.4s\n"
+    "fmin v31.4s, v31.4s, v10.4s\n"
+    "fmax v3.4s, v3.4s, v11.4s\n"
+    "fmax v1.4s, v1.4s, v11.4s\n"
+    "fmax v31.4s, v31.4s, v11.4s\n"
+    "fmin v29.4s, v29.4s, v10.4s\n"
+    "fmin v28.4s, v28.4s, v10.4s\n"
+    "fmin v27.4s, v27.4s, v10.4s\n"
+    "fmax v29.4s, v29.4s, v11.4s\n"
+    "fmax v28.4s, v28.4s, v11.4s\n"
+    "fmax v27.4s, v27.4s, v11.4s\n"
+    "fmin v26.4s, v26.4s, v10.4s\n"
+    "fmin v25.4s, v25.4s, v10.4s\n"
+    "fmin v24.4s, v24.4s, v10.4s\n"
+    "fmax v26.4s, v26.4s, v11.4s\n"
+    "fmax v25.4s, v25.4s, v11.4s\n"
+    "fmax v24.4s, v24.4s, v11.4s\n"
+    "fmin v23.4s, v23.4s, v10.4s\n"
+    "fmin v22.4s, v22.4s, v10.4s\n"
+    "fmin v21.4s, v21.4s, v10.4s\n"
+    "fmax v23.4s, v23.4s, v11.4s\n"
+    "fmax v22.4s, v22.4s, v11.4s\n"
+    "fmax v21.4s, v21.4s, v11.4s\n"
+    "tbz %x[n_output_channels], #1, 17f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x10, LSL #2\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x10, LSL #2\n"
+    "st1 { v9.d }[0], [x19]\n"
+    "add x21, x21, x10, LSL #2\n"
+    "st1 { v7.d }[0], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x10, LSL #2\n"
+    "st1 { v6.d }[0], [x21]\n"
+    "add x23, x23, x10, LSL #2\n"
+    "st1 { v4.d }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x10, LSL #2\n"
+    "st1 { v3.d }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x10, LSL #2\n"
+    "st1 { v1.d }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x10, LSL #2\n"
+    "st1 { v31.d }[0], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x10, LSL #2\n"
+    "st1 { v29.d }[0], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x10, LSL #2\n"
+    "st1 { v28.d }[0], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x10, LSL #2\n"
+    "st1 { v27.d }[0], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x10, LSL #2\n"
+    "st1 { v26.d }[0], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x10, LSL #2\n"
+    "st1 { v25.d }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x10, LSL #2\n"
+    "st1 { v24.d }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x10, LSL #2\n"
+    "st1 { v23.d }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x10, LSL #2\n"
+    "st1 { v22.d }[0], [x25]\n"
+    "add x10, x10, #0x2\n"
+    "st1 { v21.d }[0], [x26]\n"
+    "tbz %x[n_output_channels], #0, 18f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x10, LSL #2\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x10, LSL #2\n"
+    "st1 { v9.s }[2], [x19]\n"
+    "add x21, x21, x10, LSL #2\n"
+    "st1 { v7.s }[2], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x10, LSL #2\n"
+    "st1 { v6.s }[2], [x21]\n"
+    "add x23, x23, x10, LSL #2\n"
+    "st1 { v4.s }[2], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x10, LSL #2\n"
+    "st1 { v3.s }[2], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x10, LSL #2\n"
+    "st1 { v1.s }[2], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x10, LSL #2\n"
+    "st1 { v31.s }[2], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x10, LSL #2\n"
+    "st1 { v29.s }[2], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x10, LSL #2\n"
+    "st1 { v28.s }[2], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x10, LSL #2\n"
+    "st1 { v27.s }[2], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x10, LSL #2\n"
+    "st1 { v26.s }[2], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x10, LSL #2\n"
+    "st1 { v25.s }[2], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x10, LSL #2\n"
+    "st1 { v24.s }[2], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x10, LSL #2\n"
+    "st1 { v23.s }[2], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x10, LSL #2\n"
+    "st1 { v22.s }[2], [x25]\n"
+    "st1 { v21.s }[2], [x26]\n"
+    "b 18f\n"
+    "17:"  // Output channel oddments: Done: Store: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 18f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x10, LSL #2\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x10, LSL #2\n"
+    "st1 { v9.s }[0], [x19]\n"
+    "add x21, x21, x10, LSL #2\n"
+    "st1 { v7.s }[0], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x10, LSL #2\n"
+    "st1 { v6.s }[0], [x21]\n"
+    "add x23, x23, x10, LSL #2\n"
+    "st1 { v4.s }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x10, LSL #2\n"
+    "st1 { v3.s }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x10, LSL #2\n"
+    "st1 { v1.s }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x10, LSL #2\n"
+    "st1 { v31.s }[0], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x10, LSL #2\n"
+    "st1 { v29.s }[0], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x10, LSL #2\n"
+    "st1 { v28.s }[0], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x10, LSL #2\n"
+    "st1 { v27.s }[0], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x10, LSL #2\n"
+    "st1 { v26.s }[0], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x10, LSL #2\n"
+    "st1 { v25.s }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x10, LSL #2\n"
+    "st1 { v24.s }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x10, LSL #2\n"
+    "st1 { v23.s }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x10, LSL #2\n"
+    "st1 { v22.s }[0], [x25]\n"
+    "st1 { v21.s }[0], [x26]\n"
+    "18:"  // Output channel oddments: Done: Store: Bit 1: End
+
+    "19:"  // Done
+
+    : [weights] "+&r" (weights)
+    : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
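
Editor's note: for each output channel, the kernel above accumulates bias plus a sum over `kernel_points` of weight times input across a 2x8 tile of output positions, then clamps to `[activation_min, activation_max]` (the `fmin`/`fmax` pairs). A hedged scalar reference for a single output value; the flat pointer layout here is illustrative and does not match the packed layout the assembly actually consumes.

```cpp
// Scalar reference for one output position of the generic-with-multiplier
// kernel: out = clamp(bias + sum_k w[k] * x[k], act_min, act_max).
#include <algorithm>

static float reference_point(const float *weights_for_channel,  // kernel_points values
                             const float *inputs_for_point,     // kernel_points values
                             float bias, unsigned int kernel_points,
                             float act_min, float act_max)
{
  float acc = bias;
  for (unsigned int k = 0; k < kernel_points; k++)
  {
    acc += weights_for_channel[k] * inputs_for_point[k];
  }
  // fmin/fmax clamp in the assembly
  return std::min(std::max(acc, act_min), act_max);
}
```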
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000..c76cb99
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_dot::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_dot::get_packed_size;
+
+  kern_type kernel = a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+  a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
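
Editor's note: the s8q kernel that follows accumulates in int32 via `sdot` and requantizes each value with a saturating rounding doubling multiply (`sqrdmulh`), a rounding right shift (`srshl` with a negative shift), an output-offset add, and a clamp to `[minval, maxval]`. A simplified scalar model of that sequence is sketched below; it omits the `sqadd`-based rounding fixup and the `sqrdmulh` saturation case, and the field names follow `arm_gemm::Requantize32` as used in the operand list.

```cpp
// Hedged scalar model of the per-value requantization performed by the
// sdot kernel below (simplified: saturation and the negative-value rounding
// correction done with and/sshr/sqadd are omitted).
#include <algorithm>
#include <cstdint>

static int8_t requantize_sketch(int32_t acc, int32_t multiplier, int32_t shift,
                                int32_t c_offset, int32_t minval, int32_t maxval)
{
  // sqrdmulh: high half of the rounding doubling product
  const int64_t prod = static_cast<int64_t>(acc) * multiplier;
  int32_t v = static_cast<int32_t>((prod + (1LL << 30)) >> 31);
  if (shift < 0)   // srshl by a negative amount: rounding arithmetic shift right
  {
    const int32_t s = -shift;
    v = (v + (1 << (s - 1))) >> s;
  }
  v += c_offset;                               // add the c_offset vector (v13)
  v = std::min(std::max(v, minval), maxval);   // smax/smin clamp to [minval, maxval]
  return static_cast<int8_t>(v);               // uzp1 narrowing in the assembly
}
```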
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..ed8cd48
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1318 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
+  __asm__ __volatile__(
+    "ldp x13, x12, [%x[inptrs], #0x0]\n"
+    "add SP, SP, #-0x80\n"
+    "ldp x11, x10, [%x[inptrs], #0x10]\n"
+    "mov x19, #0x1\n"
+    "ldp x9, x28, [%x[inptrs], #0x20]\n"
+    "orr x19, x19, #0x100\n"
+    "ldp x27, x26, [%x[inptrs], #0x30]\n"
+    "orr x19, x19, #0x10000\n"
+    "dup v11.4s, w19\n"
+    "ldp x25, x24, [%x[outptrs], #0x0]\n"
+    "mov x23, #0x0\n"
+    "ldp x22, x21, [%x[outptrs], #0x10]\n"
+    "lsr x20, %x[n_channels], #0x4\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ld1r { v9.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v12.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v14.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v13.4s }, [x19]\n"
+    "cbz x20, 2f\n"
+    "1:"  // Loop
+    "movi v15.4s, #0x0\n"
+    "ldr q27, [x13, x23]\n"
+    "subs x20, x20, #0x1\n"
+    "movi v10.4s, #0x0\n"
+    "ldr q1, [x12, x23]\n"
+    "ldp x13, x12, [%x[inptrs], #0x40]\n"
+    "ldr q25, [x11, x23]\n"
+    "zip1 v7.16b, v27.16b, v25.16b\n"
+    "ldr q23, [x10, x23]\n"
+    "zip2 v5.16b, v27.16b, v25.16b\n"
+    "ldp x11, x10, [%x[inptrs], #0x50]\n"
+    "ldr q31, [x9, x23]\n"
+    "zip1 v8.16b, v1.16b, v23.16b\n"
+    "ldr q28, [x28, x23]\n"
+    "zip2 v3.16b, v1.16b, v23.16b\n"
+    "ldp x9, x28, [%x[inptrs], #0x60]\n"
+    "zip1 v6.16b, v7.16b, v8.16b\n"
+    "ldr q21, [x27, x23]\n"
+    "zip2 v8.16b, v7.16b, v8.16b\n"
+    "ldr q26, [x26, x23]\n"
+    "zip1 v7.16b, v5.16b, v3.16b\n"
+    "ldp x27, x26, [%x[inptrs], #0x70]\n"
+    "zip2 v5.16b, v5.16b, v3.16b\n"
+    "ldr q24, [x13, x23]\n"
+    "ldr q22, [x12, x23]\n"
+    "zip1 v2.16b, v31.16b, v21.16b\n"
+    "zip2 v4.16b, v31.16b, v21.16b\n"
+    "ldp x13, x12, [%x[inptrs], #0x0]\n"
+    "zip1 v1.16b, v28.16b, v26.16b\n"
+    "ldr q20, [x11, x23]\n"
+    "zip2 v31.16b, v28.16b, v26.16b\n"
+    "ldr q16, [x10, x23]\n"
+    "zip1 v3.16b, v2.16b, v1.16b\n"
+    "ldp x11, x10, [%x[inptrs], #0x10]\n"
+    "zip2 v2.16b, v2.16b, v1.16b\n"
+    "ldr q19, [x9, x23]\n"
+    "zip1 v1.16b, v4.16b, v31.16b\n"
+    "ldr q0, [x28, x23]\n"
+    "zip1 v28.16b, v24.16b, v20.16b\n"
+    "ldp x9, x28, [%x[inptrs], #0x20]\n"
+    "zip2 v26.16b, v24.16b, v20.16b\n"
+    "ldr q18, [x27, x23]\n"
+    "zip1 v24.16b, v22.16b, v16.16b\n"
+    "ldr q17, [x26, x23]\n"
+    "zip2 v22.16b, v22.16b, v16.16b\n"
+    "ldp x27, x26, [%x[inptrs], #0x30]\n"
+    "zip2 v16.16b, v4.16b, v31.16b\n"
+    "str q7, [SP, #0x0]\n"
+    "zip1 v31.16b, v28.16b, v24.16b\n"
+    "str q5, [SP, #0x10]\n"
+    "zip1 v20.16b, v19.16b, v18.16b\n"
+    "str q1, [SP, #0x20]\n"
+    "zip2 v19.16b, v19.16b, v18.16b\n"
+    "str q16, [SP, #0x30]\n"
+    "zip1 v18.16b, v0.16b, v17.16b\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "zip2 v17.16b, v0.16b, v17.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "zip2 v28.16b, v28.16b, v24.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "zip1 v16.16b, v26.16b, v22.16b\n"
+    "str q16, [SP, #0x40]\n"
+    "zip2 v16.16b, v26.16b, v22.16b\n"
+    "str q16, [SP, #0x50]\n"
+    "zip1 v26.16b, v20.16b, v18.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "zip2 v24.16b, v20.16b, v18.16b\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    "zip1 v16.16b, v19.16b, v17.16b\n"
+    "str q16, [SP, #0x60]\n"
+    "zip2 v16.16b, v19.16b, v17.16b\n"
+    "str q16, [SP, #0x70]\n"
+    "mov v22.16b, v30.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "mov v20.16b, v30.16b\n"
+    "mov v19.16b, v30.16b\n"
+    ".inst 0x4e8697be  // sdot v30.4s, v29.16b, v6.16b\n"
+    ".inst 0x4e8397b4  // sdot v20.4s, v29.16b, v3.16b\n"
+    ".inst 0x4e83956f  // sdot v15.4s, v11.16b, v3.16b\n"
+    ".inst 0x4e83977e  // sdot v30.4s, v27.16b, v3.16b\n"
+    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+    ".inst 0x4e9f9774  // sdot v20.4s, v27.16b, v31.16b\n"
+    ".inst 0x4e9f956f  // sdot v15.4s, v11.16b, v31.16b\n"
+    ".inst 0x4e9f973e  // sdot v30.4s, v25.16b, v31.16b\n"
+    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+    ".inst 0x4e9a9734  // sdot v20.4s, v25.16b, v26.16b\n"
+    "mov v17.16b, v15.16b\n"
+    ".inst 0x4e86956f  // sdot v15.4s, v11.16b, v6.16b\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
+    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    ".inst 0x4e8697b6  // sdot v22.4s, v29.16b, v6.16b\n"
+    ".inst 0x4e8397b3  // sdot v19.4s, v29.16b, v3.16b\n"
+    "ldr q29, [%x[params], #0x70]\n"
+    ".inst 0x4e83956a  // sdot v10.4s, v11.16b, v3.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x4e839776  // sdot v22.4s, v27.16b, v3.16b\n"
+    "ldr q3, [SP, #0x20]\n"
+    ".inst 0x4e9f9773  // sdot v19.4s, v27.16b, v31.16b\n"
+    "ldr q27, [%x[params], #0x80]\n"
+    ".inst 0x4e9f956a  // sdot v10.4s, v11.16b, v31.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    ".inst 0x4e9f9736  // sdot v22.4s, v25.16b, v31.16b\n"
+    "ldr q31, [SP, #0x40]\n"
+    ".inst 0x4e9a9733  // sdot v19.4s, v25.16b, v26.16b\n"
+    "ldr q25, [%x[params], #0x90]\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x4e86956a  // sdot v10.4s, v11.16b, v6.16b\n"
+    "ldr q6, [SP, #0x0]\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
+    "ldr q26, [SP, #0x60]\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "movi v15.4s, #0x0\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    ".inst 0x4e82956f  // sdot v15.4s, v11.16b, v2.16b\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "ldr q23, [%x[params], #0xa0]\n"
+    ".inst 0x4e9c956f  // sdot v15.4s, v11.16b, v28.16b\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "mov v17.16b, v15.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x25, x23]\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "ldr q30, [%x[params], #0x60]\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    ".inst 0x4e88956f  // sdot v15.4s, v11.16b, v8.16b\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "ldr q21, [%x[params], #0xb0]\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x22, x23]\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x24, x23]\n"
+    "mov v22.16b, v30.16b\n"
+    "mov v20.16b, v30.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    ".inst 0x4e8297b4  // sdot v20.4s, v29.16b, v2.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x21, x23]\n"
+    "mov v19.16b, v30.16b\n"
+    "add x23, x23, #0x4\n"
+    ".inst 0x4e8897be  // sdot v30.4s, v29.16b, v8.16b\n"
+    ".inst 0x4e9c9774  // sdot v20.4s, v27.16b, v28.16b\n"
+    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+    "movi v10.4s, #0x0\n"
+    ".inst 0x4e82977e  // sdot v30.4s, v27.16b, v2.16b\n"
+    ".inst 0x4e989734  // sdot v20.4s, v25.16b, v24.16b\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    ".inst 0x4e9c973e  // sdot v30.4s, v25.16b, v28.16b\n"
+    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+    ".inst 0x4e8897b6  // sdot v22.4s, v29.16b, v8.16b\n"
+    ".inst 0x4e8297b3  // sdot v19.4s, v29.16b, v2.16b\n"
+    "ldr q29, [%x[params], #0xd0]\n"
+    ".inst 0x4e82956a  // sdot v10.4s, v11.16b, v2.16b\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x4e829776  // sdot v22.4s, v27.16b, v2.16b\n"
+    "ldr q2, [SP, #0x30]\n"
+    ".inst 0x4e9c9773  // sdot v19.4s, v27.16b, v28.16b\n"
+    "ldr q27, [%x[params], #0xe0]\n"
+    ".inst 0x4e9c956a  // sdot v10.4s, v11.16b, v28.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x4e9c9736  // sdot v22.4s, v25.16b, v28.16b\n"
+    "ldr q28, [SP, #0x50]\n"
+    ".inst 0x4e989733  // sdot v19.4s, v25.16b, v24.16b\n"
+    "ldr q25, [%x[params], #0xf0]\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x4e88956a  // sdot v10.4s, v11.16b, v8.16b\n"
+    "ldr q8, [SP, #0x10]\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
+    "ldr q24, [SP, #0x70]\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "movi v15.4s, #0x0\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    ".inst 0x4e83956f  // sdot v15.4s, v11.16b, v3.16b\n"
+    "movi v10.4s, #0x0\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "ldr q23, [%x[params], #0x100]\n"
+    ".inst 0x4e9f956f  // sdot v15.4s, v11.16b, v31.16b\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "mov v17.16b, v15.16b\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "ldr q21, [%x[params], #0x110]\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x25, x23]\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "ldr q30, [%x[params], #0xc0]\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "str s20, [x22, x23]\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x24, x23]\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    ".inst 0x4e86956f  // sdot v15.4s, v11.16b, v6.16b\n"
+    "mov v22.16b, v30.16b\n"
+    "mov v20.16b, v30.16b\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    ".inst 0x4e8397b4  // sdot v20.4s, v29.16b, v3.16b\n"
+    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x21, x23]\n"
+    "mov v19.16b, v30.16b\n"
+    "add x23, x23, #0x4\n"
+    ".inst 0x4e8697be  // sdot v30.4s, v29.16b, v6.16b\n"
+    ".inst 0x4e9f9774  // sdot v20.4s, v27.16b, v31.16b\n"
+    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+    ".inst 0x4e83977e  // sdot v30.4s, v27.16b, v3.16b\n"
+    ".inst 0x4e9a9734  // sdot v20.4s, v25.16b, v26.16b\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    ".inst 0x4e9f973e  // sdot v30.4s, v25.16b, v31.16b\n"
+    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    ".inst 0x4e8697b6  // sdot v22.4s, v29.16b, v6.16b\n"
+    ".inst 0x4e8397b3  // sdot v19.4s, v29.16b, v3.16b\n"
+    "ldr q29, [%x[params], #0x130]\n"
+    ".inst 0x4e83956a  // sdot v10.4s, v11.16b, v3.16b\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x4e839776  // sdot v22.4s, v27.16b, v3.16b\n"
+    ".inst 0x4e9f9773  // sdot v19.4s, v27.16b, v31.16b\n"
+    "ldr q27, [%x[params], #0x140]\n"
+    ".inst 0x4e9f956a  // sdot v10.4s, v11.16b, v31.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x4e9f9736  // sdot v22.4s, v25.16b, v31.16b\n"
+    ".inst 0x4e9a9733  // sdot v19.4s, v25.16b, v26.16b\n"
+    "ldr q25, [%x[params], #0x150]\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x4e86956a  // sdot v10.4s, v11.16b, v6.16b\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "movi v15.4s, #0x0\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    ".inst 0x4e82956f  // sdot v15.4s, v11.16b, v2.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "movi v10.4s, #0x0\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    ".inst 0x4e9c956f  // sdot v15.4s, v11.16b, v28.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "ldr q23, [%x[params], #0x160]\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "mov v17.16b, v15.16b\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "ldr q21, [%x[params], #0x170]\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x25, x23]\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "ldr q30, [%x[params], #0x120]\n"
+    "add %x[params], %x[params], #0x180\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x22, x23]\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    ".inst 0x4e88956f  // sdot v15.4s, v11.16b, v8.16b\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x24, x23]\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "mov v22.16b, v30.16b\n"
+    "mov v20.16b, v30.16b\n"
+    ".inst 0x4e8297b4  // sdot v20.4s, v29.16b, v2.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x21, x23]\n"
+    "mov v19.16b, v30.16b\n"
+    "add x23, x23, #0x4\n"
+    ".inst 0x4e8897be  // sdot v30.4s, v29.16b, v8.16b\n"
+    ".inst 0x4e9c9774  // sdot v20.4s, v27.16b, v28.16b\n"
+    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+    ".inst 0x4e82977e  // sdot v30.4s, v27.16b, v2.16b\n"
+    ".inst 0x4e989734  // sdot v20.4s, v25.16b, v24.16b\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    ".inst 0x4e9c973e  // sdot v30.4s, v25.16b, v28.16b\n"
+    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+    ".inst 0x4e8897b6  // sdot v22.4s, v29.16b, v8.16b\n"
+    ".inst 0x4e8297b3  // sdot v19.4s, v29.16b, v2.16b\n"
+    ".inst 0x4e82956a  // sdot v10.4s, v11.16b, v2.16b\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x4e829776  // sdot v22.4s, v27.16b, v2.16b\n"
+    ".inst 0x4e9c9773  // sdot v19.4s, v27.16b, v28.16b\n"
+    ".inst 0x4e9c956a  // sdot v10.4s, v11.16b, v28.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x4e9c9736  // sdot v22.4s, v25.16b, v28.16b\n"
+    ".inst 0x4e989733  // sdot v19.4s, v25.16b, v24.16b\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x4e88956a  // sdot v10.4s, v11.16b, v8.16b\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x25, x23]\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x24, x23]\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x22, x23]\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x21, x23]\n"
+    "add x23, x23, #0x4\n"
+    "bgt 1b\n"
+    "tst %x[n_channels], #0xf\n"
+    "beq 34f\n"
+    "2:"  // Oddments
+    "and x19, %x[n_channels], #0xf\n"
+    "add x13, x13, x23\n"
+    "add x12, x12, x23\n"
+    "add x11, x11, x23\n"
+    "add x10, x10, x23\n"
+    "add x9, x9, x23\n"
+    "add x28, x28, x23\n"
+    "add x27, x27, x23\n"
+    "add x26, x26, x23\n"
+    "tbz %x[n_channels], #3, 6f\n"
+    "ld1 { v27.d }[0], [x13], #0x8\n"
+    "ld1 { v1.d }[0], [x12], #0x8\n"
+    "ld1 { v25.d }[0], [x11], #0x8\n"
+    "ld1 { v23.d }[0], [x10], #0x8\n"
+    "ld1 { v31.d }[0], [x9], #0x8\n"
+    "ld1 { v28.d }[0], [x28], #0x8\n"
+    "ld1 { v21.d }[0], [x27], #0x8\n"
+    "ld1 { v26.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #2, 4f\n"
+    "ld1 { v27.s }[2], [x13], #0x4\n"
+    "ld1 { v1.s }[2], [x12], #0x4\n"
+    "ld1 { v25.s }[2], [x11], #0x4\n"
+    "ld1 { v23.s }[2], [x10], #0x4\n"
+    "ld1 { v31.s }[2], [x9], #0x4\n"
+    "ld1 { v28.s }[2], [x28], #0x4\n"
+    "ld1 { v21.s }[2], [x27], #0x4\n"
+    "ld1 { v26.s }[2], [x26], #0x4\n"
+    "tbz %x[n_channels], #1, 3f\n"
+    "ld1 { v27.h }[6], [x13], #0x2\n"
+    "ld1 { v1.h }[6], [x12], #0x2\n"
+    "ld1 { v25.h }[6], [x11], #0x2\n"
+    "ld1 { v23.h }[6], [x10], #0x2\n"
+    "ld1 { v31.h }[6], [x9], #0x2\n"
+    "ld1 { v28.h }[6], [x28], #0x2\n"
+    "ld1 { v21.h }[6], [x27], #0x2\n"
+    "ld1 { v26.h }[6], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[14], [x13], #0x1\n"
+    "ld1 { v1.b }[14], [x12], #0x1\n"
+    "ld1 { v25.b }[14], [x11], #0x1\n"
+    "ld1 { v23.b }[14], [x10], #0x1\n"
+    "ld1 { v31.b }[14], [x9], #0x1\n"
+    "ld1 { v28.b }[14], [x28], #0x1\n"
+    "ld1 { v21.b }[14], [x27], #0x1\n"
+    "ld1 { v26.b }[14], [x26], #0x1\n"
+    "b 10f\n"
+    "3:"  // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[12], [x13], #0x1\n"
+    "ld1 { v1.b }[12], [x12], #0x1\n"
+    "ld1 { v25.b }[12], [x11], #0x1\n"
+    "ld1 { v23.b }[12], [x10], #0x1\n"
+    "ld1 { v31.b }[12], [x9], #0x1\n"
+    "ld1 { v28.b }[12], [x28], #0x1\n"
+    "ld1 { v21.b }[12], [x27], #0x1\n"
+    "ld1 { v26.b }[12], [x26], #0x1\n"
+    "b 10f\n"
+    "4:"  // Oddments: Load (A): Bit 3: Bit 2: Unset
+    "tbz %x[n_channels], #1, 5f\n"
+    "ld1 { v27.h }[4], [x13], #0x2\n"
+    "ld1 { v1.h }[4], [x12], #0x2\n"
+    "ld1 { v25.h }[4], [x11], #0x2\n"
+    "ld1 { v23.h }[4], [x10], #0x2\n"
+    "ld1 { v31.h }[4], [x9], #0x2\n"
+    "ld1 { v28.h }[4], [x28], #0x2\n"
+    "ld1 { v21.h }[4], [x27], #0x2\n"
+    "ld1 { v26.h }[4], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[10], [x13], #0x1\n"
+    "ld1 { v1.b }[10], [x12], #0x1\n"
+    "ld1 { v25.b }[10], [x11], #0x1\n"
+    "ld1 { v23.b }[10], [x10], #0x1\n"
+    "ld1 { v31.b }[10], [x9], #0x1\n"
+    "ld1 { v28.b }[10], [x28], #0x1\n"
+    "ld1 { v21.b }[10], [x27], #0x1\n"
+    "ld1 { v26.b }[10], [x26], #0x1\n"
+    "b 10f\n"
+    "5:"  // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[8], [x13], #0x1\n"
+    "ld1 { v1.b }[8], [x12], #0x1\n"
+    "ld1 { v25.b }[8], [x11], #0x1\n"
+    "ld1 { v23.b }[8], [x10], #0x1\n"
+    "ld1 { v31.b }[8], [x9], #0x1\n"
+    "ld1 { v28.b }[8], [x28], #0x1\n"
+    "ld1 { v21.b }[8], [x27], #0x1\n"
+    "ld1 { v26.b }[8], [x26], #0x1\n"
+    "b 10f\n"
+    "6:"  // Oddments: Load (A): Bit 3: Unset
+    "tbz %x[n_channels], #2, 8f\n"
+    "ld1 { v27.s }[0], [x13], #0x4\n"
+    "ld1 { v1.s }[0], [x12], #0x4\n"
+    "ld1 { v25.s }[0], [x11], #0x4\n"
+    "ld1 { v23.s }[0], [x10], #0x4\n"
+    "ld1 { v31.s }[0], [x9], #0x4\n"
+    "ld1 { v28.s }[0], [x28], #0x4\n"
+    "ld1 { v21.s }[0], [x27], #0x4\n"
+    "ld1 { v26.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ld1 { v27.h }[2], [x13], #0x2\n"
+    "ld1 { v1.h }[2], [x12], #0x2\n"
+    "ld1 { v25.h }[2], [x11], #0x2\n"
+    "ld1 { v23.h }[2], [x10], #0x2\n"
+    "ld1 { v31.h }[2], [x9], #0x2\n"
+    "ld1 { v28.h }[2], [x28], #0x2\n"
+    "ld1 { v21.h }[2], [x27], #0x2\n"
+    "ld1 { v26.h }[2], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[6], [x13], #0x1\n"
+    "ld1 { v1.b }[6], [x12], #0x1\n"
+    "ld1 { v25.b }[6], [x11], #0x1\n"
+    "ld1 { v23.b }[6], [x10], #0x1\n"
+    "ld1 { v31.b }[6], [x9], #0x1\n"
+    "ld1 { v28.b }[6], [x28], #0x1\n"
+    "ld1 { v21.b }[6], [x27], #0x1\n"
+    "ld1 { v26.b }[6], [x26], #0x1\n"
+    "b 10f\n"
+    "7:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[4], [x13], #0x1\n"
+    "ld1 { v1.b }[4], [x12], #0x1\n"
+    "ld1 { v25.b }[4], [x11], #0x1\n"
+    "ld1 { v23.b }[4], [x10], #0x1\n"
+    "ld1 { v31.b }[4], [x9], #0x1\n"
+    "ld1 { v28.b }[4], [x28], #0x1\n"
+    "ld1 { v21.b }[4], [x27], #0x1\n"
+    "ld1 { v26.b }[4], [x26], #0x1\n"
+    "b 10f\n"
+    "8:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+    "tbz %x[n_channels], #1, 9f\n"
+    "ld1 { v27.h }[0], [x13], #0x2\n"
+    "ld1 { v1.h }[0], [x12], #0x2\n"
+    "ld1 { v25.h }[0], [x11], #0x2\n"
+    "ld1 { v23.h }[0], [x10], #0x2\n"
+    "ld1 { v31.h }[0], [x9], #0x2\n"
+    "ld1 { v28.h }[0], [x28], #0x2\n"
+    "ld1 { v21.h }[0], [x27], #0x2\n"
+    "ld1 { v26.h }[0], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[2], [x13], #0x1\n"
+    "ld1 { v1.b }[2], [x12], #0x1\n"
+    "ld1 { v25.b }[2], [x11], #0x1\n"
+    "ld1 { v23.b }[2], [x10], #0x1\n"
+    "ld1 { v31.b }[2], [x9], #0x1\n"
+    "ld1 { v28.b }[2], [x28], #0x1\n"
+    "ld1 { v21.b }[2], [x27], #0x1\n"
+    "ld1 { v26.b }[2], [x26], #0x1\n"
+    "b 10f\n"
+    "9:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[0], [x13], #0x1\n"
+    "ld1 { v1.b }[0], [x12], #0x1\n"
+    "ld1 { v25.b }[0], [x11], #0x1\n"
+    "ld1 { v23.b }[0], [x10], #0x1\n"
+    "ld1 { v31.b }[0], [x9], #0x1\n"
+    "ld1 { v28.b }[0], [x28], #0x1\n"
+    "ld1 { v21.b }[0], [x27], #0x1\n"
+    "ld1 { v26.b }[0], [x26], #0x1\n"
+    "10:"  // Oddments: Load (A): Bit 3: End
+    "ldp x13, x12, [%x[inptrs], #0x40]\n"
+    "add x13, x13, x23\n"
+    "ldp x11, x10, [%x[inptrs], #0x50]\n"
+    "ldp x9, x28, [%x[inptrs], #0x60]\n"
+    "add x12, x12, x23\n"
+    "ldp x27, x26, [%x[inptrs], #0x70]\n"
+    "add x11, x11, x23\n"
+    "add x10, x10, x23\n"
+    "add x9, x9, x23\n"
+    "add x28, x28, x23\n"
+    "add x27, x27, x23\n"
+    "add x26, x26, x23\n"
+    "tbz %x[n_channels], #3, 14f\n"
+    "ld1 { v24.d }[0], [x13], #0x8\n"
+    "ld1 { v22.d }[0], [x12], #0x8\n"
+    "ld1 { v20.d }[0], [x11], #0x8\n"
+    "ld1 { v16.d }[0], [x10], #0x8\n"
+    "ld1 { v19.d }[0], [x9], #0x8\n"
+    "ld1 { v0.d }[0], [x28], #0x8\n"
+    "ld1 { v18.d }[0], [x27], #0x8\n"
+    "ld1 { v17.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #2, 12f\n"
+    "ld1 { v24.s }[2], [x13], #0x4\n"
+    "ld1 { v22.s }[2], [x12], #0x4\n"
+    "ld1 { v20.s }[2], [x11], #0x4\n"
+    "ld1 { v16.s }[2], [x10], #0x4\n"
+    "ld1 { v19.s }[2], [x9], #0x4\n"
+    "ld1 { v0.s }[2], [x28], #0x4\n"
+    "ld1 { v18.s }[2], [x27], #0x4\n"
+    "ld1 { v17.s }[2], [x26], #0x4\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ld1 { v24.h }[6], [x13], #0x2\n"
+    "ld1 { v22.h }[6], [x12], #0x2\n"
+    "ld1 { v20.h }[6], [x11], #0x2\n"
+    "ld1 { v16.h }[6], [x10], #0x2\n"
+    "ld1 { v19.h }[6], [x9], #0x2\n"
+    "ld1 { v0.h }[6], [x28], #0x2\n"
+    "ld1 { v18.h }[6], [x27], #0x2\n"
+    "ld1 { v17.h }[6], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[14], [x13], #0x1\n"
+    "ld1 { v22.b }[14], [x12], #0x1\n"
+    "ld1 { v20.b }[14], [x11], #0x1\n"
+    "ld1 { v16.b }[14], [x10], #0x1\n"
+    "ld1 { v19.b }[14], [x9], #0x1\n"
+    "ld1 { v0.b }[14], [x28], #0x1\n"
+    "ld1 { v18.b }[14], [x27], #0x1\n"
+    "ld1 { v17.b }[14], [x26], #0x1\n"
+    "b 18f\n"
+    "11:"  // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[12], [x13], #0x1\n"
+    "ld1 { v22.b }[12], [x12], #0x1\n"
+    "ld1 { v20.b }[12], [x11], #0x1\n"
+    "ld1 { v16.b }[12], [x10], #0x1\n"
+    "ld1 { v19.b }[12], [x9], #0x1\n"
+    "ld1 { v0.b }[12], [x28], #0x1\n"
+    "ld1 { v18.b }[12], [x27], #0x1\n"
+    "ld1 { v17.b }[12], [x26], #0x1\n"
+    "b 18f\n"
+    "12:"  // Oddments: Load (B): Bit 3: Bit 2: Unset
+    "tbz %x[n_channels], #1, 13f\n"
+    "ld1 { v24.h }[4], [x13], #0x2\n"
+    "ld1 { v22.h }[4], [x12], #0x2\n"
+    "ld1 { v20.h }[4], [x11], #0x2\n"
+    "ld1 { v16.h }[4], [x10], #0x2\n"
+    "ld1 { v19.h }[4], [x9], #0x2\n"
+    "ld1 { v0.h }[4], [x28], #0x2\n"
+    "ld1 { v18.h }[4], [x27], #0x2\n"
+    "ld1 { v17.h }[4], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[10], [x13], #0x1\n"
+    "ld1 { v22.b }[10], [x12], #0x1\n"
+    "ld1 { v20.b }[10], [x11], #0x1\n"
+    "ld1 { v16.b }[10], [x10], #0x1\n"
+    "ld1 { v19.b }[10], [x9], #0x1\n"
+    "ld1 { v0.b }[10], [x28], #0x1\n"
+    "ld1 { v18.b }[10], [x27], #0x1\n"
+    "ld1 { v17.b }[10], [x26], #0x1\n"
+    "b 18f\n"
+    "13:"  // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[8], [x13], #0x1\n"
+    "ld1 { v22.b }[8], [x12], #0x1\n"
+    "ld1 { v20.b }[8], [x11], #0x1\n"
+    "ld1 { v16.b }[8], [x10], #0x1\n"
+    "ld1 { v19.b }[8], [x9], #0x1\n"
+    "ld1 { v0.b }[8], [x28], #0x1\n"
+    "ld1 { v18.b }[8], [x27], #0x1\n"
+    "ld1 { v17.b }[8], [x26], #0x1\n"
+    "b 18f\n"
+    "14:"  // Oddments: Load (B): Bit 3: Unset
+    "tbz %x[n_channels], #2, 16f\n"
+    "ld1 { v24.s }[0], [x13], #0x4\n"
+    "ld1 { v22.s }[0], [x12], #0x4\n"
+    "ld1 { v20.s }[0], [x11], #0x4\n"
+    "ld1 { v16.s }[0], [x10], #0x4\n"
+    "ld1 { v19.s }[0], [x9], #0x4\n"
+    "ld1 { v0.s }[0], [x28], #0x4\n"
+    "ld1 { v18.s }[0], [x27], #0x4\n"
+    "ld1 { v17.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ld1 { v24.h }[2], [x13], #0x2\n"
+    "ld1 { v22.h }[2], [x12], #0x2\n"
+    "ld1 { v20.h }[2], [x11], #0x2\n"
+    "ld1 { v16.h }[2], [x10], #0x2\n"
+    "ld1 { v19.h }[2], [x9], #0x2\n"
+    "ld1 { v0.h }[2], [x28], #0x2\n"
+    "ld1 { v18.h }[2], [x27], #0x2\n"
+    "ld1 { v17.h }[2], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[6], [x13], #0x1\n"
+    "ld1 { v22.b }[6], [x12], #0x1\n"
+    "ld1 { v20.b }[6], [x11], #0x1\n"
+    "ld1 { v16.b }[6], [x10], #0x1\n"
+    "ld1 { v19.b }[6], [x9], #0x1\n"
+    "ld1 { v0.b }[6], [x28], #0x1\n"
+    "ld1 { v18.b }[6], [x27], #0x1\n"
+    "ld1 { v17.b }[6], [x26], #0x1\n"
+    "b 18f\n"
+    "15:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[4], [x13], #0x1\n"
+    "ld1 { v22.b }[4], [x12], #0x1\n"
+    "ld1 { v20.b }[4], [x11], #0x1\n"
+    "ld1 { v16.b }[4], [x10], #0x1\n"
+    "ld1 { v19.b }[4], [x9], #0x1\n"
+    "ld1 { v0.b }[4], [x28], #0x1\n"
+    "ld1 { v18.b }[4], [x27], #0x1\n"
+    "ld1 { v17.b }[4], [x26], #0x1\n"
+    "b 18f\n"
+    "16:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+    "tbz %x[n_channels], #1, 17f\n"
+    "ld1 { v24.h }[0], [x13], #0x2\n"
+    "ld1 { v22.h }[0], [x12], #0x2\n"
+    "ld1 { v20.h }[0], [x11], #0x2\n"
+    "ld1 { v16.h }[0], [x10], #0x2\n"
+    "ld1 { v19.h }[0], [x9], #0x2\n"
+    "ld1 { v0.h }[0], [x28], #0x2\n"
+    "ld1 { v18.h }[0], [x27], #0x2\n"
+    "ld1 { v17.h }[0], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[2], [x13], #0x1\n"
+    "ld1 { v22.b }[2], [x12], #0x1\n"
+    "ld1 { v20.b }[2], [x11], #0x1\n"
+    "ld1 { v16.b }[2], [x10], #0x1\n"
+    "ld1 { v19.b }[2], [x9], #0x1\n"
+    "ld1 { v0.b }[2], [x28], #0x1\n"
+    "ld1 { v18.b }[2], [x27], #0x1\n"
+    "ld1 { v17.b }[2], [x26], #0x1\n"
+    "b 18f\n"
+    "17:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[0], [x13], #0x1\n"
+    "ld1 { v22.b }[0], [x12], #0x1\n"
+    "ld1 { v20.b }[0], [x11], #0x1\n"
+    "ld1 { v16.b }[0], [x10], #0x1\n"
+    "ld1 { v19.b }[0], [x9], #0x1\n"
+    "ld1 { v0.b }[0], [x28], #0x1\n"
+    "ld1 { v18.b }[0], [x27], #0x1\n"
+    "ld1 { v17.b }[0], [x26], #0x1\n"
+    "18:"  // Oddments: Load (B): Bit 3: End
+    "zip1 v7.16b, v27.16b, v25.16b\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "cmp x19, #0x4\n"
+    "zip2 v5.16b, v27.16b, v25.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "zip1 v8.16b, v1.16b, v23.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "zip2 v3.16b, v1.16b, v23.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "zip1 v2.16b, v31.16b, v21.16b\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    "zip2 v4.16b, v31.16b, v21.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    "zip1 v1.16b, v28.16b, v26.16b\n"
+    "zip2 v31.16b, v28.16b, v26.16b\n"
+    "zip1 v28.16b, v24.16b, v20.16b\n"
+    "zip2 v26.16b, v24.16b, v20.16b\n"
+    "zip1 v24.16b, v22.16b, v16.16b\n"
+    "zip2 v22.16b, v22.16b, v16.16b\n"
+    "zip1 v20.16b, v19.16b, v18.16b\n"
+    "zip2 v19.16b, v19.16b, v18.16b\n"
+    "zip1 v18.16b, v0.16b, v17.16b\n"
+    "zip2 v17.16b, v0.16b, v17.16b\n"
+    "zip1 v6.16b, v7.16b, v8.16b\n"
+    "zip2 v8.16b, v7.16b, v8.16b\n"
+    "zip1 v7.16b, v5.16b, v3.16b\n"
+    "str q7, [SP, #0x0]\n"
+    "zip2 v5.16b, v5.16b, v3.16b\n"
+    "str q5, [SP, #0x10]\n"
+    "zip1 v3.16b, v2.16b, v1.16b\n"
+    "zip2 v2.16b, v2.16b, v1.16b\n"
+    "zip1 v1.16b, v4.16b, v31.16b\n"
+    "str q1, [SP, #0x20]\n"
+    "zip2 v16.16b, v4.16b, v31.16b\n"
+    "str q16, [SP, #0x30]\n"
+    "zip1 v31.16b, v28.16b, v24.16b\n"
+    "zip2 v28.16b, v28.16b, v24.16b\n"
+    "zip1 v16.16b, v26.16b, v22.16b\n"
+    "str q16, [SP, #0x40]\n"
+    "zip2 v16.16b, v26.16b, v22.16b\n"
+    "str q16, [SP, #0x50]\n"
+    "zip1 v26.16b, v20.16b, v18.16b\n"
+    "zip2 v24.16b, v20.16b, v18.16b\n"
+    "zip1 v16.16b, v19.16b, v17.16b\n"
+    "str q16, [SP, #0x60]\n"
+    "zip2 v16.16b, v19.16b, v17.16b\n"
+    "str q16, [SP, #0x70]\n"
+    "mov v22.16b, v30.16b\n"
+    "mov v20.16b, v30.16b\n"
+    "mov v19.16b, v30.16b\n"
+    ".inst 0x4e8697be  // sdot v30.4s, v29.16b, v6.16b\n"
+    ".inst 0x4e8397b4  // sdot v20.4s, v29.16b, v3.16b\n"
+    "movi v15.4s, #0x0\n"
+    ".inst 0x4e83956f  // sdot v15.4s, v11.16b, v3.16b\n"
+    ".inst 0x4e83977e  // sdot v30.4s, v27.16b, v3.16b\n"
+    ".inst 0x4e9f9774  // sdot v20.4s, v27.16b, v31.16b\n"
+    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+    ".inst 0x4e9f956f  // sdot v15.4s, v11.16b, v31.16b\n"
+    ".inst 0x4e9f973e  // sdot v30.4s, v25.16b, v31.16b\n"
+    ".inst 0x4e9a9734  // sdot v20.4s, v25.16b, v26.16b\n"
+    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+    "mov v17.16b, v15.16b\n"
+    ".inst 0x4e86956f  // sdot v15.4s, v11.16b, v6.16b\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
+    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    ".inst 0x4e8697b6  // sdot v22.4s, v29.16b, v6.16b\n"
+    ".inst 0x4e8397b3  // sdot v19.4s, v29.16b, v3.16b\n"
+    "movi v10.4s, #0x0\n"
+    ".inst 0x4e83956a  // sdot v10.4s, v11.16b, v3.16b\n"
+    ".inst 0x4e839776  // sdot v22.4s, v27.16b, v3.16b\n"
+    ".inst 0x4e9f9773  // sdot v19.4s, v27.16b, v31.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x4e9f956a  // sdot v10.4s, v11.16b, v31.16b\n"
+    ".inst 0x4e9f9736  // sdot v22.4s, v25.16b, v31.16b\n"
+    ".inst 0x4e9a9733  // sdot v19.4s, v25.16b, v26.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x4e86956a  // sdot v10.4s, v11.16b, v6.16b\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "blt 19f\n"
+    "str s30, [x25, x23]\n"
+    "str s22, [x24, x23]\n"
+    "str s20, [x22, x23]\n"
+    "str s19, [x21, x23]\n"
+    "b 22f\n"
+    "19:"  // Oddments: Unroll 0: Oddment store
+    "add x25, x25, x23\n"
+    "add x24, x24, x23\n"
+    "add x22, x22, x23\n"
+    "add x21, x21, x23\n"
+    "tbz x19, #1, 20f\n"
+    "st1 { v30.h }[0], [x25], #0x2\n"
+    "st1 { v22.h }[0], [x24], #0x2\n"
+    "st1 { v20.h }[0], [x22], #0x2\n"
+    "st1 { v19.h }[0], [x21], #0x2\n"
+    "tbz x19, #0, 21f\n"
+    "st1 { v30.b }[2], [x25], #0x1\n"
+    "st1 { v22.b }[2], [x24], #0x1\n"
+    "st1 { v20.b }[2], [x22], #0x1\n"
+    "st1 { v19.b }[2], [x21], #0x1\n"
+    "b 21f\n"
+    "20:"  // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+    "tbz x19, #0, 21f\n"
+    "st1 { v30.b }[0], [x25], #0x1\n"
+    "st1 { v22.b }[0], [x24], #0x1\n"
+    "st1 { v20.b }[0], [x22], #0x1\n"
+    "st1 { v19.b }[0], [x21], #0x1\n"
+    "21:"  // Oddments: Unroll 0: Oddment store: Bit 1: End
+
+    "22:"  // Oddments: Unroll 0: After oddment store
+    "add x23, x23, #0x4\n"
+    "subs x19, x19, #0x4\n"
+    "ble 34f\n"
+    "movi v15.4s, #0x0\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    ".inst 0x4e82956f  // sdot v15.4s, v11.16b, v2.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "cmp x19, #0x4\n"
+    "movi v10.4s, #0x0\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "mov v22.16b, v30.16b\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    "mov v20.16b, v30.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    "mov v19.16b, v30.16b\n"
+    ".inst 0x4e8897be  // sdot v30.4s, v29.16b, v8.16b\n"
+    ".inst 0x4e8297b4  // sdot v20.4s, v29.16b, v2.16b\n"
+    ".inst 0x4e9c956f  // sdot v15.4s, v11.16b, v28.16b\n"
+    ".inst 0x4e82977e  // sdot v30.4s, v27.16b, v2.16b\n"
+    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+    ".inst 0x4e9c9774  // sdot v20.4s, v27.16b, v28.16b\n"
+    "mov v17.16b, v15.16b\n"
+    ".inst 0x4e88956f  // sdot v15.4s, v11.16b, v8.16b\n"
+    ".inst 0x4e9c973e  // sdot v30.4s, v25.16b, v28.16b\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    ".inst 0x4e989734  // sdot v20.4s, v25.16b, v24.16b\n"
+    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+    ".inst 0x4e8297b3  // sdot v19.4s, v29.16b, v2.16b\n"
+    ".inst 0x4e82956a  // sdot v10.4s, v11.16b, v2.16b\n"
+    ".inst 0x4e8897b6  // sdot v22.4s, v29.16b, v8.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x4e9c9773  // sdot v19.4s, v27.16b, v28.16b\n"
+    ".inst 0x4e9c956a  // sdot v10.4s, v11.16b, v28.16b\n"
+    ".inst 0x4e829776  // sdot v22.4s, v27.16b, v2.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    ".inst 0x4e9c9736  // sdot v22.4s, v25.16b, v28.16b\n"
+    ".inst 0x4e989733  // sdot v19.4s, v25.16b, v24.16b\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x4e88956a  // sdot v10.4s, v11.16b, v8.16b\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "blt 23f\n"
+    "str s30, [x25, x23]\n"
+    "str s22, [x24, x23]\n"
+    "str s20, [x22, x23]\n"
+    "str s19, [x21, x23]\n"
+    "b 26f\n"
+    "23:"  // Oddments: Unroll 1: Oddment store
+    "add x25, x25, x23\n"
+    "add x24, x24, x23\n"
+    "add x22, x22, x23\n"
+    "add x21, x21, x23\n"
+    "tbz x19, #1, 24f\n"
+    "st1 { v30.h }[0], [x25], #0x2\n"
+    "st1 { v22.h }[0], [x24], #0x2\n"
+    "st1 { v20.h }[0], [x22], #0x2\n"
+    "st1 { v19.h }[0], [x21], #0x2\n"
+    "tbz x19, #0, 25f\n"
+    "st1 { v30.b }[2], [x25], #0x1\n"
+    "st1 { v22.b }[2], [x24], #0x1\n"
+    "st1 { v20.b }[2], [x22], #0x1\n"
+    "st1 { v19.b }[2], [x21], #0x1\n"
+    "b 25f\n"
+    "24:"  // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+    "tbz x19, #0, 25f\n"
+    "st1 { v30.b }[0], [x25], #0x1\n"
+    "st1 { v22.b }[0], [x24], #0x1\n"
+    "st1 { v20.b }[0], [x22], #0x1\n"
+    "st1 { v19.b }[0], [x21], #0x1\n"
+    "25:"  // Oddments: Unroll 1: Oddment store: Bit 1: End
+
+    "26:"  // Oddments: Unroll 1: After oddment store
+    "add x23, x23, #0x4\n"
+    "subs x19, x19, #0x4\n"
+    "ble 34f\n"
+    "movi v15.4s, #0x0\n"
+    "ldr q6, [SP, #0x0]\n"
+    "movi v10.4s, #0x0\n"
+    "ldr q3, [SP, #0x20]\n"
+    "cmp x19, #0x4\n"
+    ".inst 0x4e83956f  // sdot v15.4s, v11.16b, v3.16b\n"
+    "ldr q31, [SP, #0x40]\n"
+    "ldr q26, [SP, #0x60]\n"
+    ".inst 0x4e9f956f  // sdot v15.4s, v11.16b, v31.16b\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "mov v22.16b, v30.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "mov v20.16b, v30.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "mov v19.16b, v30.16b\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    ".inst 0x4e8697be  // sdot v30.4s, v29.16b, v6.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    ".inst 0x4e8397b4  // sdot v20.4s, v29.16b, v3.16b\n"
+    "mov v17.16b, v15.16b\n"
+    ".inst 0x4e86956f  // sdot v15.4s, v11.16b, v6.16b\n"
+    ".inst 0x4e83977e  // sdot v30.4s, v27.16b, v3.16b\n"
+    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
+    ".inst 0x4e9f9774  // sdot v20.4s, v27.16b, v31.16b\n"
+    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+    ".inst 0x4e9f973e  // sdot v30.4s, v25.16b, v31.16b\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    ".inst 0x4e9a9734  // sdot v20.4s, v25.16b, v26.16b\n"
+    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    ".inst 0x4e8697b6  // sdot v22.4s, v29.16b, v6.16b\n"
+    ".inst 0x4e8397b3  // sdot v19.4s, v29.16b, v3.16b\n"
+    ".inst 0x4e83956a  // sdot v10.4s, v11.16b, v3.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x4e839776  // sdot v22.4s, v27.16b, v3.16b\n"
+    ".inst 0x4e9f9773  // sdot v19.4s, v27.16b, v31.16b\n"
+    ".inst 0x4e9f956a  // sdot v10.4s, v11.16b, v31.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    ".inst 0x4e9f9736  // sdot v22.4s, v25.16b, v31.16b\n"
+    ".inst 0x4e9a9733  // sdot v19.4s, v25.16b, v26.16b\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x4e86956a  // sdot v10.4s, v11.16b, v6.16b\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "blt 27f\n"
+    "str s30, [x25, x23]\n"
+    "str s22, [x24, x23]\n"
+    "str s20, [x22, x23]\n"
+    "str s19, [x21, x23]\n"
+    "b 30f\n"
+    "27:"  // Oddments: Unroll 2: Oddment store
+    "add x25, x25, x23\n"
+    "add x24, x24, x23\n"
+    "add x22, x22, x23\n"
+    "add x21, x21, x23\n"
+    "tbz x19, #1, 28f\n"
+    "st1 { v30.h }[0], [x25], #0x2\n"
+    "st1 { v22.h }[0], [x24], #0x2\n"
+    "st1 { v20.h }[0], [x22], #0x2\n"
+    "st1 { v19.h }[0], [x21], #0x2\n"
+    "tbz x19, #0, 29f\n"
+    "st1 { v30.b }[2], [x25], #0x1\n"
+    "st1 { v22.b }[2], [x24], #0x1\n"
+    "st1 { v20.b }[2], [x22], #0x1\n"
+    "st1 { v19.b }[2], [x21], #0x1\n"
+    "b 29f\n"
+    "28:"  // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+    "tbz x19, #0, 29f\n"
+    "st1 { v30.b }[0], [x25], #0x1\n"
+    "st1 { v22.b }[0], [x24], #0x1\n"
+    "st1 { v20.b }[0], [x22], #0x1\n"
+    "st1 { v19.b }[0], [x21], #0x1\n"
+    "29:"  // Oddments: Unroll 2: Oddment store: Bit 1: End
+
+    "30:"  // Oddments: Unroll 2: After oddment store
+    "add x23, x23, #0x4\n"
+    "subs x19, x19, #0x4\n"
+    "ble 34f\n"
+    "movi v15.4s, #0x0\n"
+    "ldr q8, [SP, #0x10]\n"
+    "movi v10.4s, #0x0\n"
+    "ldr q2, [SP, #0x30]\n"
+    "ldr q28, [SP, #0x50]\n"
+    ".inst 0x4e82956f  // sdot v15.4s, v11.16b, v2.16b\n"
+    "ldr q24, [SP, #0x70]\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "mov v22.16b, v30.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "mov v20.16b, v30.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "mov v19.16b, v30.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    ".inst 0x4e9c956f  // sdot v15.4s, v11.16b, v28.16b\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    ".inst 0x4e8897be  // sdot v30.4s, v29.16b, v8.16b\n"
+    "add %x[params], %x[params], #0x60\n"
+    ".inst 0x4e8297b4  // sdot v20.4s, v29.16b, v2.16b\n"
+    "mov v17.16b, v15.16b\n"
+    ".inst 0x4e88956f  // sdot v15.4s, v11.16b, v8.16b\n"
+    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
+    ".inst 0x4e82977e  // sdot v30.4s, v27.16b, v2.16b\n"
+    ".inst 0x4e9c9774  // sdot v20.4s, v27.16b, v28.16b\n"
+    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+    ".inst 0x4e9c973e  // sdot v30.4s, v25.16b, v28.16b\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    ".inst 0x4e989734  // sdot v20.4s, v25.16b, v24.16b\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+    ".inst 0x4e8897b6  // sdot v22.4s, v29.16b, v8.16b\n"
+    ".inst 0x4e8297b3  // sdot v19.4s, v29.16b, v2.16b\n"
+    ".inst 0x4e82956a  // sdot v10.4s, v11.16b, v2.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x4e829776  // sdot v22.4s, v27.16b, v2.16b\n"
+    ".inst 0x4e9c9773  // sdot v19.4s, v27.16b, v28.16b\n"
+    ".inst 0x4e9c956a  // sdot v10.4s, v11.16b, v28.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    ".inst 0x4e9c9736  // sdot v22.4s, v25.16b, v28.16b\n"
+    ".inst 0x4e989733  // sdot v19.4s, v25.16b, v24.16b\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x4e88956a  // sdot v10.4s, v11.16b, v8.16b\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "31:"  // Oddments: Unroll 3: Oddment store
+    "add x25, x25, x23\n"
+    "add x24, x24, x23\n"
+    "add x22, x22, x23\n"
+    "add x21, x21, x23\n"
+    "tbz x19, #1, 32f\n"
+    "st1 { v30.h }[0], [x25], #0x2\n"
+    "st1 { v22.h }[0], [x24], #0x2\n"
+    "st1 { v20.h }[0], [x22], #0x2\n"
+    "st1 { v19.h }[0], [x21], #0x2\n"
+    "tbz x19, #0, 33f\n"
+    "st1 { v30.b }[2], [x25], #0x1\n"
+    "st1 { v22.b }[2], [x24], #0x1\n"
+    "st1 { v20.b }[2], [x22], #0x1\n"
+    "st1 { v19.b }[2], [x21], #0x1\n"
+    "b 33f\n"
+    "32:"  // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+    "tbz x19, #0, 33f\n"
+    "st1 { v30.b }[0], [x25], #0x1\n"
+    "st1 { v22.b }[0], [x24], #0x1\n"
+    "st1 { v20.b }[0], [x22], #0x1\n"
+    "st1 { v19.b }[0], [x21], #0x1\n"
+    "33:"  // Oddments: Unroll 3: Oddment store: Bit 1: End
+
+    "34:"  // End
+    "add SP, SP, #0x80\n"
+    : [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
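
Note on the requantisation in the kernel above: each unroll ends with the usual
gemmlowp-style fixed-point sequence, and the running sdot accumulations against
v11 compute per-group input sums (the preamble of this file, not shown here,
appears to fill v11 with 1 in every byte lane) that mls then uses to fold in the
weight zero-point correction. The sdot instructions are emitted as raw .inst
words, presumably so the file still assembles on toolchains that predate the
dot-product extension. A scalar sketch of one output lane, ignoring sqrdmulh's
saturation corner case; the function and parameter names are ours, not the
kernel's:

    #include <algorithm>
    #include <cstdint>

    int8_t requantize_lane(int32_t acc,           // bias plus the sdot taps for this lane
                           int32_t row_sum,       // sum of the input bytes used (sdot with all-ones)
                           int32_t weight_offset, // weight zero point (Requantize32::b_offset, by our reading)
                           int32_t multiplier,    // per-channel fixed-point multiplier
                           int32_t right_shift,   // per-channel shift >= 0 (the asm keeps its negation for srshl)
                           int32_t c_offset, int32_t minval, int32_t maxval)
    {
        acc -= row_sum * weight_offset;  // the mls step
        // sqrdmulh: high half of the doubling multiply, rounded.
        int32_t t = (int32_t)(((int64_t)acc * multiplier + (1LL << 30)) >> 31);
        if (right_shift > 0)
        {
            if (t < 0) t -= 1;  // the and/sshr/sqadd trio: sign fixup before the rounding shift
            t = (t + (1 << (right_shift - 1))) >> right_shift;  // srshl with a negative shift amount
        }
        t += c_offset;  // add the output zero point
        return (int8_t)std::min(std::max(t, minval), maxval);  // smax/smin clamp, then uzp1 narrows
    }

In the vector code the same steps run four lanes at a time on v30/v22/v20/v19
before uzp1 packs the bytes for the four str s stores.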
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..76c927a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+struct a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+  a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
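
The geometry constants in this strategy follow the usual depthfirst tiling
relation: producing an output tile of output_rows x output_cols with a
kernel_rows x kernel_cols window at the given strides consumes an input patch of
(output - 1) * stride + kernel points per dimension, which is where input_rows =
input_cols = 4 comes from. A throwaway consistency check (the helper name is
ours):

    constexpr unsigned int input_extent(unsigned int out, unsigned int stride, unsigned int kern)
    {
        return (out - 1) * stride + kern;  // input points needed along one dimension
    }
    static_assert(input_extent(2, 1, 3) == 4, "a 2x2 tile of a 3x3/s1 kernel reads a 4x4 patch");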
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..3001276
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1196 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const int8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  int8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    unsigned long n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    int8_t *const *const outptrs;
+    const int8_t *inptrs[16];
+
+    Params(
+      unsigned long n_channels,
+      const int8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      int8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
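+      // Permute the raster-order input-point pointers into the fixed order in which the assembly below reads inptrs[].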
+      inptrs[0] = inptrs_raw[5];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[3];
+      inptrs[3] = inptrs_raw[6];
+      inptrs[4] = inptrs_raw[9];
+      inptrs[5] = inptrs_raw[12];
+      inptrs[6] = inptrs_raw[15];
+      inptrs[7] = inptrs_raw[1];
+      inptrs[8] = inptrs_raw[2];
+      inptrs[9] = inptrs_raw[10];
+      inptrs[10] = inptrs_raw[4];
+      inptrs[11] = inptrs_raw[7];
+      inptrs[12] = inptrs_raw[8];
+      inptrs[13] = inptrs_raw[11];
+      inptrs[14] = inptrs_raw[13];
+      inptrs[15] = inptrs_raw[14];
+
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "mov x17, #0x0\n"
+    "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x15, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "add x14, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "lsr x12, x8, #0x3\n"
+    "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v14.16b }, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v9.16b }, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1r { v15.4s }, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v24.4s }, [x20]\n"
+    "ld1r { v12.4s }, [x19]\n"
+    "ldp x10, x9, [x21, #0x0]\n"
+    "ldp x28, x27, [x21, #0x10]\n"
+    "cbz x12, 3f\n"
+    "subs x12, x12, #0x1\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q13, [x19, #0x0]\n"
+    "mov v17.16b, v13.16b\n"
+    "ldr q19, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "mov v16.16b, v13.16b\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v23.16b, v13.16b\n"
+    "ldr d0, [x16, #0x0]\n"
+    "ssubl v0.8h, v0.8b, v9.8b\n"
+    "mov v25.16b, v19.16b\n"
+    "ldr d1, [x16, #0x8]\n"
+    "mov v21.16b, v19.16b\n"
+    "ldr d2, [x16, #0x10]\n"
+    "ssubl v1.8h, v1.8b, v9.8b\n"
+    "mov v20.16b, v19.16b\n"
+    "ldr d3, [x16, #0x18]\n"
+    "ldr d4, [x16, #0x20]\n"
+    "ssubl v2.8h, v2.8b, v9.8b\n"
+    "ldr d5, [x16, #0x28]\n"
+    "ssubl v3.8h, v3.8b, v9.8b\n"
+    "ldr d6, [x16, #0x30]\n"
+    "ldr d7, [x16, #0x38]\n"
+    "ssubl v4.8h, v4.8b, v9.8b\n"
+    "ldr d8, [x16, #0x40]\n"
+    "ssubl v5.8h, v5.8b, v9.8b\n"
+    "ldp x23, x22, [x14, #0x0]\n"
+    "ssubl v6.8h, v6.8b, v9.8b\n"
+    "ldp x21, x20, [x14, #0x10]\n"
+    "ssubl v7.8h, v7.8b, v9.8b\n"
+    "ssubl v8.8h, v8.8b, v9.8b\n"
+    "ldr x19, [x14, #0x20]\n"
+    "ldr d31, [x23, x17]\n"
+    "ssubl v31.8h, v31.8b, v14.8b\n"
+    "ldr d30, [x22, x17]\n"
+    "ldr d29, [x21, x17]\n"
+    "ssubl v30.8h, v30.8b, v14.8b\n"
+    "ldr d28, [x20, x17]\n"
+    "ldr d27, [x19, x17]\n"
+    "ssubl v29.8h, v29.8b, v14.8b\n"
+    "ssubl v28.8h, v28.8b, v14.8b\n"
+    "ssubl v27.8h, v27.8b, v14.8b\n"
+    "beq 2f\n"
+    "1:"  // Loop
+    "smlal v13.4s, v31.4h, v4.4h\n"
+    "ldr x21, [x14, #0x28]\n"
+    "add x16, x16, #0x48\n"
+    "smlal2 v19.4s, v31.8h, v4.8h\n"
+    "ldr x20, [x14, #0x30]\n"
+    "subs x12, x12, #0x1\n"
+    "smlal v17.4s, v31.4h, v3.4h\n"
+    "ldr x26, [x14, #0x38]\n"
+    "smlal2 v25.4s, v31.8h, v3.8h\n"
+    "ldr x25, [x14, #0x40]\n"
+    "smlal v16.4s, v31.4h, v1.4h\n"
+    "ldr x19, [x14, #0x48]\n"
+    "smlal2 v21.4s, v31.8h, v1.8h\n"
+    "ldr x24, [x14, #0x50]\n"
+    "smlal v23.4s, v31.4h, v0.4h\n"
+    "ldr x23, [x14, #0x58]\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ldr d31, [x21, x17]\n"
+    "ssubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v30.4h, v0.4h\n"
+    "ldr x22, [x14, #0x60]\n"
+    "smlal2 v19.4s, v30.8h, v0.8h\n"
+    "ldr d30, [x19, x17]\n"
+    "ssubl v30.8h, v30.8b, v14.8b\n"
+    "smlal v17.4s, v29.4h, v2.4h\n"
+    "ldr x21, [x14, #0x68]\n"
+    "smlal2 v25.4s, v29.8h, v2.8h\n"
+    "ldr d29, [x20, x17]\n"
+    "ssubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v13.4s, v28.4h, v5.4h\n"
+    "ldr x20, [x14, #0x70]\n"
+    "smlal2 v19.4s, v28.8h, v5.8h\n"
+    "ldr x19, [x14, #0x78]\n"
+    "smlal v17.4s, v28.4h, v4.4h\n"
+    "ldr q26, [x13, #0x0]\n"
+    "smlal2 v25.4s, v28.8h, v4.8h\n"
+    "ldr q10, [x11, #0x0]\n"
+    "smlal v16.4s, v28.4h, v2.4h\n"
+    "ldr q11, [x13, #0x10]\n"
+    "add x13, x13, #0x20\n"
+    "smlal2 v21.4s, v28.8h, v2.8h\n"
+    "ldr q18, [x11, #0x10]\n"
+    "add x11, x11, #0x20\n"
+    "smlal v23.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x26, x17]\n"
+    "ssubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v16.4s, v31.4h, v6.4h\n"
+    "smlal2 v21.4s, v31.8h, v6.8h\n"
+    "ldr d31, [x25, x17]\n"
+    "ssubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v27.4h, v7.4h\n"
+    "smlal2 v19.4s, v27.8h, v7.8h\n"
+    "smlal v17.4s, v27.4h, v6.4h\n"
+    "smlal2 v25.4s, v27.8h, v6.8h\n"
+    "smlal v16.4s, v27.4h, v4.4h\n"
+    "smlal2 v21.4s, v27.8h, v4.8h\n"
+    "smlal v23.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "smlal v13.4s, v28.4h, v1.4h\n"
+    "smlal2 v19.4s, v28.8h, v1.8h\n"
+    "smlal v23.4s, v29.4h, v8.4h\n"
+    "smlal2 v20.4s, v29.8h, v8.8h\n"
+    "ldr d29, [x24, x17]\n"
+    "ssubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v17.4s, v28.4h, v0.4h\n"
+    "smlal2 v25.4s, v28.8h, v0.8h\n"
+    "ldr d28, [x23, x17]\n"
+    "ssubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v13.4s, v31.4h, v2.4h\n"
+    "smlal2 v19.4s, v31.8h, v2.8h\n"
+    "smlal v17.4s, v31.4h, v1.4h\n"
+    "smlal2 v25.4s, v31.8h, v1.8h\n"
+    "ldr d31, [x22, x17]\n"
+    "ssubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v30.4h, v8.4h\n"
+    "smlal2 v19.4s, v30.8h, v8.8h\n"
+    "smlal v17.4s, v30.4h, v7.4h\n"
+    "smlal2 v25.4s, v30.8h, v7.8h\n"
+    "smlal v16.4s, v30.4h, v5.4h\n"
+    "smlal2 v21.4s, v30.8h, v5.8h\n"
+    "smlal v23.4s, v30.4h, v4.4h\n"
+    "smlal2 v20.4s, v30.8h, v4.8h\n"
+    "ldr d30, [x21, x17]\n"
+    "ssubl v30.8h, v30.8b, v14.8b\n"
+    "smlal v13.4s, v29.4h, v3.4h\n"
+    "smlal2 v19.4s, v29.8h, v3.8h\n"
+    "smlal v16.4s, v29.4h, v0.4h\n"
+    "smlal2 v21.4s, v29.8h, v0.8h\n"
+    "ldr d29, [x20, x17]\n"
+    "ssubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v17.4s, v28.4h, v5.4h\n"
+    "smlal2 v25.4s, v28.8h, v5.8h\n"
+    "smlal v23.4s, v28.4h, v2.4h\n"
+    "smlal2 v20.4s, v28.8h, v2.8h\n"
+    "ldr d28, [x19, x17]\n"
+    "add x17, x17, #0x8\n"
+    "smlal v13.4s, v31.4h, v6.4h\n"
+    "ssubl v28.8h, v28.8b, v14.8b\n"
+    "smlal2 v19.4s, v31.8h, v6.8h\n"
+    "smlal v16.4s, v31.4h, v3.4h\n"
+    "smlal2 v21.4s, v31.8h, v3.8h\n"
+    "smlal v17.4s, v30.4h, v8.4h\n"
+    "smlal2 v25.4s, v30.8h, v8.8h\n"
+    "smlal v23.4s, v30.4h, v5.4h\n"
+    "smlal2 v20.4s, v30.8h, v5.8h\n"
+    "smlal v16.4s, v29.4h, v7.4h\n"
+    "smlal2 v21.4s, v29.8h, v7.8h\n"
+    "smlal v23.4s, v29.4h, v6.4h\n"
+    "smlal2 v20.4s, v29.8h, v6.8h\n"
+    "smlal v16.4s, v28.4h, v8.4h\n"
+    "smlal2 v21.4s, v28.8h, v8.8h\n"
+    "smlal v23.4s, v28.4h, v7.4h\n"
+    "smlal2 v20.4s, v28.8h, v7.8h\n"
+    "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+    "sqrdmulh v17.4s, v17.4s, v26.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v11.4s\n"
+    "and v22.16b, v13.16b, v10.16b\n"
+    "sshr v22.4s, v22.4s, #0x1f\n"
+    "and v28.16b, v19.16b, v18.16b\n"
+    "and v3.16b, v17.16b, v10.16b\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "and v6.16b, v25.16b, v18.16b\n"
+    "sqrdmulh v16.4s, v16.4s, v26.4s\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "sqrdmulh v21.4s, v21.4s, v11.4s\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "sqadd v13.4s, v13.4s, v22.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+    "and v0.16b, v16.16b, v10.16b\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "srshl v13.4s, v13.4s, v10.4s\n"
+    "sqadd v19.4s, v19.4s, v28.4s\n"
+    "sqadd v17.4s, v17.4s, v3.4s\n"
+    "sqadd v25.4s, v25.4s, v6.4s\n"
+    "and v29.16b, v21.16b, v18.16b\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "add v13.4s, v13.4s, v15.4s\n"
+    "srshl v19.4s, v19.4s, v18.4s\n"
+    "srshl v17.4s, v17.4s, v10.4s\n"
+    "srshl v25.4s, v25.4s, v18.4s\n"
+    "smin v13.4s, v13.4s, v12.4s\n"
+    "add v19.4s, v19.4s, v15.4s\n"
+    "add v17.4s, v17.4s, v15.4s\n"
+    "smax v13.4s, v13.4s, v24.4s\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "smin v17.4s, v17.4s, v12.4s\n"
+    "add v25.4s, v25.4s, v15.4s\n"
+    "smax v19.4s, v19.4s, v24.4s\n"
+    "smax v17.4s, v17.4s, v24.4s\n"
+    "smin v25.4s, v25.4s, v12.4s\n"
+    "uzp1 v13.16b, v13.16b, v19.16b\n"
+    "sqadd v16.4s, v16.4s, v0.4s\n"
+    "uzp1 v13.16b, v13.16b, v13.16b\n"
+    "str d13, [x10, x15]\n"
+    "smax v25.4s, v25.4s, v24.4s\n"
+    "sqadd v21.4s, v21.4s, v29.4s\n"
+    "srshl v16.4s, v16.4s, v10.4s\n"
+    "and v3.16b, v23.16b, v10.16b\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "uzp1 v17.16b, v17.16b, v25.16b\n"
+    "add v16.4s, v16.4s, v15.4s\n"
+    "srshl v21.4s, v21.4s, v18.4s\n"
+    "uzp1 v17.16b, v17.16b, v17.16b\n"
+    "str d17, [x9, x15]\n"
+    "smin v16.4s, v16.4s, v12.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v11.4s\n"
+    "add v21.4s, v21.4s, v15.4s\n"
+    "sqadd v23.4s, v23.4s, v3.4s\n"
+    "smax v16.4s, v16.4s, v24.4s\n"
+    "smin v21.4s, v21.4s, v12.4s\n"
+    "and v25.16b, v20.16b, v18.16b\n"
+    "sshr v25.4s, v25.4s, #0x1f\n"
+    "smax v21.4s, v21.4s, v24.4s\n"
+    "srshl v23.4s, v23.4s, v10.4s\n"
+    "uzp1 v16.16b, v16.16b, v21.16b\n"
+    "add v23.4s, v23.4s, v15.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "str d16, [x28, x15]\n"
+    "smin v23.4s, v23.4s, v12.4s\n"
+    "sqadd v20.4s, v20.4s, v25.4s\n"
+    "smax v23.4s, v23.4s, v24.4s\n"
+    "srshl v20.4s, v20.4s, v18.4s\n"
+    "add v20.4s, v20.4s, v15.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v24.4s\n"
+    "uzp1 v23.16b, v23.16b, v20.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str d23, [x27, x15]\n"
+    "add x15, x15, #0x8\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q13, [x19, #0x0]\n"
+    "mov v17.16b, v13.16b\n"
+    "ldr q19, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "mov v16.16b, v13.16b\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v23.16b, v13.16b\n"
+    "ldr d0, [x16, #0x0]\n"
+    "ssubl v0.8h, v0.8b, v9.8b\n"
+    "mov v25.16b, v19.16b\n"
+    "ldr d1, [x16, #0x8]\n"
+    "mov v21.16b, v19.16b\n"
+    "ldr d2, [x16, #0x10]\n"
+    "ssubl v1.8h, v1.8b, v9.8b\n"
+    "mov v20.16b, v19.16b\n"
+    "ldr d3, [x16, #0x18]\n"
+    "ldr d4, [x16, #0x20]\n"
+    "ssubl v2.8h, v2.8b, v9.8b\n"
+    "ldr d5, [x16, #0x28]\n"
+    "ssubl v3.8h, v3.8b, v9.8b\n"
+    "ldr d6, [x16, #0x30]\n"
+    "ldr d7, [x16, #0x38]\n"
+    "ssubl v4.8h, v4.8b, v9.8b\n"
+    "ldr d8, [x16, #0x40]\n"
+    "ssubl v5.8h, v5.8b, v9.8b\n"
+    "ldp x23, x22, [x14, #0x0]\n"
+    "ssubl v6.8h, v6.8b, v9.8b\n"
+    "ldp x21, x20, [x14, #0x10]\n"
+    "ssubl v7.8h, v7.8b, v9.8b\n"
+    "ssubl v8.8h, v8.8b, v9.8b\n"
+    "ldr x19, [x14, #0x20]\n"
+    "ldr d31, [x23, x17]\n"
+    "ssubl v31.8h, v31.8b, v14.8b\n"
+    "ldr d30, [x22, x17]\n"
+    "ldr d29, [x21, x17]\n"
+    "ssubl v30.8h, v30.8b, v14.8b\n"
+    "ldr d28, [x20, x17]\n"
+    "ldr d27, [x19, x17]\n"
+    "ssubl v29.8h, v29.8b, v14.8b\n"
+    "ssubl v28.8h, v28.8b, v14.8b\n"
+    "ssubl v27.8h, v27.8b, v14.8b\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    "smlal v13.4s, v31.4h, v4.4h\n"
+    "ldr x21, [x14, #0x28]\n"
+    "tst x8, #0x7\n"
+    "smlal2 v19.4s, v31.8h, v4.8h\n"
+    "ldr x20, [x14, #0x30]\n"
+    "smlal v17.4s, v31.4h, v3.4h\n"
+    "ldr x26, [x14, #0x38]\n"
+    "smlal2 v25.4s, v31.8h, v3.8h\n"
+    "ldr x25, [x14, #0x40]\n"
+    "smlal v16.4s, v31.4h, v1.4h\n"
+    "ldr x19, [x14, #0x48]\n"
+    "smlal2 v21.4s, v31.8h, v1.8h\n"
+    "ldr x24, [x14, #0x50]\n"
+    "smlal v23.4s, v31.4h, v0.4h\n"
+    "ldr x23, [x14, #0x58]\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ldr d31, [x21, x17]\n"
+    "ssubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v30.4h, v0.4h\n"
+    "ldr x22, [x14, #0x60]\n"
+    "smlal2 v19.4s, v30.8h, v0.8h\n"
+    "ldr d30, [x19, x17]\n"
+    "ssubl v30.8h, v30.8b, v14.8b\n"
+    "smlal v17.4s, v29.4h, v2.4h\n"
+    "ldr x21, [x14, #0x68]\n"
+    "smlal2 v25.4s, v29.8h, v2.8h\n"
+    "ldr d29, [x20, x17]\n"
+    "ssubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v13.4s, v28.4h, v5.4h\n"
+    "ldr x20, [x14, #0x70]\n"
+    "smlal2 v19.4s, v28.8h, v5.8h\n"
+    "ldr x19, [x14, #0x78]\n"
+    "smlal v17.4s, v28.4h, v4.4h\n"
+    "ldr q26, [x13, #0x0]\n"
+    "smlal2 v25.4s, v28.8h, v4.8h\n"
+    "ldr q10, [x11, #0x0]\n"
+    "smlal v16.4s, v28.4h, v2.4h\n"
+    "ldr q11, [x13, #0x10]\n"
+    "add x13, x13, #0x20\n"
+    "smlal2 v21.4s, v28.8h, v2.8h\n"
+    "ldr q18, [x11, #0x10]\n"
+    "add x11, x11, #0x20\n"
+    "smlal v23.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x26, x17]\n"
+    "ssubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v16.4s, v31.4h, v6.4h\n"
+    "smlal2 v21.4s, v31.8h, v6.8h\n"
+    "ldr d31, [x25, x17]\n"
+    "ssubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v27.4h, v7.4h\n"
+    "smlal2 v19.4s, v27.8h, v7.8h\n"
+    "smlal v17.4s, v27.4h, v6.4h\n"
+    "smlal2 v25.4s, v27.8h, v6.8h\n"
+    "smlal v16.4s, v27.4h, v4.4h\n"
+    "smlal2 v21.4s, v27.8h, v4.8h\n"
+    "smlal v23.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "smlal v13.4s, v28.4h, v1.4h\n"
+    "smlal2 v19.4s, v28.8h, v1.8h\n"
+    "smlal v23.4s, v29.4h, v8.4h\n"
+    "smlal2 v20.4s, v29.8h, v8.8h\n"
+    "ldr d29, [x24, x17]\n"
+    "ssubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v17.4s, v28.4h, v0.4h\n"
+    "smlal2 v25.4s, v28.8h, v0.8h\n"
+    "ldr d28, [x23, x17]\n"
+    "ssubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v13.4s, v31.4h, v2.4h\n"
+    "smlal2 v19.4s, v31.8h, v2.8h\n"
+    "smlal v17.4s, v31.4h, v1.4h\n"
+    "smlal2 v25.4s, v31.8h, v1.8h\n"
+    "ldr d31, [x22, x17]\n"
+    "ssubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v30.4h, v8.4h\n"
+    "smlal2 v19.4s, v30.8h, v8.8h\n"
+    "smlal v17.4s, v30.4h, v7.4h\n"
+    "smlal2 v25.4s, v30.8h, v7.8h\n"
+    "smlal v16.4s, v30.4h, v5.4h\n"
+    "smlal2 v21.4s, v30.8h, v5.8h\n"
+    "smlal v23.4s, v30.4h, v4.4h\n"
+    "smlal2 v20.4s, v30.8h, v4.8h\n"
+    "ldr d30, [x21, x17]\n"
+    "ssubl v30.8h, v30.8b, v14.8b\n"
+    "smlal v13.4s, v29.4h, v3.4h\n"
+    "smlal2 v19.4s, v29.8h, v3.8h\n"
+    "smlal v16.4s, v29.4h, v0.4h\n"
+    "smlal2 v21.4s, v29.8h, v0.8h\n"
+    "ldr d29, [x20, x17]\n"
+    "ssubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v17.4s, v28.4h, v5.4h\n"
+    "smlal2 v25.4s, v28.8h, v5.8h\n"
+    "smlal v23.4s, v28.4h, v2.4h\n"
+    "smlal2 v20.4s, v28.8h, v2.8h\n"
+    "ldr d28, [x19, x17]\n"
+    "add x17, x17, #0x8\n"
+    "smlal v13.4s, v31.4h, v6.4h\n"
+    "ssubl v28.8h, v28.8b, v14.8b\n"
+    "smlal2 v19.4s, v31.8h, v6.8h\n"
+    "smlal v16.4s, v31.4h, v3.4h\n"
+    "smlal2 v21.4s, v31.8h, v3.8h\n"
+    "smlal v17.4s, v30.4h, v8.4h\n"
+    "smlal2 v25.4s, v30.8h, v8.8h\n"
+    "smlal v23.4s, v30.4h, v5.4h\n"
+    "smlal2 v20.4s, v30.8h, v5.8h\n"
+    "smlal v16.4s, v29.4h, v7.4h\n"
+    "smlal2 v21.4s, v29.8h, v7.8h\n"
+    "smlal v23.4s, v29.4h, v6.4h\n"
+    "smlal2 v20.4s, v29.8h, v6.8h\n"
+    "smlal v16.4s, v28.4h, v8.4h\n"
+    "smlal2 v21.4s, v28.8h, v8.8h\n"
+    "smlal v23.4s, v28.4h, v7.4h\n"
+    "smlal2 v20.4s, v28.8h, v7.8h\n"
+    "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+    "sqrdmulh v17.4s, v17.4s, v26.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v11.4s\n"
+    "and v22.16b, v13.16b, v10.16b\n"
+    "sshr v22.4s, v22.4s, #0x1f\n"
+    "and v28.16b, v19.16b, v18.16b\n"
+    "and v3.16b, v17.16b, v10.16b\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "and v6.16b, v25.16b, v18.16b\n"
+    "sqrdmulh v16.4s, v16.4s, v26.4s\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "sqrdmulh v21.4s, v21.4s, v11.4s\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "sqadd v13.4s, v13.4s, v22.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+    "and v0.16b, v16.16b, v10.16b\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "srshl v13.4s, v13.4s, v10.4s\n"
+    "sqadd v19.4s, v19.4s, v28.4s\n"
+    "sqadd v17.4s, v17.4s, v3.4s\n"
+    "sqadd v25.4s, v25.4s, v6.4s\n"
+    "and v29.16b, v21.16b, v18.16b\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "add v13.4s, v13.4s, v15.4s\n"
+    "srshl v19.4s, v19.4s, v18.4s\n"
+    "srshl v17.4s, v17.4s, v10.4s\n"
+    "srshl v25.4s, v25.4s, v18.4s\n"
+    "smin v13.4s, v13.4s, v12.4s\n"
+    "add v19.4s, v19.4s, v15.4s\n"
+    "add v17.4s, v17.4s, v15.4s\n"
+    "smax v13.4s, v13.4s, v24.4s\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "smin v17.4s, v17.4s, v12.4s\n"
+    "add v25.4s, v25.4s, v15.4s\n"
+    "smax v19.4s, v19.4s, v24.4s\n"
+    "smax v17.4s, v17.4s, v24.4s\n"
+    "smin v25.4s, v25.4s, v12.4s\n"
+    "uzp1 v13.16b, v13.16b, v19.16b\n"
+    "sqadd v16.4s, v16.4s, v0.4s\n"
+    "uzp1 v13.16b, v13.16b, v13.16b\n"
+    "str d13, [x10, x15]\n"
+    "smax v25.4s, v25.4s, v24.4s\n"
+    "sqadd v21.4s, v21.4s, v29.4s\n"
+    "srshl v16.4s, v16.4s, v10.4s\n"
+    "and v3.16b, v23.16b, v10.16b\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "uzp1 v17.16b, v17.16b, v25.16b\n"
+    "add v16.4s, v16.4s, v15.4s\n"
+    "srshl v21.4s, v21.4s, v18.4s\n"
+    "uzp1 v17.16b, v17.16b, v17.16b\n"
+    "str d17, [x9, x15]\n"
+    "smin v16.4s, v16.4s, v12.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v11.4s\n"
+    "add v21.4s, v21.4s, v15.4s\n"
+    "sqadd v23.4s, v23.4s, v3.4s\n"
+    "smax v16.4s, v16.4s, v24.4s\n"
+    "smin v21.4s, v21.4s, v12.4s\n"
+    "and v25.16b, v20.16b, v18.16b\n"
+    "sshr v25.4s, v25.4s, #0x1f\n"
+    "smax v21.4s, v21.4s, v24.4s\n"
+    "srshl v23.4s, v23.4s, v10.4s\n"
+    "uzp1 v16.16b, v16.16b, v21.16b\n"
+    "add v23.4s, v23.4s, v15.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "str d16, [x28, x15]\n"
+    "smin v23.4s, v23.4s, v12.4s\n"
+    "sqadd v20.4s, v20.4s, v25.4s\n"
+    "smax v23.4s, v23.4s, v24.4s\n"
+    "srshl v20.4s, v20.4s, v18.4s\n"
+    "add v20.4s, v20.4s, v15.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v24.4s\n"
+    "uzp1 v23.16b, v23.16b, v20.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str d23, [x27, x15]\n"
+    "add x15, x15, #0x8\n"
+    "beq 64f\n"
+    "add x16, x16, #0x48\n"
+    "3:"  // Oddments
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "tbz x8, #2, 5f\n"
+    "ld1 { v13.4s }, [x19], #0x10\n"
+    "tbz x8, #1, 4f\n"
+    "ld1 { v19.d }[0], [x19], #0x8\n"
+    "tbz x8, #0, 7f\n"
+    "ld1 { v19.s }[2], [x19]\n"
+    "b 7f\n"
+    "4:"  // Oddments: Load bias: Bit 2: Bit 1: Unset
+    "tbz x8, #0, 7f\n"
+    "ld1 { v19.s }[0], [x19]\n"
+    "b 7f\n"
+    "5:"  // Oddments: Load bias: Bit 2: Unset
+    "tbz x8, #1, 6f\n"
+    "ld1 { v13.d }[0], [x19], #0x8\n"
+    "tbz x8, #0, 7f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 7f\n"
+    "ld1 { v13.s }[0], [x19]\n"
+    "7:"  // Oddments: Load bias: Bit 2: End
+    "mov v17.16b, v13.16b\n"
+    "ldr d0, [x16, #0x0]\n"
+    "mov v25.16b, v19.16b\n"
+    "ldr d1, [x16, #0x8]\n"
+    "mov v16.16b, v13.16b\n"
+    "ldr d2, [x16, #0x10]\n"
+    "mov v21.16b, v19.16b\n"
+    "ldr d3, [x16, #0x18]\n"
+    "mov v23.16b, v13.16b\n"
+    "ldr d4, [x16, #0x20]\n"
+    "ssubl v0.8h, v0.8b, v9.8b\n"
+    "mov v20.16b, v19.16b\n"
+    "ldr d5, [x16, #0x28]\n"
+    "ssubl v1.8h, v1.8b, v9.8b\n"
+    "ldr d6, [x16, #0x30]\n"
+    "ssubl v2.8h, v2.8b, v9.8b\n"
+    "ldr d7, [x16, #0x38]\n"
+    "ssubl v3.8h, v3.8b, v9.8b\n"
+    "ldr d8, [x16, #0x40]\n"
+    "ssubl v4.8h, v4.8b, v9.8b\n"
+    "ldp x23, x22, [x14, #0x0]\n"
+    "ssubl v5.8h, v5.8b, v9.8b\n"
+    "ldp x21, x20, [x14, #0x10]\n"
+    "ssubl v6.8h, v6.8b, v9.8b\n"
+    "ssubl v7.8h, v7.8b, v9.8b\n"
+    "ldr x19, [x14, #0x20]\n"
+    "ssubl v8.8h, v8.8b, v9.8b\n"
+    "add x23, x23, x17\n"
+    "add x22, x22, x17\n"
+    "add x21, x21, x17\n"
+    "add x20, x20, x17\n"
+    "add x19, x19, x17\n"
+    "tbz x8, #2, 9f\n"
+    "ld1 { v31.s }[0], [x23], #0x4\n"
+    "ld1 { v30.s }[0], [x22], #0x4\n"
+    "ld1 { v29.s }[0], [x21], #0x4\n"
+    "ld1 { v28.s }[0], [x20], #0x4\n"
+    "ld1 { v27.s }[0], [x19], #0x4\n"
+    "tbz x8, #1, 8f\n"
+    "ld1 { v31.h }[2], [x23], #0x2\n"
+    "ld1 { v30.h }[2], [x22], #0x2\n"
+    "ld1 { v29.h }[2], [x21], #0x2\n"
+    "ld1 { v28.h }[2], [x20], #0x2\n"
+    "ld1 { v27.h }[2], [x19], #0x2\n"
+    "tbz x8, #0, 11f\n"
+    "ld1 { v31.b }[6], [x23]\n"
+    "ld1 { v30.b }[6], [x22]\n"
+    "ld1 { v29.b }[6], [x21]\n"
+    "ld1 { v28.b }[6], [x20]\n"
+    "ld1 { v27.b }[6], [x19]\n"
+    "b 11f\n"
+    "8:"  // Oddments: Initial loads: Bit 2: Bit 1: Unset
+    "tbz x8, #0, 11f\n"
+    "ld1 { v31.b }[4], [x23]\n"
+    "ld1 { v30.b }[4], [x22]\n"
+    "ld1 { v29.b }[4], [x21]\n"
+    "ld1 { v28.b }[4], [x20]\n"
+    "ld1 { v27.b }[4], [x19]\n"
+    "b 11f\n"
+    "9:"  // Oddments: Initial loads: Bit 2: Unset
+    "tbz x8, #1, 10f\n"
+    "ld1 { v31.h }[0], [x23], #0x2\n"
+    "ld1 { v30.h }[0], [x22], #0x2\n"
+    "ld1 { v29.h }[0], [x21], #0x2\n"
+    "ld1 { v28.h }[0], [x20], #0x2\n"
+    "ld1 { v27.h }[0], [x19], #0x2\n"
+    "tbz x8, #0, 11f\n"
+    "ld1 { v31.b }[2], [x23]\n"
+    "ld1 { v30.b }[2], [x22]\n"
+    "ld1 { v29.b }[2], [x21]\n"
+    "ld1 { v28.b }[2], [x20]\n"
+    "ld1 { v27.b }[2], [x19]\n"
+    "b 11f\n"
+    "10:"  // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 11f\n"
+    "ld1 { v31.b }[0], [x23]\n"
+    "ld1 { v30.b }[0], [x22]\n"
+    "ld1 { v29.b }[0], [x21]\n"
+    "ld1 { v28.b }[0], [x20]\n"
+    "ld1 { v27.b }[0], [x19]\n"
+    "11:"  // Oddments: Initial loads: Bit 2: End
+    "ldr x21, [x14, #0x28]\n"
+    "ssubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v31.4h, v4.4h\n"
+    "ssubl v30.8h, v30.8b, v14.8b\n"
+    "smlal2 v19.4s, v31.8h, v4.8h\n"
+    "ssubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v17.4s, v31.4h, v3.4h\n"
+    "ssubl v28.8h, v28.8b, v14.8b\n"
+    "smlal2 v25.4s, v31.8h, v3.8h\n"
+    "ssubl v27.8h, v27.8b, v14.8b\n"
+    "smlal v16.4s, v31.4h, v1.4h\n"
+    "add x21, x21, x17\n"
+    "smlal2 v21.4s, v31.8h, v1.8h\n"
+    "smlal v23.4s, v31.4h, v0.4h\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "smlal v13.4s, v30.4h, v0.4h\n"
+    "smlal2 v19.4s, v30.8h, v0.8h\n"
+    "smlal v17.4s, v29.4h, v2.4h\n"
+    "smlal2 v25.4s, v29.8h, v2.8h\n"
+    "smlal v13.4s, v28.4h, v5.4h\n"
+    "smlal2 v19.4s, v28.8h, v5.8h\n"
+    "smlal v17.4s, v28.4h, v4.4h\n"
+    "smlal2 v25.4s, v28.8h, v4.8h\n"
+    "smlal v16.4s, v28.4h, v2.4h\n"
+    "smlal2 v21.4s, v28.8h, v2.8h\n"
+    "smlal v23.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "tbz x8, #2, 13f\n"
+    "ld1 { v31.s }[0], [x21], #0x4\n"
+    "tbz x8, #1, 12f\n"
+    "ld1 { v31.h }[2], [x21], #0x2\n"
+    "tbz x8, #0, 15f\n"
+    "ld1 { v31.b }[6], [x21]\n"
+    "b 15f\n"
+    "12:"  // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 15f\n"
+    "ld1 { v31.b }[4], [x21]\n"
+    "b 15f\n"
+    "13:"  // Oddments: Load (3, 0): Bit 2: Unset
+    "tbz x8, #1, 14f\n"
+    "ld1 { v31.h }[0], [x21], #0x2\n"
+    "tbz x8, #0, 15f\n"
+    "ld1 { v31.b }[2], [x21]\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 15f\n"
+    "ld1 { v31.b }[0], [x21]\n"
+    "15:"  // Oddments: Load (3, 0): Bit 2: End
+    "smlal v13.4s, v27.4h, v7.4h\n"
+    "ldr x20, [x14, #0x30]\n"
+    "ssubl v31.8h, v31.8b, v14.8b\n"
+    "smlal2 v19.4s, v27.8h, v7.8h\n"
+    "smlal v17.4s, v27.4h, v6.4h\n"
+    "add x20, x20, x17\n"
+    "smlal2 v25.4s, v27.8h, v6.8h\n"
+    "smlal v23.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "smlal v16.4s, v31.4h, v6.4h\n"
+    "smlal2 v21.4s, v31.8h, v6.8h\n"
+    "smlal v16.4s, v27.4h, v4.4h\n"
+    "smlal2 v21.4s, v27.8h, v4.8h\n"
+    "tbz x8, #2, 17f\n"
+    "ld1 { v29.s }[0], [x20], #0x4\n"
+    "tbz x8, #1, 16f\n"
+    "ld1 { v29.h }[2], [x20], #0x2\n"
+    "tbz x8, #0, 19f\n"
+    "ld1 { v29.b }[6], [x20]\n"
+    "b 19f\n"
+    "16:"  // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 19f\n"
+    "ld1 { v29.b }[4], [x20]\n"
+    "b 19f\n"
+    "17:"  // Oddments: Load (3, 3): Bit 2: Unset
+    "tbz x8, #1, 18f\n"
+    "ld1 { v29.h }[0], [x20], #0x2\n"
+    "tbz x8, #0, 19f\n"
+    "ld1 { v29.b }[2], [x20]\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 19f\n"
+    "ld1 { v29.b }[0], [x20]\n"
+    "19:"  // Oddments: Load (3, 3): Bit 2: End
+    "ldr x26, [x14, #0x38]\n"
+    "ssubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v23.4s, v29.4h, v8.4h\n"
+    "smlal2 v20.4s, v29.8h, v8.8h\n"
+    "add x26, x26, x17\n"
+    "tbz x8, #2, 21f\n"
+    "ld1 { v28.s }[0], [x26], #0x4\n"
+    "tbz x8, #1, 20f\n"
+    "ld1 { v28.h }[2], [x26], #0x2\n"
+    "tbz x8, #0, 23f\n"
+    "ld1 { v28.b }[6], [x26]\n"
+    "b 23f\n"
+    "20:"  // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 23f\n"
+    "ld1 { v28.b }[4], [x26]\n"
+    "b 23f\n"
+    "21:"  // Oddments: Load (0, 1): Bit 2: Unset
+    "tbz x8, #1, 22f\n"
+    "ld1 { v28.h }[0], [x26], #0x2\n"
+    "tbz x8, #0, 23f\n"
+    "ld1 { v28.b }[2], [x26]\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 23f\n"
+    "ld1 { v28.b }[0], [x26]\n"
+    "23:"  // Oddments: Load (0, 1): Bit 2: End
+    "ldr x25, [x14, #0x40]\n"
+    "ssubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v13.4s, v28.4h, v1.4h\n"
+    "smlal2 v19.4s, v28.8h, v1.8h\n"
+    "add x25, x25, x17\n"
+    "smlal v17.4s, v28.4h, v0.4h\n"
+    "smlal2 v25.4s, v28.8h, v0.8h\n"
+    "tbz x8, #2, 25f\n"
+    "ld1 { v31.s }[0], [x25], #0x4\n"
+    "tbz x8, #1, 24f\n"
+    "ld1 { v31.h }[2], [x25], #0x2\n"
+    "tbz x8, #0, 27f\n"
+    "ld1 { v31.b }[6], [x25]\n"
+    "b 27f\n"
+    "24:"  // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 27f\n"
+    "ld1 { v31.b }[4], [x25]\n"
+    "b 27f\n"
+    "25:"  // Oddments: Load (0, 2): Bit 2: Unset
+    "tbz x8, #1, 26f\n"
+    "ld1 { v31.h }[0], [x25], #0x2\n"
+    "tbz x8, #0, 27f\n"
+    "ld1 { v31.b }[2], [x25]\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 27f\n"
+    "ld1 { v31.b }[0], [x25]\n"
+    "27:"  // Oddments: Load (0, 2): Bit 2: End
+    "ldr x19, [x14, #0x48]\n"
+    "ssubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v31.4h, v2.4h\n"
+    "smlal2 v19.4s, v31.8h, v2.8h\n"
+    "add x19, x19, x17\n"
+    "smlal v17.4s, v31.4h, v1.4h\n"
+    "smlal2 v25.4s, v31.8h, v1.8h\n"
+    "tbz x8, #2, 29f\n"
+    "ld1 { v30.s }[0], [x19], #0x4\n"
+    "tbz x8, #1, 28f\n"
+    "ld1 { v30.h }[2], [x19], #0x2\n"
+    "tbz x8, #0, 31f\n"
+    "ld1 { v30.b }[6], [x19]\n"
+    "b 31f\n"
+    "28:"  // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 31f\n"
+    "ld1 { v30.b }[4], [x19]\n"
+    "b 31f\n"
+    "29:"  // Oddments: Load (2, 2): Bit 2: Unset
+    "tbz x8, #1, 30f\n"
+    "ld1 { v30.h }[0], [x19], #0x2\n"
+    "tbz x8, #0, 31f\n"
+    "ld1 { v30.b }[2], [x19]\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 31f\n"
+    "ld1 { v30.b }[0], [x19]\n"
+    "31:"  // Oddments: Load (2, 2): Bit 2: End
+    "ldr x24, [x14, #0x50]\n"
+    "ssubl v30.8h, v30.8b, v14.8b\n"
+    "smlal v13.4s, v30.4h, v8.4h\n"
+    "smlal2 v19.4s, v30.8h, v8.8h\n"
+    "add x24, x24, x17\n"
+    "smlal v17.4s, v30.4h, v7.4h\n"
+    "smlal2 v25.4s, v30.8h, v7.8h\n"
+    "smlal v16.4s, v30.4h, v5.4h\n"
+    "smlal2 v21.4s, v30.8h, v5.8h\n"
+    "smlal v23.4s, v30.4h, v4.4h\n"
+    "smlal2 v20.4s, v30.8h, v4.8h\n"
+    "tbz x8, #2, 33f\n"
+    "ld1 { v29.s }[0], [x24], #0x4\n"
+    "tbz x8, #1, 32f\n"
+    "ld1 { v29.h }[2], [x24], #0x2\n"
+    "tbz x8, #0, 35f\n"
+    "ld1 { v29.b }[6], [x24]\n"
+    "b 35f\n"
+    "32:"  // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 35f\n"
+    "ld1 { v29.b }[4], [x24]\n"
+    "b 35f\n"
+    "33:"  // Oddments: Load (1, 0): Bit 2: Unset
+    "tbz x8, #1, 34f\n"
+    "ld1 { v29.h }[0], [x24], #0x2\n"
+    "tbz x8, #0, 35f\n"
+    "ld1 { v29.b }[2], [x24]\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 35f\n"
+    "ld1 { v29.b }[0], [x24]\n"
+    "35:"  // Oddments: Load (1, 0): Bit 2: End
+    "ldr x23, [x14, #0x58]\n"
+    "ssubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v13.4s, v29.4h, v3.4h\n"
+    "smlal2 v19.4s, v29.8h, v3.8h\n"
+    "add x23, x23, x17\n"
+    "smlal v16.4s, v29.4h, v0.4h\n"
+    "smlal2 v21.4s, v29.8h, v0.8h\n"
+    "tbz x8, #2, 37f\n"
+    "ld1 { v28.s }[0], [x23], #0x4\n"
+    "tbz x8, #1, 36f\n"
+    "ld1 { v28.h }[2], [x23], #0x2\n"
+    "tbz x8, #0, 39f\n"
+    "ld1 { v28.b }[6], [x23]\n"
+    "b 39f\n"
+    "36:"  // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 39f\n"
+    "ld1 { v28.b }[4], [x23]\n"
+    "b 39f\n"
+    "37:"  // Oddments: Load (1, 3): Bit 2: Unset
+    "tbz x8, #1, 38f\n"
+    "ld1 { v28.h }[0], [x23], #0x2\n"
+    "tbz x8, #0, 39f\n"
+    "ld1 { v28.b }[2], [x23]\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 39f\n"
+    "ld1 { v28.b }[0], [x23]\n"
+    "39:"  // Oddments: Load (1, 3): Bit 2: End
+    "ldr x22, [x14, #0x60]\n"
+    "ssubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v17.4s, v28.4h, v5.4h\n"
+    "smlal2 v25.4s, v28.8h, v5.8h\n"
+    "add x22, x22, x17\n"
+    "smlal v23.4s, v28.4h, v2.4h\n"
+    "smlal2 v20.4s, v28.8h, v2.8h\n"
+    "tbz x8, #2, 41f\n"
+    "ld1 { v31.s }[0], [x22], #0x4\n"
+    "tbz x8, #1, 40f\n"
+    "ld1 { v31.h }[2], [x22], #0x2\n"
+    "tbz x8, #0, 43f\n"
+    "ld1 { v31.b }[6], [x22]\n"
+    "b 43f\n"
+    "40:"  // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 43f\n"
+    "ld1 { v31.b }[4], [x22]\n"
+    "b 43f\n"
+    "41:"  // Oddments: Load (2, 0): Bit 2: Unset
+    "tbz x8, #1, 42f\n"
+    "ld1 { v31.h }[0], [x22], #0x2\n"
+    "tbz x8, #0, 43f\n"
+    "ld1 { v31.b }[2], [x22]\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 43f\n"
+    "ld1 { v31.b }[0], [x22]\n"
+    "43:"  // Oddments: Load (2, 0): Bit 2: End
+    "ldr x21, [x14, #0x68]\n"
+    "ssubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v31.4h, v6.4h\n"
+    "smlal2 v19.4s, v31.8h, v6.8h\n"
+    "add x21, x21, x17\n"
+    "smlal v16.4s, v31.4h, v3.4h\n"
+    "smlal2 v21.4s, v31.8h, v3.8h\n"
+    "tbz x8, #2, 45f\n"
+    "ld1 { v30.s }[0], [x21], #0x4\n"
+    "tbz x8, #1, 44f\n"
+    "ld1 { v30.h }[2], [x21], #0x2\n"
+    "tbz x8, #0, 47f\n"
+    "ld1 { v30.b }[6], [x21]\n"
+    "b 47f\n"
+    "44:"  // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 47f\n"
+    "ld1 { v30.b }[4], [x21]\n"
+    "b 47f\n"
+    "45:"  // Oddments: Load (2, 3): Bit 2: Unset
+    "tbz x8, #1, 46f\n"
+    "ld1 { v30.h }[0], [x21], #0x2\n"
+    "tbz x8, #0, 47f\n"
+    "ld1 { v30.b }[2], [x21]\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 47f\n"
+    "ld1 { v30.b }[0], [x21]\n"
+    "47:"  // Oddments: Load (2, 3): Bit 2: End
+    "ldr x20, [x14, #0x70]\n"
+    "ssubl v30.8h, v30.8b, v14.8b\n"
+    "smlal v17.4s, v30.4h, v8.4h\n"
+    "smlal2 v25.4s, v30.8h, v8.8h\n"
+    "add x20, x20, x17\n"
+    "smlal v23.4s, v30.4h, v5.4h\n"
+    "smlal2 v20.4s, v30.8h, v5.8h\n"
+    "tbz x8, #2, 49f\n"
+    "ld1 { v29.s }[0], [x20], #0x4\n"
+    "tbz x8, #1, 48f\n"
+    "ld1 { v29.h }[2], [x20], #0x2\n"
+    "tbz x8, #0, 51f\n"
+    "ld1 { v29.b }[6], [x20]\n"
+    "b 51f\n"
+    "48:"  // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 51f\n"
+    "ld1 { v29.b }[4], [x20]\n"
+    "b 51f\n"
+    "49:"  // Oddments: Load (3, 1): Bit 2: Unset
+    "tbz x8, #1, 50f\n"
+    "ld1 { v29.h }[0], [x20], #0x2\n"
+    "tbz x8, #0, 51f\n"
+    "ld1 { v29.b }[2], [x20]\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 51f\n"
+    "ld1 { v29.b }[0], [x20]\n"
+    "51:"  // Oddments: Load (3, 1): Bit 2: End
+    "ldr x19, [x14, #0x78]\n"
+    "ssubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v16.4s, v29.4h, v7.4h\n"
+    "smlal2 v21.4s, v29.8h, v7.8h\n"
+    "add x19, x19, x17\n"
+    "smlal v23.4s, v29.4h, v6.4h\n"
+    "smlal2 v20.4s, v29.8h, v6.8h\n"
+    "tbz x8, #2, 53f\n"
+    "ld1 { v28.s }[0], [x19], #0x4\n"
+    "tbz x8, #1, 52f\n"
+    "ld1 { v28.h }[2], [x19], #0x2\n"
+    "tbz x8, #0, 55f\n"
+    "ld1 { v28.b }[6], [x19]\n"
+    "b 55f\n"
+    "52:"  // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 55f\n"
+    "ld1 { v28.b }[4], [x19]\n"
+    "b 55f\n"
+    "53:"  // Oddments: Load (3, 2): Bit 2: Unset
+    "tbz x8, #1, 54f\n"
+    "ld1 { v28.h }[0], [x19], #0x2\n"
+    "tbz x8, #0, 55f\n"
+    "ld1 { v28.b }[2], [x19]\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 55f\n"
+    "ld1 { v28.b }[0], [x19]\n"
+    "55:"  // Oddments: Load (3, 2): Bit 2: End
+    "ssubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v16.4s, v28.4h, v8.4h\n"
+    "smlal2 v21.4s, v28.8h, v8.8h\n"
+    "smlal v23.4s, v28.4h, v7.4h\n"
+    "smlal2 v20.4s, v28.8h, v7.8h\n"
+    "tbz x8, #2, 57f\n"
+    "ld1 { v26.4s }, [x13], #0x10\n"
+    "ld1 { v10.4s }, [x11], #0x10\n"
+    "tbz x8, #1, 56f\n"
+    "ld1 { v11.d }[0], [x13], #0x8\n"
+    "ld1 { v18.d }[0], [x11], #0x8\n"
+    "tbz x8, #0, 59f\n"
+    "ld1 { v11.s }[2], [x13]\n"
+    "ld1 { v18.s }[2], [x11]\n"
+    "b 59f\n"
+    "56:"  // Oddments: Load requant params: Bit 2: Bit 1: Unset
+    "tbz x8, #0, 59f\n"
+    "ld1 { v11.s }[0], [x13]\n"
+    "ld1 { v18.s }[0], [x11]\n"
+    "b 59f\n"
+    "57:"  // Oddments: Load requant params: Bit 2: Unset
+    "tbz x8, #1, 58f\n"
+    "ld1 { v26.d }[0], [x13], #0x8\n"
+    "ld1 { v10.d }[0], [x11], #0x8\n"
+    "tbz x8, #0, 59f\n"
+    "ld1 { v26.s }[2], [x13]\n"
+    "ld1 { v10.s }[2], [x11]\n"
+    "b 59f\n"
+    "58:"  // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 59f\n"
+    "ld1 { v26.s }[0], [x13]\n"
+    "ld1 { v10.s }[0], [x11]\n"
+    "59:"  // Oddments: Load requant params: Bit 2: End
+    "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+    "add x10, x10, x15\n"
+    "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+    "add x9, x9, x15\n"
+    "sqrdmulh v17.4s, v17.4s, v26.4s\n"
+    "add x28, x28, x15\n"
+    "sqrdmulh v25.4s, v25.4s, v11.4s\n"
+    "add x27, x27, x15\n"
+    "sqrdmulh v16.4s, v16.4s, v26.4s\n"
+    "and v22.16b, v13.16b, v10.16b\n"
+    "sshr v22.4s, v22.4s, #0x1f\n"
+    "and v28.16b, v19.16b, v18.16b\n"
+    "and v3.16b, v17.16b, v10.16b\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "and v6.16b, v25.16b, v18.16b\n"
+    "and v0.16b, v16.16b, v10.16b\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "sqrdmulh v21.4s, v21.4s, v11.4s\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "sqadd v13.4s, v13.4s, v22.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "sqrdmulh v20.4s, v20.4s, v11.4s\n"
+    "sqadd v19.4s, v19.4s, v28.4s\n"
+    "sqadd v17.4s, v17.4s, v3.4s\n"
+    "srshl v13.4s, v13.4s, v10.4s\n"
+    "sqadd v25.4s, v25.4s, v6.4s\n"
+    "srshl v19.4s, v19.4s, v18.4s\n"
+    "srshl v17.4s, v17.4s, v10.4s\n"
+    "add v13.4s, v13.4s, v15.4s\n"
+    "srshl v25.4s, v25.4s, v18.4s\n"
+    "add v19.4s, v19.4s, v15.4s\n"
+    "smin v13.4s, v13.4s, v12.4s\n"
+    "add v17.4s, v17.4s, v15.4s\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "smax v13.4s, v13.4s, v24.4s\n"
+    "smin v17.4s, v17.4s, v12.4s\n"
+    "smax v19.4s, v19.4s, v24.4s\n"
+    "add v25.4s, v25.4s, v15.4s\n"
+    "smax v17.4s, v17.4s, v24.4s\n"
+    "uzp1 v13.16b, v13.16b, v19.16b\n"
+    "smin v25.4s, v25.4s, v12.4s\n"
+    "uzp1 v13.16b, v13.16b, v13.16b\n"
+    "sqadd v16.4s, v16.4s, v0.4s\n"
+    "smax v25.4s, v25.4s, v24.4s\n"
+    "and v29.16b, v21.16b, v18.16b\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "uzp1 v17.16b, v17.16b, v25.16b\n"
+    "srshl v16.4s, v16.4s, v10.4s\n"
+    "and v3.16b, v23.16b, v10.16b\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "uzp1 v17.16b, v17.16b, v17.16b\n"
+    "add v16.4s, v16.4s, v15.4s\n"
+    "sqadd v21.4s, v21.4s, v29.4s\n"
+    "and v25.16b, v20.16b, v18.16b\n"
+    "sshr v25.4s, v25.4s, #0x1f\n"
+    "smin v16.4s, v16.4s, v12.4s\n"
+    "srshl v21.4s, v21.4s, v18.4s\n"
+    "sqadd v23.4s, v23.4s, v3.4s\n"
+    "smax v16.4s, v16.4s, v24.4s\n"
+    "add v21.4s, v21.4s, v15.4s\n"
+    "srshl v23.4s, v23.4s, v10.4s\n"
+    "sqadd v20.4s, v20.4s, v25.4s\n"
+    "smin v21.4s, v21.4s, v12.4s\n"
+    "add v23.4s, v23.4s, v15.4s\n"
+    "srshl v20.4s, v20.4s, v18.4s\n"
+    "smax v21.4s, v21.4s, v24.4s\n"
+    "smin v23.4s, v23.4s, v12.4s\n"
+    "uzp1 v16.16b, v16.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v15.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "smax v23.4s, v23.4s, v24.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v24.4s\n"
+    "uzp1 v23.16b, v23.16b, v20.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "tbz x8, #2, 61f\n"
+    "st1 { v13.s }[0], [x10], #0x4\n"
+    "st1 { v17.s }[0], [x9], #0x4\n"
+    "st1 { v16.s }[0], [x28], #0x4\n"
+    "st1 { v23.s }[0], [x27], #0x4\n"
+    "tbz x8, #1, 60f\n"
+    "st1 { v13.h }[2], [x10], #0x2\n"
+    "st1 { v17.h }[2], [x9], #0x2\n"
+    "st1 { v16.h }[2], [x28], #0x2\n"
+    "st1 { v23.h }[2], [x27], #0x2\n"
+    "tbz x8, #0, 63f\n"
+    "st1 { v13.b }[6], [x10], #0x1\n"
+    "st1 { v17.b }[6], [x9], #0x1\n"
+    "st1 { v16.b }[6], [x28], #0x1\n"
+    "st1 { v23.b }[6], [x27], #0x1\n"
+    "b 63f\n"
+    "60:"  // Oddments: Bit 2: Bit 1: Unset
+    "tbz x8, #0, 63f\n"
+    "st1 { v13.b }[4], [x10], #0x1\n"
+    "st1 { v17.b }[4], [x9], #0x1\n"
+    "st1 { v16.b }[4], [x28], #0x1\n"
+    "st1 { v23.b }[4], [x27], #0x1\n"
+    "b 63f\n"
+    "61:"  // Oddments: Bit 2: Unset
+    "tbz x8, #1, 62f\n"
+    "st1 { v13.h }[0], [x10], #0x2\n"
+    "st1 { v17.h }[0], [x9], #0x2\n"
+    "st1 { v16.h }[0], [x28], #0x2\n"
+    "st1 { v23.h }[0], [x27], #0x2\n"
+    "tbz x8, #0, 63f\n"
+    "st1 { v13.b }[2], [x10], #0x1\n"
+    "st1 { v17.b }[2], [x9], #0x1\n"
+    "st1 { v16.b }[2], [x28], #0x1\n"
+    "st1 { v23.b }[2], [x27], #0x1\n"
+    "b 63f\n"
+    "62:"  // Oddments: Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 63f\n"
+    "st1 { v13.b }[0], [x10], #0x1\n"
+    "st1 { v17.b }[0], [x9], #0x1\n"
+    "st1 { v16.b }[0], [x28], #0x1\n"
+    "st1 { v23.b }[0], [x27], #0x1\n"
+    "63:"  // Oddments: Bit 2: End
+
+    "64:"  // End
+
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
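For reference, a scalar model of the requantization these epilogues implement - a sketch assuming arm_gemm's Requantize32 convention (per-channel multiplier, shift stored as a negative right-shift amount); the function name and parameters are illustrative, not part of this patch:

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the SQRDMULH/SRSHL epilogue for one int32 accumulator.
    // The vector code adds an AND/SSHR/SQADD fixup so that negative values
    // round the same way as this reference.
    inline int8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                             int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // Rounding-doubling high multiply: round(acc * mul / 2^31).
        const int64_t prod = static_cast<int64_t>(acc) * mul;
        int32_t v = static_cast<int32_t>((prod + (1LL << 30)) >> 31);
        // Rounding right shift; requant shifts are stored negated.
        const int32_t n = -shift;
        if (n > 0) v = (v + (1 << (n - 1))) >> n;
        // Add the output zero point and clamp to the quantized range.
        v += c_offset;
        return static_cast<int8_t>(std::max(minval, std::min(maxval, v)));
    }

The smin/smax pair in the assembly corresponds to the clamp here, and the double uzp1 performs the narrowing that the final cast models.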
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..b20759e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+struct a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+  a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
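As a quick check on the constants above, the tile geometry follows the usual relation between output tile, stride and kernel size:

    input_rows = (output_rows - 1) * stride_rows + kernel_rows = (2 - 1) * 2 + 3 = 5

and likewise input_cols = (2 - 1) * 2 + 3 = 5, matching input_rows/input_cols in the struct and the 25-entry input-patch pointer array consumed by the implementation below.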
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..3b3d9c8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1423 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const int8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  int8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    int8_t *const *const outptrs;
+    const int8_t *inptrs[25];
+
+    Params(
+      long unsigned int n_channels,
+      const int8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      int8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
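+      // Reorder the 25 raw input-patch pointers into the order in which the
+      // kernel below reads them (assumption: inptrs_raw holds the 5x5 patch
+      // in row-major order).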
+      inptrs[0] = inptrs_raw[12];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[1];
+      inptrs[3] = inptrs_raw[3];
+      inptrs[4] = inptrs_raw[4];
+      inptrs[5] = inptrs_raw[5];
+      inptrs[6] = inptrs_raw[6];
+      inptrs[7] = inptrs_raw[2];
+      inptrs[8] = inptrs_raw[8];
+      inptrs[9] = inptrs_raw[9];
+      inptrs[10] = inptrs_raw[7];
+      inptrs[11] = inptrs_raw[15];
+      inptrs[12] = inptrs_raw[10];
+      inptrs[13] = inptrs_raw[16];
+      inptrs[14] = inptrs_raw[11];
+      inptrs[15] = inptrs_raw[18];
+      inptrs[16] = inptrs_raw[13];
+      inptrs[17] = inptrs_raw[19];
+      inptrs[18] = inptrs_raw[20];
+      inptrs[19] = inptrs_raw[14];
+      inptrs[20] = inptrs_raw[21];
+      inptrs[21] = inptrs_raw[17];
+      inptrs[22] = inptrs_raw[23];
+      inptrs[23] = inptrs_raw[22];
+      inptrs[24] = inptrs_raw[24];
+
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "mov x5, #0x0\n"
+    "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x7, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "add x8, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "lsr x16, x4, #0x3\n"
+    "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v12.16b }, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v13.16b }, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1r { v11.4s }, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v19.4s }, [x20]\n"
+    "ld1r { v14.4s }, [x19]\n"
+    "ldp x14, x13, [x21, #0x0]\n"
+    "ldp x12, x11, [x21, #0x10]\n"
+    "cbz x16, 3f\n"
+    "subs x16, x16, #0x1\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q15, [x19, #0x0]\n"
+    "mov v20.16b, v15.16b\n"
+    "ldr q10, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "mov v16.16b, v15.16b\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v17.16b, v15.16b\n"
+    "ldr d0, [x6, #0x0]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "mov v23.16b, v10.16b\n"
+    "ldr d1, [x6, #0x8]\n"
+    "mov v22.16b, v10.16b\n"
+    "ldr d2, [x6, #0x10]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "mov v18.16b, v10.16b\n"
+    "ldr d3, [x6, #0x18]\n"
+    "ldr d4, [x6, #0x20]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "ldr d5, [x6, #0x28]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "ldr d6, [x6, #0x30]\n"
+    "ldr d7, [x6, #0x38]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "ldr d8, [x6, #0x40]\n"
+    "ssubl v5.8h, v5.8b, v13.8b\n"
+    "ldp x26, x25, [x8, #0x0]\n"
+    "ssubl v6.8h, v6.8b, v13.8b\n"
+    "ldp x24, x23, [x8, #0x10]\n"
+    "ssubl v7.8h, v7.8b, v13.8b\n"
+    "ssubl v8.8h, v8.8b, v13.8b\n"
+    "ldp x22, x21, [x8, #0x20]\n"
+    "ldp x20, x19, [x8, #0x30]\n"
+    "ldr d31, [x26, x5]\n"
+    "ssubl v31.8h, v31.8b, v12.8b\n"
+    "ldr d30, [x25, x5]\n"
+    "ldr d29, [x24, x5]\n"
+    "ssubl v30.8h, v30.8b, v12.8b\n"
+    "ldr d28, [x23, x5]\n"
+    "ldr d27, [x22, x5]\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "ldr d26, [x21, x5]\n"
+    "ssubl v28.8h, v28.8b, v12.8b\n"
+    "ldr d25, [x20, x5]\n"
+    "ldr d24, [x19, x5]\n"
+    "ssubl v27.8h, v27.8b, v12.8b\n"
+    "ssubl v26.8h, v26.8b, v12.8b\n"
+    "ssubl v25.8h, v25.8b, v12.8b\n"
+    "ssubl v24.8h, v24.8b, v12.8b\n"
+    "beq 2f\n"
+    "1:"  // Loop
+    "smlal v15.4s, v31.4h, v8.4h\n"
+    "ldr x23, [x8, #0x40]\n"
+    "add x6, x6, #0x48\n"
+    "smlal2 v10.4s, v31.8h, v8.8h\n"
+    "ldr x22, [x8, #0x48]\n"
+    "subs x16, x16, #0x1\n"
+    "smlal v20.4s, v31.4h, v6.4h\n"
+    "ldr x21, [x8, #0x50]\n"
+    "smlal2 v23.4s, v31.8h, v6.8h\n"
+    "ldr x20, [x8, #0x58]\n"
+    "smlal v16.4s, v31.4h, v2.4h\n"
+    "ldr x19, [x8, #0x60]\n"
+    "smlal2 v22.4s, v31.8h, v2.8h\n"
+    "ldr x10, [x8, #0x68]\n"
+    "smlal v17.4s, v31.4h, v0.4h\n"
+    "ldr x9, [x8, #0x70]\n"
+    "smlal2 v18.4s, v31.8h, v0.8h\n"
+    "ldr x28, [x8, #0x78]\n"
+    "smlal v15.4s, v30.4h, v0.4h\n"
+    "ldr x27, [x8, #0x80]\n"
+    "smlal2 v10.4s, v30.8h, v0.8h\n"
+    "ldr x26, [x8, #0x88]\n"
+    "smlal v20.4s, v28.4h, v1.4h\n"
+    "ldr x25, [x8, #0x90]\n"
+    "smlal2 v23.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x22, x5]\n"
+    "ssubl v28.8h, v28.8b, v12.8b\n"
+    "smlal v15.4s, v29.4h, v1.4h\n"
+    "ldr x24, [x8, #0x98]\n"
+    "smlal2 v10.4s, v29.8h, v1.8h\n"
+    "ldr d29, [x23, x5]\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v27.4h, v2.4h\n"
+    "ldr x23, [x8, #0xa0]\n"
+    "smlal2 v23.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x21, x5]\n"
+    "ssubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v15.4s, v26.4h, v3.4h\n"
+    "ldr x22, [x8, #0xa8]\n"
+    "smlal2 v10.4s, v26.8h, v3.8h\n"
+    "ldr d26, [x20, x5]\n"
+    "ssubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v15.4s, v25.4h, v4.4h\n"
+    "ldr x21, [x8, #0xb0]\n"
+    "smlal2 v10.4s, v25.8h, v4.8h\n"
+    "ldr d25, [x19, x5]\n"
+    "ssubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "ldr x20, [x8, #0xb8]\n"
+    "smlal2 v10.4s, v24.8h, v2.8h\n"
+    "ldr x19, [x8, #0xc0]\n"
+    "smlal v20.4s, v24.4h, v0.4h\n"
+    "ldr q21, [x17, #0x0]\n"
+    "smlal2 v23.4s, v24.8h, v0.8h\n"
+    "ldr d24, [x9, x5]\n"
+    "ssubl v24.8h, v24.8b, v12.8b\n"
+    "smlal v20.4s, v29.4h, v4.4h\n"
+    "ldr q30, [x15, #0x0]\n"
+    "smlal2 v23.4s, v29.8h, v4.8h\n"
+    "ldr d29, [x10, x5]\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v28.4h, v5.4h\n"
+    "ldr q31, [x17, #0x10]\n"
+    "smlal2 v23.4s, v28.8h, v5.8h\n"
+    "ldr d28, [x27, x5]\n"
+    "add x17, x17, #0x20\n"
+    "smlal v15.4s, v27.4h, v5.4h\n"
+    "ldr q9, [x15, #0x10]\n"
+    "add x15, x15, #0x20\n"
+    "smlal2 v10.4s, v27.8h, v5.8h\n"
+    "ssubl v28.8h, v28.8b, v12.8b\n"
+    "smlal v20.4s, v27.4h, v3.4h\n"
+    "smlal2 v23.4s, v27.8h, v3.8h\n"
+    "ldr d27, [x28, x5]\n"
+    "ssubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v16.4s, v26.4h, v3.4h\n"
+    "smlal2 v22.4s, v26.8h, v3.8h\n"
+    "ldr d26, [x26, x5]\n"
+    "ssubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v15.4s, v25.4h, v6.4h\n"
+    "smlal2 v10.4s, v25.8h, v6.8h\n"
+    "smlal v16.4s, v25.4h, v0.4h\n"
+    "smlal2 v22.4s, v25.8h, v0.8h\n"
+    "ldr d25, [x25, x5]\n"
+    "ssubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v16.4s, v29.4h, v4.4h\n"
+    "smlal2 v22.4s, v29.8h, v4.8h\n"
+    "ldr d29, [x24, x5]\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v15.4s, v24.4h, v7.4h\n"
+    "smlal2 v10.4s, v24.8h, v7.8h\n"
+    "smlal v16.4s, v24.4h, v1.4h\n"
+    "smlal2 v22.4s, v24.8h, v1.8h\n"
+    "ldr d24, [x22, x5]\n"
+    "ssubl v24.8h, v24.8b, v12.8b\n"
+    "smlal v17.4s, v27.4h, v4.4h\n"
+    "smlal2 v18.4s, v27.8h, v4.8h\n"
+    "ldr d27, [x23, x5]\n"
+    "ssubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v20.4s, v28.4h, v7.4h\n"
+    "smlal2 v23.4s, v28.8h, v7.8h\n"
+    "smlal v17.4s, v28.4h, v1.4h\n"
+    "smlal2 v18.4s, v28.8h, v1.8h\n"
+    "smlal v16.4s, v25.4h, v6.4h\n"
+    "smlal2 v22.4s, v25.8h, v6.8h\n"
+    "ldr d25, [x20, x5]\n"
+    "ssubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v17.4s, v26.4h, v5.4h\n"
+    "smlal2 v18.4s, v26.8h, v5.8h\n"
+    "ldr d26, [x21, x5]\n"
+    "ssubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v20.4s, v29.4h, v8.4h\n"
+    "smlal2 v23.4s, v29.8h, v8.8h\n"
+    "smlal v17.4s, v29.4h, v2.4h\n"
+    "smlal2 v18.4s, v29.8h, v2.8h\n"
+    "ldr d29, [x19, x5]\n"
+    "add x5, x5, #0x8\n"
+    "smlal v16.4s, v27.4h, v7.4h\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal2 v22.4s, v27.8h, v7.8h\n"
+    "smlal v17.4s, v24.4h, v3.4h\n"
+    "smlal v16.4s, v24.4h, v5.4h\n"
+    "smlal2 v18.4s, v24.8h, v3.8h\n"
+    "sqrdmulh v15.4s, v15.4s, v21.4s\n"
+    "smlal2 v22.4s, v24.8h, v5.8h\n"
+    "smlal v17.4s, v26.4h, v7.4h\n"
+    "smlal2 v18.4s, v26.8h, v7.8h\n"
+    "smlal v16.4s, v25.4h, v8.4h\n"
+    "smlal2 v22.4s, v25.8h, v8.8h\n"
+    "smlal v17.4s, v25.4h, v6.4h\n"
+    "smlal2 v18.4s, v25.8h, v6.8h\n"
+    "and v26.16b, v15.16b, v30.16b\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "smlal v17.4s, v29.4h, v8.4h\n"
+    "smlal2 v18.4s, v29.8h, v8.8h\n"
+    "sqrdmulh v10.4s, v10.4s, v31.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v21.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v31.4s\n"
+    "sqrdmulh v16.4s, v16.4s, v21.4s\n"
+    "sqadd v15.4s, v15.4s, v26.4s\n"
+    "and v8.16b, v10.16b, v9.16b\n"
+    "sshr v8.4s, v8.4s, #0x1f\n"
+    "srshl v15.4s, v15.4s, v30.4s\n"
+    "and v4.16b, v20.16b, v30.16b\n"
+    "sshr v4.4s, v4.4s, #0x1f\n"
+    "and v2.16b, v23.16b, v9.16b\n"
+    "and v1.16b, v16.16b, v30.16b\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "add v15.4s, v15.4s, v11.4s\n"
+    "sqadd v10.4s, v10.4s, v8.4s\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+    "sqadd v20.4s, v20.4s, v4.4s\n"
+    "smin v15.4s, v15.4s, v14.4s\n"
+    "srshl v10.4s, v10.4s, v9.4s\n"
+    "sqadd v23.4s, v23.4s, v2.4s\n"
+    "smax v15.4s, v15.4s, v19.4s\n"
+    "srshl v20.4s, v20.4s, v30.4s\n"
+    "add v10.4s, v10.4s, v11.4s\n"
+    "srshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v16.4s, v16.4s, v1.4s\n"
+    "smin v10.4s, v10.4s, v14.4s\n"
+    "add v20.4s, v20.4s, v11.4s\n"
+    "add v23.4s, v23.4s, v11.4s\n"
+    "smax v10.4s, v10.4s, v19.4s\n"
+    "smin v20.4s, v20.4s, v14.4s\n"
+    "smin v23.4s, v23.4s, v14.4s\n"
+    "uzp1 v15.16b, v15.16b, v10.16b\n"
+    "smax v20.4s, v20.4s, v19.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "str d15, [x14, x7]\n"
+    "smax v23.4s, v23.4s, v19.4s\n"
+    "srshl v16.4s, v16.4s, v30.4s\n"
+    "and v24.16b, v22.16b, v9.16b\n"
+    "sshr v24.4s, v24.4s, #0x1f\n"
+    "uzp1 v20.16b, v20.16b, v23.16b\n"
+    "add v16.4s, v16.4s, v11.4s\n"
+    "sqrdmulh v17.4s, v17.4s, v21.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str d20, [x13, x7]\n"
+    "smin v16.4s, v16.4s, v14.4s\n"
+    "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+    "sqadd v22.4s, v22.4s, v24.4s\n"
+    "and v2.16b, v17.16b, v30.16b\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "smax v16.4s, v16.4s, v19.4s\n"
+    "srshl v22.4s, v22.4s, v9.4s\n"
+    "and v31.16b, v18.16b, v9.16b\n"
+    "sshr v31.4s, v31.4s, #0x1f\n"
+    "add v22.4s, v22.4s, v11.4s\n"
+    "sqadd v17.4s, v17.4s, v2.4s\n"
+    "smin v22.4s, v22.4s, v14.4s\n"
+    "srshl v17.4s, v17.4s, v30.4s\n"
+    "sqadd v18.4s, v18.4s, v31.4s\n"
+    "smax v22.4s, v22.4s, v19.4s\n"
+    "uzp1 v16.16b, v16.16b, v22.16b\n"
+    "add v17.4s, v17.4s, v11.4s\n"
+    "srshl v18.4s, v18.4s, v9.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "str d16, [x12, x7]\n"
+    "smin v17.4s, v17.4s, v14.4s\n"
+    "add v18.4s, v18.4s, v11.4s\n"
+    "smax v17.4s, v17.4s, v19.4s\n"
+    "smin v18.4s, v18.4s, v14.4s\n"
+    "smax v18.4s, v18.4s, v19.4s\n"
+    "uzp1 v17.16b, v17.16b, v18.16b\n"
+    "uzp1 v17.16b, v17.16b, v17.16b\n"
+    "str d17, [x11, x7]\n"
+    "add x7, x7, #0x8\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q15, [x19, #0x0]\n"
+    "mov v20.16b, v15.16b\n"
+    "ldr q10, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "mov v16.16b, v15.16b\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v17.16b, v15.16b\n"
+    "ldr d0, [x6, #0x0]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "mov v23.16b, v10.16b\n"
+    "ldr d1, [x6, #0x8]\n"
+    "mov v22.16b, v10.16b\n"
+    "ldr d2, [x6, #0x10]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "mov v18.16b, v10.16b\n"
+    "ldr d3, [x6, #0x18]\n"
+    "ldr d4, [x6, #0x20]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "ldr d5, [x6, #0x28]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "ldr d6, [x6, #0x30]\n"
+    "ldr d7, [x6, #0x38]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "ldr d8, [x6, #0x40]\n"
+    "ssubl v5.8h, v5.8b, v13.8b\n"
+    "ldp x26, x25, [x8, #0x0]\n"
+    "ssubl v6.8h, v6.8b, v13.8b\n"
+    "ldp x24, x23, [x8, #0x10]\n"
+    "ssubl v7.8h, v7.8b, v13.8b\n"
+    "ssubl v8.8h, v8.8b, v13.8b\n"
+    "ldp x22, x21, [x8, #0x20]\n"
+    "ldp x20, x19, [x8, #0x30]\n"
+    "ldr d31, [x26, x5]\n"
+    "ssubl v31.8h, v31.8b, v12.8b\n"
+    "ldr d30, [x25, x5]\n"
+    "ldr d29, [x24, x5]\n"
+    "ssubl v30.8h, v30.8b, v12.8b\n"
+    "ldr d28, [x23, x5]\n"
+    "ldr d27, [x22, x5]\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "ldr d26, [x21, x5]\n"
+    "ssubl v28.8h, v28.8b, v12.8b\n"
+    "ldr d25, [x20, x5]\n"
+    "ldr d24, [x19, x5]\n"
+    "ssubl v27.8h, v27.8b, v12.8b\n"
+    "ssubl v26.8h, v26.8b, v12.8b\n"
+    "ssubl v25.8h, v25.8b, v12.8b\n"
+    "ssubl v24.8h, v24.8b, v12.8b\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    "smlal v15.4s, v31.4h, v8.4h\n"
+    "ldr x23, [x8, #0x40]\n"
+    "tst x4, #0x7\n"
+    "smlal2 v10.4s, v31.8h, v8.8h\n"
+    "ldr x22, [x8, #0x48]\n"
+    "smlal v20.4s, v31.4h, v6.4h\n"
+    "ldr x21, [x8, #0x50]\n"
+    "smlal2 v23.4s, v31.8h, v6.8h\n"
+    "ldr x20, [x8, #0x58]\n"
+    "smlal v16.4s, v31.4h, v2.4h\n"
+    "ldr x19, [x8, #0x60]\n"
+    "smlal2 v22.4s, v31.8h, v2.8h\n"
+    "ldr x10, [x8, #0x68]\n"
+    "smlal v17.4s, v31.4h, v0.4h\n"
+    "ldr x9, [x8, #0x70]\n"
+    "smlal2 v18.4s, v31.8h, v0.8h\n"
+    "ldr x28, [x8, #0x78]\n"
+    "smlal v15.4s, v30.4h, v0.4h\n"
+    "ldr x27, [x8, #0x80]\n"
+    "smlal2 v10.4s, v30.8h, v0.8h\n"
+    "ldr x26, [x8, #0x88]\n"
+    "smlal v20.4s, v28.4h, v1.4h\n"
+    "ldr x25, [x8, #0x90]\n"
+    "smlal2 v23.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x22, x5]\n"
+    "ssubl v28.8h, v28.8b, v12.8b\n"
+    "smlal v15.4s, v29.4h, v1.4h\n"
+    "ldr x24, [x8, #0x98]\n"
+    "smlal2 v10.4s, v29.8h, v1.8h\n"
+    "ldr d29, [x23, x5]\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v27.4h, v2.4h\n"
+    "ldr x23, [x8, #0xa0]\n"
+    "smlal2 v23.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x21, x5]\n"
+    "ssubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v15.4s, v26.4h, v3.4h\n"
+    "ldr x22, [x8, #0xa8]\n"
+    "smlal2 v10.4s, v26.8h, v3.8h\n"
+    "ldr d26, [x20, x5]\n"
+    "ssubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v15.4s, v25.4h, v4.4h\n"
+    "ldr x21, [x8, #0xb0]\n"
+    "smlal2 v10.4s, v25.8h, v4.8h\n"
+    "ldr d25, [x19, x5]\n"
+    "ssubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "ldr x20, [x8, #0xb8]\n"
+    "smlal2 v10.4s, v24.8h, v2.8h\n"
+    "ldr x19, [x8, #0xc0]\n"
+    "smlal v20.4s, v24.4h, v0.4h\n"
+    "ldr q21, [x17, #0x0]\n"
+    "smlal2 v23.4s, v24.8h, v0.8h\n"
+    "ldr d24, [x9, x5]\n"
+    "ssubl v24.8h, v24.8b, v12.8b\n"
+    "smlal v20.4s, v29.4h, v4.4h\n"
+    "ldr q30, [x15, #0x0]\n"
+    "smlal2 v23.4s, v29.8h, v4.8h\n"
+    "ldr d29, [x10, x5]\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v28.4h, v5.4h\n"
+    "ldr q31, [x17, #0x10]\n"
+    "smlal2 v23.4s, v28.8h, v5.8h\n"
+    "ldr d28, [x27, x5]\n"
+    "add x17, x17, #0x20\n"
+    "smlal v15.4s, v27.4h, v5.4h\n"
+    "ldr q9, [x15, #0x10]\n"
+    "add x15, x15, #0x20\n"
+    "smlal2 v10.4s, v27.8h, v5.8h\n"
+    "ssubl v28.8h, v28.8b, v12.8b\n"
+    "smlal v20.4s, v27.4h, v3.4h\n"
+    "smlal2 v23.4s, v27.8h, v3.8h\n"
+    "ldr d27, [x28, x5]\n"
+    "ssubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v16.4s, v26.4h, v3.4h\n"
+    "smlal2 v22.4s, v26.8h, v3.8h\n"
+    "ldr d26, [x26, x5]\n"
+    "ssubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v15.4s, v25.4h, v6.4h\n"
+    "smlal2 v10.4s, v25.8h, v6.8h\n"
+    "smlal v16.4s, v25.4h, v0.4h\n"
+    "smlal2 v22.4s, v25.8h, v0.8h\n"
+    "ldr d25, [x25, x5]\n"
+    "ssubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v16.4s, v29.4h, v4.4h\n"
+    "smlal2 v22.4s, v29.8h, v4.8h\n"
+    "ldr d29, [x24, x5]\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v15.4s, v24.4h, v7.4h\n"
+    "smlal2 v10.4s, v24.8h, v7.8h\n"
+    "smlal v16.4s, v24.4h, v1.4h\n"
+    "smlal2 v22.4s, v24.8h, v1.8h\n"
+    "ldr d24, [x22, x5]\n"
+    "ssubl v24.8h, v24.8b, v12.8b\n"
+    "smlal v17.4s, v27.4h, v4.4h\n"
+    "smlal2 v18.4s, v27.8h, v4.8h\n"
+    "ldr d27, [x23, x5]\n"
+    "ssubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v20.4s, v28.4h, v7.4h\n"
+    "smlal2 v23.4s, v28.8h, v7.8h\n"
+    "smlal v17.4s, v28.4h, v1.4h\n"
+    "smlal2 v18.4s, v28.8h, v1.8h\n"
+    "smlal v16.4s, v25.4h, v6.4h\n"
+    "smlal2 v22.4s, v25.8h, v6.8h\n"
+    "ldr d25, [x20, x5]\n"
+    "ssubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v17.4s, v26.4h, v5.4h\n"
+    "smlal2 v18.4s, v26.8h, v5.8h\n"
+    "ldr d26, [x21, x5]\n"
+    "ssubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v20.4s, v29.4h, v8.4h\n"
+    "smlal2 v23.4s, v29.8h, v8.8h\n"
+    "smlal v17.4s, v29.4h, v2.4h\n"
+    "smlal2 v18.4s, v29.8h, v2.8h\n"
+    "ldr d29, [x19, x5]\n"
+    "add x5, x5, #0x8\n"
+    "smlal v16.4s, v27.4h, v7.4h\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal2 v22.4s, v27.8h, v7.8h\n"
+    "smlal v17.4s, v24.4h, v3.4h\n"
+    "smlal v16.4s, v24.4h, v5.4h\n"
+    "smlal2 v18.4s, v24.8h, v3.8h\n"
+    "sqrdmulh v15.4s, v15.4s, v21.4s\n"
+    "smlal2 v22.4s, v24.8h, v5.8h\n"
+    "smlal v17.4s, v26.4h, v7.4h\n"
+    "smlal2 v18.4s, v26.8h, v7.8h\n"
+    "smlal v16.4s, v25.4h, v8.4h\n"
+    "smlal2 v22.4s, v25.8h, v8.8h\n"
+    "smlal v17.4s, v25.4h, v6.4h\n"
+    "smlal2 v18.4s, v25.8h, v6.8h\n"
+    "and v26.16b, v15.16b, v30.16b\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "smlal v17.4s, v29.4h, v8.4h\n"
+    "smlal2 v18.4s, v29.8h, v8.8h\n"
+    "sqrdmulh v10.4s, v10.4s, v31.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v21.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v31.4s\n"
+    "sqrdmulh v16.4s, v16.4s, v21.4s\n"
+    "sqadd v15.4s, v15.4s, v26.4s\n"
+    "and v8.16b, v10.16b, v9.16b\n"
+    "sshr v8.4s, v8.4s, #0x1f\n"
+    "srshl v15.4s, v15.4s, v30.4s\n"
+    "and v4.16b, v20.16b, v30.16b\n"
+    "sshr v4.4s, v4.4s, #0x1f\n"
+    "and v2.16b, v23.16b, v9.16b\n"
+    "and v1.16b, v16.16b, v30.16b\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "add v15.4s, v15.4s, v11.4s\n"
+    "sqadd v10.4s, v10.4s, v8.4s\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+    "sqadd v20.4s, v20.4s, v4.4s\n"
+    "smin v15.4s, v15.4s, v14.4s\n"
+    "srshl v10.4s, v10.4s, v9.4s\n"
+    "sqadd v23.4s, v23.4s, v2.4s\n"
+    "smax v15.4s, v15.4s, v19.4s\n"
+    "srshl v20.4s, v20.4s, v30.4s\n"
+    "add v10.4s, v10.4s, v11.4s\n"
+    "srshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v16.4s, v16.4s, v1.4s\n"
+    "smin v10.4s, v10.4s, v14.4s\n"
+    "add v20.4s, v20.4s, v11.4s\n"
+    "add v23.4s, v23.4s, v11.4s\n"
+    "smax v10.4s, v10.4s, v19.4s\n"
+    "smin v20.4s, v20.4s, v14.4s\n"
+    "smin v23.4s, v23.4s, v14.4s\n"
+    "uzp1 v15.16b, v15.16b, v10.16b\n"
+    "smax v20.4s, v20.4s, v19.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "str d15, [x14, x7]\n"
+    "smax v23.4s, v23.4s, v19.4s\n"
+    "srshl v16.4s, v16.4s, v30.4s\n"
+    "and v24.16b, v22.16b, v9.16b\n"
+    "sshr v24.4s, v24.4s, #0x1f\n"
+    "uzp1 v20.16b, v20.16b, v23.16b\n"
+    "add v16.4s, v16.4s, v11.4s\n"
+    "sqrdmulh v17.4s, v17.4s, v21.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str d20, [x13, x7]\n"
+    "smin v16.4s, v16.4s, v14.4s\n"
+    "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+    "sqadd v22.4s, v22.4s, v24.4s\n"
+    "and v2.16b, v17.16b, v30.16b\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "smax v16.4s, v16.4s, v19.4s\n"
+    "srshl v22.4s, v22.4s, v9.4s\n"
+    "and v31.16b, v18.16b, v9.16b\n"
+    "sshr v31.4s, v31.4s, #0x1f\n"
+    "add v22.4s, v22.4s, v11.4s\n"
+    "sqadd v17.4s, v17.4s, v2.4s\n"
+    "smin v22.4s, v22.4s, v14.4s\n"
+    "srshl v17.4s, v17.4s, v30.4s\n"
+    "sqadd v18.4s, v18.4s, v31.4s\n"
+    "smax v22.4s, v22.4s, v19.4s\n"
+    "uzp1 v16.16b, v16.16b, v22.16b\n"
+    "add v17.4s, v17.4s, v11.4s\n"
+    "srshl v18.4s, v18.4s, v9.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "str d16, [x12, x7]\n"
+    "smin v17.4s, v17.4s, v14.4s\n"
+    "add v18.4s, v18.4s, v11.4s\n"
+    "smax v17.4s, v17.4s, v19.4s\n"
+    "smin v18.4s, v18.4s, v14.4s\n"
+    "smax v18.4s, v18.4s, v19.4s\n"
+    "uzp1 v17.16b, v17.16b, v18.16b\n"
+    "uzp1 v17.16b, v17.16b, v17.16b\n"
+    "str d17, [x11, x7]\n"
+    "add x7, x7, #0x8\n"
+    "beq 88f\n"
+    "add x6, x6, #0x48\n"
+    "3:"  // Oddments
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "tbz x4, #2, 5f\n"
+    "ld1 { v15.4s }, [x19], #0x10\n"
+    "tbz x4, #1, 4f\n"
+    "ld1 { v10.d }[0], [x19], #0x8\n"
+    "tbz x4, #0, 7f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 7f\n"
+    "4:"  // Oddments: Load bias: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 7f\n"
+    "ld1 { v10.s }[0], [x19]\n"
+    "b 7f\n"
+    "5:"  // Oddments: Load bias: Bit 2: Unset
+    "tbz x4, #1, 6f\n"
+    "ld1 { v15.d }[0], [x19], #0x8\n"
+    "tbz x4, #0, 7f\n"
+    "ld1 { v15.s }[2], [x19]\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 7f\n"
+    "ld1 { v15.s }[0], [x19]\n"
+    "7:"  // Oddments: Load bias: Bit 2: End
+    "mov v20.16b, v15.16b\n"
+    "ldr d0, [x6, #0x0]\n"
+    "mov v23.16b, v10.16b\n"
+    "ldr d1, [x6, #0x8]\n"
+    "mov v16.16b, v15.16b\n"
+    "ldr d2, [x6, #0x10]\n"
+    "mov v22.16b, v10.16b\n"
+    "ldr d3, [x6, #0x18]\n"
+    "mov v17.16b, v15.16b\n"
+    "ldr d4, [x6, #0x20]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "mov v18.16b, v10.16b\n"
+    "ldr d5, [x6, #0x28]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "ldr d6, [x6, #0x30]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "ldr d7, [x6, #0x38]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "ldr d8, [x6, #0x40]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "ldp x26, x25, [x8, #0x0]\n"
+    "ssubl v5.8h, v5.8b, v13.8b\n"
+    "ldp x24, x23, [x8, #0x10]\n"
+    "ssubl v6.8h, v6.8b, v13.8b\n"
+    "ssubl v7.8h, v7.8b, v13.8b\n"
+    "ldp x22, x21, [x8, #0x20]\n"
+    "ssubl v8.8h, v8.8b, v13.8b\n"
+    "ldp x20, x19, [x8, #0x30]\n"
+    "add x26, x26, x5\n"
+    "add x25, x25, x5\n"
+    "add x24, x24, x5\n"
+    "add x23, x23, x5\n"
+    "add x22, x22, x5\n"
+    "add x21, x21, x5\n"
+    "add x20, x20, x5\n"
+    "add x19, x19, x5\n"
+    "tbz x4, #2, 9f\n"
+    "ld1 { v31.s }[0], [x26], #0x4\n"
+    "ld1 { v30.s }[0], [x25], #0x4\n"
+    "ld1 { v29.s }[0], [x24], #0x4\n"
+    "ld1 { v28.s }[0], [x23], #0x4\n"
+    "ld1 { v27.s }[0], [x22], #0x4\n"
+    "ld1 { v26.s }[0], [x21], #0x4\n"
+    "ld1 { v25.s }[0], [x20], #0x4\n"
+    "ld1 { v24.s }[0], [x19], #0x4\n"
+    "tbz x4, #1, 8f\n"
+    "ld1 { v31.h }[2], [x26], #0x2\n"
+    "ld1 { v30.h }[2], [x25], #0x2\n"
+    "ld1 { v29.h }[2], [x24], #0x2\n"
+    "ld1 { v28.h }[2], [x23], #0x2\n"
+    "ld1 { v27.h }[2], [x22], #0x2\n"
+    "ld1 { v26.h }[2], [x21], #0x2\n"
+    "ld1 { v25.h }[2], [x20], #0x2\n"
+    "ld1 { v24.h }[2], [x19], #0x2\n"
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[6], [x26]\n"
+    "ld1 { v30.b }[6], [x25]\n"
+    "ld1 { v29.b }[6], [x24]\n"
+    "ld1 { v28.b }[6], [x23]\n"
+    "ld1 { v27.b }[6], [x22]\n"
+    "ld1 { v26.b }[6], [x21]\n"
+    "ld1 { v25.b }[6], [x20]\n"
+    "ld1 { v24.b }[6], [x19]\n"
+    "b 11f\n"
+    "8:"  // Oddments: Initial loads: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[4], [x26]\n"
+    "ld1 { v30.b }[4], [x25]\n"
+    "ld1 { v29.b }[4], [x24]\n"
+    "ld1 { v28.b }[4], [x23]\n"
+    "ld1 { v27.b }[4], [x22]\n"
+    "ld1 { v26.b }[4], [x21]\n"
+    "ld1 { v25.b }[4], [x20]\n"
+    "ld1 { v24.b }[4], [x19]\n"
+    "b 11f\n"
+    "9:"  // Oddments: Initial loads: Bit 2: Unset
+    "tbz x4, #1, 10f\n"
+    "ld1 { v31.h }[0], [x26], #0x2\n"
+    "ld1 { v30.h }[0], [x25], #0x2\n"
+    "ld1 { v29.h }[0], [x24], #0x2\n"
+    "ld1 { v28.h }[0], [x23], #0x2\n"
+    "ld1 { v27.h }[0], [x22], #0x2\n"
+    "ld1 { v26.h }[0], [x21], #0x2\n"
+    "ld1 { v25.h }[0], [x20], #0x2\n"
+    "ld1 { v24.h }[0], [x19], #0x2\n"
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[2], [x26]\n"
+    "ld1 { v30.b }[2], [x25]\n"
+    "ld1 { v29.b }[2], [x24]\n"
+    "ld1 { v28.b }[2], [x23]\n"
+    "ld1 { v27.b }[2], [x22]\n"
+    "ld1 { v26.b }[2], [x21]\n"
+    "ld1 { v25.b }[2], [x20]\n"
+    "ld1 { v24.b }[2], [x19]\n"
+    "b 11f\n"
+    "10:"  // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[0], [x26]\n"
+    "ld1 { v30.b }[0], [x25]\n"
+    "ld1 { v29.b }[0], [x24]\n"
+    "ld1 { v28.b }[0], [x23]\n"
+    "ld1 { v27.b }[0], [x22]\n"
+    "ld1 { v26.b }[0], [x21]\n"
+    "ld1 { v25.b }[0], [x20]\n"
+    "ld1 { v24.b }[0], [x19]\n"
+    "11:"  // Oddments: Initial loads: Bit 2: End
+    "ldr x23, [x8, #0x40]\n"
+    "ssubl v31.8h, v31.8b, v12.8b\n"
+    "smlal v15.4s, v31.4h, v8.4h\n"
+    "ssubl v30.8h, v30.8b, v12.8b\n"
+    "smlal2 v10.4s, v31.8h, v8.8h\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v31.4h, v6.4h\n"
+    "ssubl v28.8h, v28.8b, v12.8b\n"
+    "smlal2 v23.4s, v31.8h, v6.8h\n"
+    "ssubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v16.4s, v31.4h, v2.4h\n"
+    "ssubl v26.8h, v26.8b, v12.8b\n"
+    "smlal2 v22.4s, v31.8h, v2.8h\n"
+    "ssubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v17.4s, v31.4h, v0.4h\n"
+    "ssubl v24.8h, v24.8b, v12.8b\n"
+    "smlal2 v18.4s, v31.8h, v0.8h\n"
+    "add x23, x23, x5\n"
+    "smlal v15.4s, v30.4h, v0.4h\n"
+    "smlal2 v10.4s, v30.8h, v0.8h\n"
+    "smlal v20.4s, v28.4h, v1.4h\n"
+    "smlal2 v23.4s, v28.8h, v1.8h\n"
+    "smlal v15.4s, v29.4h, v1.4h\n"
+    "smlal2 v10.4s, v29.8h, v1.8h\n"
+    "smlal v20.4s, v27.4h, v2.4h\n"
+    "smlal2 v23.4s, v27.8h, v2.8h\n"
+    "smlal v15.4s, v26.4h, v3.4h\n"
+    "smlal2 v10.4s, v26.8h, v3.8h\n"
+    "smlal v20.4s, v24.4h, v0.4h\n"
+    "smlal2 v23.4s, v24.8h, v0.8h\n"
+    "smlal v15.4s, v25.4h, v4.4h\n"
+    "smlal2 v10.4s, v25.8h, v4.8h\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "smlal2 v10.4s, v24.8h, v2.8h\n"
+    "tbz x4, #2, 13f\n"
+    "ld1 { v29.s }[0], [x23], #0x4\n"
+    "tbz x4, #1, 12f\n"
+    "ld1 { v29.h }[2], [x23], #0x2\n"
+    "tbz x4, #0, 15f\n"
+    "ld1 { v29.b }[6], [x23]\n"
+    "b 15f\n"
+    "12:"  // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 15f\n"
+    "ld1 { v29.b }[4], [x23]\n"
+    "b 15f\n"
+    "13:"  // Oddments: Load (1, 3): Bit 2: Unset
+    "tbz x4, #1, 14f\n"
+    "ld1 { v29.h }[0], [x23], #0x2\n"
+    "tbz x4, #0, 15f\n"
+    "ld1 { v29.b }[2], [x23]\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 15f\n"
+    "ld1 { v29.b }[0], [x23]\n"
+    "15:"  // Oddments: Load (1, 3): Bit 2: End
+    "ldr x22, [x8, #0x48]\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v29.4h, v4.4h\n"
+    "smlal2 v23.4s, v29.8h, v4.8h\n"
+    "add x22, x22, x5\n"
+    "tbz x4, #2, 17f\n"
+    "ld1 { v28.s }[0], [x22], #0x4\n"
+    "tbz x4, #1, 16f\n"
+    "ld1 { v28.h }[2], [x22], #0x2\n"
+    "tbz x4, #0, 19f\n"
+    "ld1 { v28.b }[6], [x22]\n"
+    "b 19f\n"
+    "16:"  // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 19f\n"
+    "ld1 { v28.b }[4], [x22]\n"
+    "b 19f\n"
+    "17:"  // Oddments: Load (1, 4): Bit 2: Unset
+    "tbz x4, #1, 18f\n"
+    "ld1 { v28.h }[0], [x22], #0x2\n"
+    "tbz x4, #0, 19f\n"
+    "ld1 { v28.b }[2], [x22]\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 19f\n"
+    "ld1 { v28.b }[0], [x22]\n"
+    "19:"  // Oddments: Load (1, 4): Bit 2: End
+    "ldr x21, [x8, #0x50]\n"
+    "ssubl v28.8h, v28.8b, v12.8b\n"
+    "smlal v20.4s, v28.4h, v5.4h\n"
+    "smlal2 v23.4s, v28.8h, v5.8h\n"
+    "add x21, x21, x5\n"
+    "tbz x4, #2, 21f\n"
+    "ld1 { v27.s }[0], [x21], #0x4\n"
+    "tbz x4, #1, 20f\n"
+    "ld1 { v27.h }[2], [x21], #0x2\n"
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[6], [x21]\n"
+    "b 23f\n"
+    "20:"  // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[4], [x21]\n"
+    "b 23f\n"
+    "21:"  // Oddments: Load (1, 2): Bit 2: Unset
+    "tbz x4, #1, 22f\n"
+    "ld1 { v27.h }[0], [x21], #0x2\n"
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[2], [x21]\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[0], [x21]\n"
+    "23:"  // Oddments: Load (1, 2): Bit 2: End
+    "ldr x20, [x8, #0x58]\n"
+    "ssubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v15.4s, v27.4h, v5.4h\n"
+    "smlal2 v10.4s, v27.8h, v5.8h\n"
+    "add x20, x20, x5\n"
+    "smlal v20.4s, v27.4h, v3.4h\n"
+    "smlal2 v23.4s, v27.8h, v3.8h\n"
+    "tbz x4, #2, 25f\n"
+    "ld1 { v26.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 24f\n"
+    "ld1 { v26.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 27f\n"
+    "ld1 { v26.b }[6], [x20]\n"
+    "b 27f\n"
+    "24:"  // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 27f\n"
+    "ld1 { v26.b }[4], [x20]\n"
+    "b 27f\n"
+    "25:"  // Oddments: Load (3, 0): Bit 2: Unset
+    "tbz x4, #1, 26f\n"
+    "ld1 { v26.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 27f\n"
+    "ld1 { v26.b }[2], [x20]\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 27f\n"
+    "ld1 { v26.b }[0], [x20]\n"
+    "27:"  // Oddments: Load (3, 0): Bit 2: End
+    "ldr x19, [x8, #0x60]\n"
+    "ssubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v16.4s, v26.4h, v3.4h\n"
+    "smlal2 v22.4s, v26.8h, v3.8h\n"
+    "add x19, x19, x5\n"
+    "tbz x4, #2, 29f\n"
+    "ld1 { v25.s }[0], [x19], #0x4\n"
+    "tbz x4, #1, 28f\n"
+    "ld1 { v25.h }[2], [x19], #0x2\n"
+    "tbz x4, #0, 31f\n"
+    "ld1 { v25.b }[6], [x19]\n"
+    "b 31f\n"
+    "28:"  // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 31f\n"
+    "ld1 { v25.b }[4], [x19]\n"
+    "b 31f\n"
+    "29:"  // Oddments: Load (2, 0): Bit 2: Unset
+    "tbz x4, #1, 30f\n"
+    "ld1 { v25.h }[0], [x19], #0x2\n"
+    "tbz x4, #0, 31f\n"
+    "ld1 { v25.b }[2], [x19]\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 31f\n"
+    "ld1 { v25.b }[0], [x19]\n"
+    "31:"  // Oddments: Load (2, 0): Bit 2: End
+    "ldr x10, [x8, #0x68]\n"
+    "ssubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v15.4s, v25.4h, v6.4h\n"
+    "smlal2 v10.4s, v25.8h, v6.8h\n"
+    "add x10, x10, x5\n"
+    "smlal v16.4s, v25.4h, v0.4h\n"
+    "smlal2 v22.4s, v25.8h, v0.8h\n"
+    "tbz x4, #2, 33f\n"
+    "ld1 { v29.s }[0], [x10], #0x4\n"
+    "tbz x4, #1, 32f\n"
+    "ld1 { v29.h }[2], [x10], #0x2\n"
+    "tbz x4, #0, 35f\n"
+    "ld1 { v29.b }[6], [x10]\n"
+    "b 35f\n"
+    "32:"  // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 35f\n"
+    "ld1 { v29.b }[4], [x10]\n"
+    "b 35f\n"
+    "33:"  // Oddments: Load (3, 1): Bit 2: Unset
+    "tbz x4, #1, 34f\n"
+    "ld1 { v29.h }[0], [x10], #0x2\n"
+    "tbz x4, #0, 35f\n"
+    "ld1 { v29.b }[2], [x10]\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 35f\n"
+    "ld1 { v29.b }[0], [x10]\n"
+    "35:"  // Oddments: Load (3, 1): Bit 2: End
+    "ldr x9, [x8, #0x70]\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v16.4s, v29.4h, v4.4h\n"
+    "smlal2 v22.4s, v29.8h, v4.8h\n"
+    "add x9, x9, x5\n"
+    "tbz x4, #2, 37f\n"
+    "ld1 { v24.s }[0], [x9], #0x4\n"
+    "tbz x4, #1, 36f\n"
+    "ld1 { v24.h }[2], [x9], #0x2\n"
+    "tbz x4, #0, 39f\n"
+    "ld1 { v24.b }[6], [x9]\n"
+    "b 39f\n"
+    "36:"  // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 39f\n"
+    "ld1 { v24.b }[4], [x9]\n"
+    "b 39f\n"
+    "37:"  // Oddments: Load (2, 1): Bit 2: Unset
+    "tbz x4, #1, 38f\n"
+    "ld1 { v24.h }[0], [x9], #0x2\n"
+    "tbz x4, #0, 39f\n"
+    "ld1 { v24.b }[2], [x9]\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 39f\n"
+    "ld1 { v24.b }[0], [x9]\n"
+    "39:"  // Oddments: Load (2, 1): Bit 2: End
+    "ldr x28, [x8, #0x78]\n"
+    "ssubl v24.8h, v24.8b, v12.8b\n"
+    "smlal v15.4s, v24.4h, v7.4h\n"
+    "smlal2 v10.4s, v24.8h, v7.8h\n"
+    "add x28, x28, x5\n"
+    "smlal v16.4s, v24.4h, v1.4h\n"
+    "smlal2 v22.4s, v24.8h, v1.8h\n"
+    "tbz x4, #2, 41f\n"
+    "ld1 { v27.s }[0], [x28], #0x4\n"
+    "tbz x4, #1, 40f\n"
+    "ld1 { v27.h }[2], [x28], #0x2\n"
+    "tbz x4, #0, 43f\n"
+    "ld1 { v27.b }[6], [x28]\n"
+    "b 43f\n"
+    "40:"  // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 43f\n"
+    "ld1 { v27.b }[4], [x28]\n"
+    "b 43f\n"
+    "41:"  // Oddments: Load (3, 3): Bit 2: Unset
+    "tbz x4, #1, 42f\n"
+    "ld1 { v27.h }[0], [x28], #0x2\n"
+    "tbz x4, #0, 43f\n"
+    "ld1 { v27.b }[2], [x28]\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 43f\n"
+    "ld1 { v27.b }[0], [x28]\n"
+    "43:"  // Oddments: Load (3, 3): Bit 2: End
+    "ldr x27, [x8, #0x80]\n"
+    "ssubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v17.4s, v27.4h, v4.4h\n"
+    "smlal2 v18.4s, v27.8h, v4.8h\n"
+    "add x27, x27, x5\n"
+    "tbz x4, #2, 45f\n"
+    "ld1 { v28.s }[0], [x27], #0x4\n"
+    "tbz x4, #1, 44f\n"
+    "ld1 { v28.h }[2], [x27], #0x2\n"
+    "tbz x4, #0, 47f\n"
+    "ld1 { v28.b }[6], [x27]\n"
+    "b 47f\n"
+    "44:"  // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 47f\n"
+    "ld1 { v28.b }[4], [x27]\n"
+    "b 47f\n"
+    "45:"  // Oddments: Load (2, 3): Bit 2: Unset
+    "tbz x4, #1, 46f\n"
+    "ld1 { v28.h }[0], [x27], #0x2\n"
+    "tbz x4, #0, 47f\n"
+    "ld1 { v28.b }[2], [x27]\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 47f\n"
+    "ld1 { v28.b }[0], [x27]\n"
+    "47:"  // Oddments: Load (2, 3): Bit 2: End
+    "ldr x26, [x8, #0x88]\n"
+    "ssubl v28.8h, v28.8b, v12.8b\n"
+    "smlal v20.4s, v28.4h, v7.4h\n"
+    "smlal2 v23.4s, v28.8h, v7.8h\n"
+    "add x26, x26, x5\n"
+    "smlal v17.4s, v28.4h, v1.4h\n"
+    "smlal2 v18.4s, v28.8h, v1.8h\n"
+    "tbz x4, #2, 49f\n"
+    "ld1 { v26.s }[0], [x26], #0x4\n"
+    "tbz x4, #1, 48f\n"
+    "ld1 { v26.h }[2], [x26], #0x2\n"
+    "tbz x4, #0, 51f\n"
+    "ld1 { v26.b }[6], [x26]\n"
+    "b 51f\n"
+    "48:"  // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 51f\n"
+    "ld1 { v26.b }[4], [x26]\n"
+    "b 51f\n"
+    "49:"  // Oddments: Load (3, 4): Bit 2: Unset
+    "tbz x4, #1, 50f\n"
+    "ld1 { v26.h }[0], [x26], #0x2\n"
+    "tbz x4, #0, 51f\n"
+    "ld1 { v26.b }[2], [x26]\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 51f\n"
+    "ld1 { v26.b }[0], [x26]\n"
+    "51:"  // Oddments: Load (3, 4): Bit 2: End
+    "ldr x25, [x8, #0x90]\n"
+    "ssubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v17.4s, v26.4h, v5.4h\n"
+    "smlal2 v18.4s, v26.8h, v5.8h\n"
+    "add x25, x25, x5\n"
+    "tbz x4, #2, 53f\n"
+    "ld1 { v25.s }[0], [x25], #0x4\n"
+    "tbz x4, #1, 52f\n"
+    "ld1 { v25.h }[2], [x25], #0x2\n"
+    "tbz x4, #0, 55f\n"
+    "ld1 { v25.b }[6], [x25]\n"
+    "b 55f\n"
+    "52:"  // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 55f\n"
+    "ld1 { v25.b }[4], [x25]\n"
+    "b 55f\n"
+    "53:"  // Oddments: Load (4, 0): Bit 2: Unset
+    "tbz x4, #1, 54f\n"
+    "ld1 { v25.h }[0], [x25], #0x2\n"
+    "tbz x4, #0, 55f\n"
+    "ld1 { v25.b }[2], [x25]\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 55f\n"
+    "ld1 { v25.b }[0], [x25]\n"
+    "55:"  // Oddments: Load (4, 0): Bit 2: End
+    "ldr x24, [x8, #0x98]\n"
+    "ssubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v16.4s, v25.4h, v6.4h\n"
+    "smlal2 v22.4s, v25.8h, v6.8h\n"
+    "add x24, x24, x5\n"
+    "tbz x4, #2, 57f\n"
+    "ld1 { v29.s }[0], [x24], #0x4\n"
+    "tbz x4, #1, 56f\n"
+    "ld1 { v29.h }[2], [x24], #0x2\n"
+    "tbz x4, #0, 59f\n"
+    "ld1 { v29.b }[6], [x24]\n"
+    "b 59f\n"
+    "56:"  // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 59f\n"
+    "ld1 { v29.b }[4], [x24]\n"
+    "b 59f\n"
+    "57:"  // Oddments: Load (2, 4): Bit 2: Unset
+    "tbz x4, #1, 58f\n"
+    "ld1 { v29.h }[0], [x24], #0x2\n"
+    "tbz x4, #0, 59f\n"
+    "ld1 { v29.b }[2], [x24]\n"
+    "b 59f\n"
+    "58:"  // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 59f\n"
+    "ld1 { v29.b }[0], [x24]\n"
+    "59:"  // Oddments: Load (2, 4): Bit 2: End
+    "ldr x23, [x8, #0xa0]\n"
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v29.4h, v8.4h\n"
+    "smlal2 v23.4s, v29.8h, v8.8h\n"
+    "add x23, x23, x5\n"
+    "smlal v17.4s, v29.4h, v2.4h\n"
+    "smlal2 v18.4s, v29.8h, v2.8h\n"
+    "tbz x4, #2, 61f\n"
+    "ld1 { v27.s }[0], [x23], #0x4\n"
+    "tbz x4, #1, 60f\n"
+    "ld1 { v27.h }[2], [x23], #0x2\n"
+    "tbz x4, #0, 63f\n"
+    "ld1 { v27.b }[6], [x23]\n"
+    "b 63f\n"
+    "60:"  // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 63f\n"
+    "ld1 { v27.b }[4], [x23]\n"
+    "b 63f\n"
+    "61:"  // Oddments: Load (4, 1): Bit 2: Unset
+    "tbz x4, #1, 62f\n"
+    "ld1 { v27.h }[0], [x23], #0x2\n"
+    "tbz x4, #0, 63f\n"
+    "ld1 { v27.b }[2], [x23]\n"
+    "b 63f\n"
+    "62:"  // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 63f\n"
+    "ld1 { v27.b }[0], [x23]\n"
+    "63:"  // Oddments: Load (4, 1): Bit 2: End
+    "ldr x22, [x8, #0xa8]\n"
+    "ssubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v16.4s, v27.4h, v7.4h\n"
+    "smlal2 v22.4s, v27.8h, v7.8h\n"
+    "add x22, x22, x5\n"
+    "tbz x4, #2, 65f\n"
+    "ld1 { v24.s }[0], [x22], #0x4\n"
+    "tbz x4, #1, 64f\n"
+    "ld1 { v24.h }[2], [x22], #0x2\n"
+    "tbz x4, #0, 67f\n"
+    "ld1 { v24.b }[6], [x22]\n"
+    "b 67f\n"
+    "64:"  // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 67f\n"
+    "ld1 { v24.b }[4], [x22]\n"
+    "b 67f\n"
+    "65:"  // Oddments: Load (3, 2): Bit 2: Unset
+    "tbz x4, #1, 66f\n"
+    "ld1 { v24.h }[0], [x22], #0x2\n"
+    "tbz x4, #0, 67f\n"
+    "ld1 { v24.b }[2], [x22]\n"
+    "b 67f\n"
+    "66:"  // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 67f\n"
+    "ld1 { v24.b }[0], [x22]\n"
+    "67:"  // Oddments: Load (3, 2): Bit 2: End
+    "ldr x21, [x8, #0xb0]\n"
+    "ssubl v24.8h, v24.8b, v12.8b\n"
+    "smlal v16.4s, v24.4h, v5.4h\n"
+    "smlal2 v22.4s, v24.8h, v5.8h\n"
+    "add x21, x21, x5\n"
+    "smlal v17.4s, v24.4h, v3.4h\n"
+    "smlal2 v18.4s, v24.8h, v3.8h\n"
+    "tbz x4, #2, 69f\n"
+    "ld1 { v26.s }[0], [x21], #0x4\n"
+    "tbz x4, #1, 68f\n"
+    "ld1 { v26.h }[2], [x21], #0x2\n"
+    "tbz x4, #0, 71f\n"
+    "ld1 { v26.b }[6], [x21]\n"
+    "b 71f\n"
+    "68:"  // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 71f\n"
+    "ld1 { v26.b }[4], [x21]\n"
+    "b 71f\n"
+    "69:"  // Oddments: Load (4, 3): Bit 2: Unset
+    "tbz x4, #1, 70f\n"
+    "ld1 { v26.h }[0], [x21], #0x2\n"
+    "tbz x4, #0, 71f\n"
+    "ld1 { v26.b }[2], [x21]\n"
+    "b 71f\n"
+    "70:"  // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 71f\n"
+    "ld1 { v26.b }[0], [x21]\n"
+    "71:"  // Oddments: Load (4, 3): Bit 2: End
+    "ldr x20, [x8, #0xb8]\n"
+    "ssubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v17.4s, v26.4h, v7.4h\n"
+    "smlal2 v18.4s, v26.8h, v7.8h\n"
+    "add x20, x20, x5\n"
+    "tbz x4, #2, 73f\n"
+    "ld1 { v25.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 72f\n"
+    "ld1 { v25.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 75f\n"
+    "ld1 { v25.b }[6], [x20]\n"
+    "b 75f\n"
+    "72:"  // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 75f\n"
+    "ld1 { v25.b }[4], [x20]\n"
+    "b 75f\n"
+    "73:"  // Oddments: Load (4, 2): Bit 2: Unset
+    "tbz x4, #1, 74f\n"
+    "ld1 { v25.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 75f\n"
+    "ld1 { v25.b }[2], [x20]\n"
+    "b 75f\n"
+    "74:"  // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 75f\n"
+    "ld1 { v25.b }[0], [x20]\n"
+    "75:"  // Oddments: Load (4, 2): Bit 2: End
+    "ldr x19, [x8, #0xc0]\n"
+    "ssubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v16.4s, v25.4h, v8.4h\n"
+    "smlal2 v22.4s, v25.8h, v8.8h\n"
+    "add x19, x19, x5\n"
+    "smlal v17.4s, v25.4h, v6.4h\n"
+    "smlal2 v18.4s, v25.8h, v6.8h\n"
+    "tbz x4, #2, 77f\n"
+    "ld1 { v29.s }[0], [x19], #0x4\n"
+    "tbz x4, #1, 76f\n"
+    "ld1 { v29.h }[2], [x19], #0x2\n"
+    "tbz x4, #0, 79f\n"
+    "ld1 { v29.b }[6], [x19]\n"
+    "b 79f\n"
+    "76:"  // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 79f\n"
+    "ld1 { v29.b }[4], [x19]\n"
+    "b 79f\n"
+    "77:"  // Oddments: Load (4, 4): Bit 2: Unset
+    "tbz x4, #1, 78f\n"
+    "ld1 { v29.h }[0], [x19], #0x2\n"
+    "tbz x4, #0, 79f\n"
+    "ld1 { v29.b }[2], [x19]\n"
+    "b 79f\n"
+    "78:"  // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 79f\n"
+    "ld1 { v29.b }[0], [x19]\n"
+    "79:"  // Oddments: Load (4, 4): Bit 2: End
+    "ssubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v17.4s, v29.4h, v8.4h\n"
+    "smlal2 v18.4s, v29.8h, v8.8h\n"
+    "tbz x4, #2, 81f\n"
+    "ld1 { v21.4s }, [x17], #0x10\n"
+    "ld1 { v30.4s }, [x15], #0x10\n"
+    "tbz x4, #1, 80f\n"
+    "ld1 { v31.d }[0], [x17], #0x8\n"
+    "ld1 { v9.d }[0], [x15], #0x8\n"
+    "tbz x4, #0, 83f\n"
+    "ld1 { v31.s }[2], [x17]\n"
+    "ld1 { v9.s }[2], [x15]\n"
+    "b 83f\n"
+    "80:"  // Oddments: Load requant params: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 83f\n"
+    "ld1 { v31.s }[0], [x17]\n"
+    "ld1 { v9.s }[0], [x15]\n"
+    "b 83f\n"
+    "81:"  // Oddments: Load requant params: Bit 2: Unset
+    "tbz x4, #1, 82f\n"
+    "ld1 { v21.d }[0], [x17], #0x8\n"
+    "ld1 { v30.d }[0], [x15], #0x8\n"
+    "tbz x4, #0, 83f\n"
+    "ld1 { v21.s }[2], [x17]\n"
+    "ld1 { v30.s }[2], [x15]\n"
+    "b 83f\n"
+    "82:"  // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 83f\n"
+    "ld1 { v21.s }[0], [x17]\n"
+    "ld1 { v30.s }[0], [x15]\n"
+    "83:"  // Oddments: Load requant params: Bit 2: End
+    "sqrdmulh v15.4s, v15.4s, v21.4s\n"
+    "add x14, x14, x7\n"
+    "sqrdmulh v10.4s, v10.4s, v31.4s\n"
+    "add x13, x13, x7\n"
+    "sqrdmulh v20.4s, v20.4s, v21.4s\n"
+    "add x12, x12, x7\n"
+    "sqrdmulh v23.4s, v23.4s, v31.4s\n"
+    "add x11, x11, x7\n"
+    "sqrdmulh v16.4s, v16.4s, v21.4s\n"
+    "and v26.16b, v15.16b, v30.16b\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "and v8.16b, v10.16b, v9.16b\n"
+    "and v4.16b, v20.16b, v30.16b\n"
+    "sshr v8.4s, v8.4s, #0x1f\n"
+    "and v2.16b, v23.16b, v9.16b\n"
+    "and v1.16b, v16.16b, v30.16b\n"
+    "sshr v4.4s, v4.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "sqadd v15.4s, v15.4s, v26.4s\n"
+    "sqrdmulh v17.4s, v17.4s, v21.4s\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+    "sqadd v10.4s, v10.4s, v8.4s\n"
+    "sqadd v20.4s, v20.4s, v4.4s\n"
+    "srshl v15.4s, v15.4s, v30.4s\n"
+    "sqadd v23.4s, v23.4s, v2.4s\n"
+    "srshl v10.4s, v10.4s, v9.4s\n"
+    "srshl v20.4s, v20.4s, v30.4s\n"
+    "add v15.4s, v15.4s, v11.4s\n"
+    "srshl v23.4s, v23.4s, v9.4s\n"
+    "add v10.4s, v10.4s, v11.4s\n"
+    "smin v15.4s, v15.4s, v14.4s\n"
+    "add v20.4s, v20.4s, v11.4s\n"
+    "smin v10.4s, v10.4s, v14.4s\n"
+    "smax v15.4s, v15.4s, v19.4s\n"
+    "smin v20.4s, v20.4s, v14.4s\n"
+    "smax v10.4s, v10.4s, v19.4s\n"
+    "add v23.4s, v23.4s, v11.4s\n"
+    "smax v20.4s, v20.4s, v19.4s\n"
+    "uzp1 v15.16b, v15.16b, v10.16b\n"
+    "smin v23.4s, v23.4s, v14.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "sqadd v16.4s, v16.4s, v1.4s\n"
+    "smax v23.4s, v23.4s, v19.4s\n"
+    "and v24.16b, v22.16b, v9.16b\n"
+    "sshr v24.4s, v24.4s, #0x1f\n"
+    "uzp1 v20.16b, v20.16b, v23.16b\n"
+    "srshl v16.4s, v16.4s, v30.4s\n"
+    "and v2.16b, v17.16b, v30.16b\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "add v16.4s, v16.4s, v11.4s\n"
+    "sqadd v22.4s, v22.4s, v24.4s\n"
+    "and v31.16b, v18.16b, v9.16b\n"
+    "sshr v31.4s, v31.4s, #0x1f\n"
+    "smin v16.4s, v16.4s, v14.4s\n"
+    "srshl v22.4s, v22.4s, v9.4s\n"
+    "sqadd v17.4s, v17.4s, v2.4s\n"
+    "smax v16.4s, v16.4s, v19.4s\n"
+    "add v22.4s, v22.4s, v11.4s\n"
+    "srshl v17.4s, v17.4s, v30.4s\n"
+    "sqadd v18.4s, v18.4s, v31.4s\n"
+    "smin v22.4s, v22.4s, v14.4s\n"
+    "add v17.4s, v17.4s, v11.4s\n"
+    "srshl v18.4s, v18.4s, v9.4s\n"
+    "smax v22.4s, v22.4s, v19.4s\n"
+    "smin v17.4s, v17.4s, v14.4s\n"
+    "uzp1 v16.16b, v16.16b, v22.16b\n"
+    "add v18.4s, v18.4s, v11.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "smax v17.4s, v17.4s, v19.4s\n"
+    "smin v18.4s, v18.4s, v14.4s\n"
+    "smax v18.4s, v18.4s, v19.4s\n"
+    "uzp1 v17.16b, v17.16b, v18.16b\n"
+    "uzp1 v17.16b, v17.16b, v17.16b\n"
+    "tbz x4, #2, 85f\n"
+    "st1 { v15.s }[0], [x14], #0x4\n"
+    "st1 { v20.s }[0], [x13], #0x4\n"
+    "st1 { v16.s }[0], [x12], #0x4\n"
+    "st1 { v17.s }[0], [x11], #0x4\n"
+    "tbz x4, #1, 84f\n"
+    "st1 { v15.h }[2], [x14], #0x2\n"
+    "st1 { v20.h }[2], [x13], #0x2\n"
+    "st1 { v16.h }[2], [x12], #0x2\n"
+    "st1 { v17.h }[2], [x11], #0x2\n"
+    "tbz x4, #0, 87f\n"
+    "st1 { v15.b }[6], [x14], #0x1\n"
+    "st1 { v20.b }[6], [x13], #0x1\n"
+    "st1 { v16.b }[6], [x12], #0x1\n"
+    "st1 { v17.b }[6], [x11], #0x1\n"
+    "b 87f\n"
+    "84:"  // Oddments: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 87f\n"
+    "st1 { v15.b }[4], [x14], #0x1\n"
+    "st1 { v20.b }[4], [x13], #0x1\n"
+    "st1 { v16.b }[4], [x12], #0x1\n"
+    "st1 { v17.b }[4], [x11], #0x1\n"
+    "b 87f\n"
+    "85:"  // Oddments: Bit 2: Unset
+    "tbz x4, #1, 86f\n"
+    "st1 { v15.h }[0], [x14], #0x2\n"
+    "st1 { v20.h }[0], [x13], #0x2\n"
+    "st1 { v16.h }[0], [x12], #0x2\n"
+    "st1 { v17.h }[0], [x11], #0x2\n"
+    "tbz x4, #0, 87f\n"
+    "st1 { v15.b }[2], [x14], #0x1\n"
+    "st1 { v20.b }[2], [x13], #0x1\n"
+    "st1 { v16.b }[2], [x12], #0x1\n"
+    "st1 { v17.b }[2], [x11], #0x1\n"
+    "b 87f\n"
+    "86:"  // Oddments: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 87f\n"
+    "st1 { v15.b }[0], [x14], #0x1\n"
+    "st1 { v20.b }[0], [x13], #0x1\n"
+    "st1 { v16.b }[0], [x12], #0x1\n"
+    "st1 { v17.b }[0], [x11], #0x1\n"
+    "87:"  // Oddments: Bit 2: End
+
+    "88:"  // End
+
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
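
The bulk of the kernel above is the fixed-point requantisation of the int32
accumulators. A minimal scalar sketch of that rescale (illustrative only, not
part of the patch; round-half-up is used here, whereas the and/sshr/sqadd
fix-up in the assembly adjusts ties for negative values):

    #include <algorithm>
    #include <cstdint>

    static int8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                             int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // sqrdmulh: doubling, rounding, high-half multiply.
        int64_t prod = (int64_t)acc * (int64_t)mul;
        int32_t hi   = (int32_t)((prod + (1LL << 30)) >> 31);
        // srshl with a negative shift operand is a rounding shift right;
        // the per-channel shifts are stored negated.
        int32_t rs = -shift;
        if (rs > 0)
            hi = (hi + (1 << (rs - 1))) >> rs;
        // Add the output zero point and clamp to the activation range.
        return (int8_t)std::min(maxval, std::max(minval, hi + c_offset));
    }
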
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..a998fa1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+struct a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_5x5_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_5x5_mla::get_packed_size;
+
+  kern_type kernel = a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+  a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
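
For these depthfirst strategies the input patch size follows directly from the
tile and kernel geometry: input_extent = stride * (output_extent - 1) +
kernel_extent, hence the 6x6 input_rows/input_cols above for a 5x5 stride-1
kernel computing a 2x2 output tile. A compile-time sketch of the relation
(illustrative only):

    constexpr unsigned input_extent(unsigned out, unsigned stride, unsigned kern)
    {
        return stride * (out - 1) + kern;  // 1 * (2 - 1) + 5 == 6
    }
    static_assert(input_extent(2, 1, 5) == 6, "matches input_rows/input_cols");
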
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..ab64f53
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2213 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const int8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  int8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    int8_t *const *const outptrs;
+    const int8_t *inptrs[36];
+
+    Params(
+      long unsigned int n_channels,
+      const int8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      int8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
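+      // Re-order the input pointers from raster order into the order in
+      // which the assembly consumes them (only the first rows are permuted;
+      // the rest keep raster order).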
+      inptrs[0] = inptrs_raw[0];
+      inptrs[1] = inptrs_raw[1];
+      inptrs[2] = inptrs_raw[6];
+      inptrs[3] = inptrs_raw[7];
+      inptrs[4] = inptrs_raw[2];
+      inptrs[5] = inptrs_raw[8];
+      inptrs[6] = inptrs_raw[3];
+      inptrs[7] = inptrs_raw[4];
+      inptrs[8] = inptrs_raw[11];
+      inptrs[9] = inptrs_raw[12];
+      inptrs[10] = inptrs_raw[9];
+      inptrs[11] = inptrs_raw[10];
+      inptrs[12] = inptrs_raw[5];
+      inptrs[13] = inptrs_raw[13];
+      inptrs[14] = inptrs_raw[14];
+      inptrs[15] = inptrs_raw[15];
+      inptrs[16] = inptrs_raw[16];
+      inptrs[17] = inptrs_raw[17];
+      inptrs[18] = inptrs_raw[18];
+      inptrs[19] = inptrs_raw[19];
+      inptrs[20] = inptrs_raw[20];
+      inptrs[21] = inptrs_raw[21];
+      inptrs[22] = inptrs_raw[22];
+      inptrs[23] = inptrs_raw[23];
+      inptrs[24] = inptrs_raw[24];
+      inptrs[25] = inptrs_raw[25];
+      inptrs[26] = inptrs_raw[26];
+      inptrs[27] = inptrs_raw[27];
+      inptrs[28] = inptrs_raw[28];
+      inptrs[29] = inptrs_raw[29];
+      inptrs[30] = inptrs_raw[30];
+      inptrs[31] = inptrs_raw[31];
+      inptrs[32] = inptrs_raw[32];
+      inptrs[33] = inptrs_raw[33];
+      inptrs[34] = inptrs_raw[34];
+      inptrs[35] = inptrs_raw[35];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
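+  // x4 holds n_channels: the main loop processes 8 channels per iteration
+  // (x19 = n_channels >> 3) and the "Oddments" path handles n_channels & 7.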
+  __asm__ __volatile__(
+    "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "mov x10, #0x0\n"
+    "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x1, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "add x25, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "lsr x19, x4, #0x3\n"
+    "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x13, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v7.16b }, [x13]\n"
+    "add x8, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v13.16b }, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1r { v19.4s }, [x8]\n"
+    "add x8, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v16.4s }, [x20]\n"
+    "ld1r { v12.4s }, [x8]\n"
+    "ldp x17, x16, [x21, #0x0]\n"
+    "ldp x6, x8, [x21, #0x10]\n"
+    "cbz x19, 3f\n"
+    "subs x19, x19, #0x1\n"
+    "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q15, [x12, #0x0]\n"
+    "mov v18.16b, v15.16b\n"
+    "ldr q20, [x12, #0x10]\n"
+    "add x12, x12, #0x20\n"
+    "mov v11.16b, v15.16b\n"
+    "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v10.16b, v15.16b\n"
+    "ldr d0, [x3, #0x0]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "mov v5.16b, v20.16b\n"
+    "ldr d1, [x3, #0x8]\n"
+    "mov v8.16b, v20.16b\n"
+    "ldr d2, [x3, #0x10]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "mov v9.16b, v20.16b\n"
+    "ldr d3, [x3, #0x18]\n"
+    "ldr d4, [x3, #0x20]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "ldp x28, x27, [x25, #0x0]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "ldp x26, x13, [x25, #0x10]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "ldp x24, x23, [x25, #0x20]\n"
+    "ldp x22, x21, [x25, #0x30]\n"
+    "ldp x20, x0, [x25, #0x40]\n"
+    "ldr d31, [x28, x10]\n"
+    "ssubl v31.8h, v31.8b, v7.8b\n"
+    "ldr d30, [x27, x10]\n"
+    "ldr d29, [x26, x10]\n"
+    "ssubl v30.8h, v30.8b, v7.8b\n"
+    "ldr d28, [x13, x10]\n"
+    "ldr d27, [x24, x10]\n"
+    "ssubl v29.8h, v29.8b, v7.8b\n"
+    "ldr d23, [x23, x10]\n"
+    "ssubl v28.8h, v28.8b, v7.8b\n"
+    "ldr d25, [x22, x10]\n"
+    "ldr d24, [x21, x10]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "ldr d26, [x20, x10]\n"
+    "ssubl v23.8h, v23.8b, v7.8b\n"
+    "ldr d22, [x0, x10]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "ssubl v26.8h, v26.8b, v7.8b\n"
+    "ssubl v22.8h, v22.8b, v7.8b\n"
+    "beq 2f\n"
+    "1:"  // Loop
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "ldr x20, [x25, #0x50]\n"
+    "subs x19, x19, #0x1\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ldr x28, [x25, #0x58]\n"
+    "smlal v18.4s, v30.4h, v0.4h\n"
+    "ldr x0, [x25, #0x60]\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "ldr d31, [x20, x10]\n"
+    "ssubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v11.4s, v29.4h, v0.4h\n"
+    "ldr x7, [x25, #0x68]\n"
+    "smlal2 v8.4s, v29.8h, v0.8h\n"
+    "ldr x26, [x25, #0x70]\n"
+    "smlal v10.4s, v28.4h, v0.4h\n"
+    "ldr x23, [x25, #0x78]\n"
+    "smlal2 v9.4s, v28.8h, v0.8h\n"
+    "ldr d0, [x3, #0x28]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "ldr x20, [x25, #0x80]\n"
+    "smlal2 v20.4s, v30.8h, v1.8h\n"
+    "ldr d30, [x28, x10]\n"
+    "ssubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v18.4s, v27.4h, v1.4h\n"
+    "ldr x22, [x25, #0x88]\n"
+    "smlal2 v5.4s, v27.8h, v1.8h\n"
+    "ldr x13, [x25, #0x90]\n"
+    "smlal v11.4s, v28.4h, v1.4h\n"
+    "ldr x21, [x25, #0x98]\n"
+    "smlal2 v8.4s, v28.8h, v1.8h\n"
+    "ldr x14, [x25, #0xa0]\n"
+    "smlal v10.4s, v23.4h, v1.4h\n"
+    "ldr x11, [x25, #0xa8]\n"
+    "smlal2 v9.4s, v23.8h, v1.8h\n"
+    "ldr d1, [x3, #0x30]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v27.4h, v2.4h\n"
+    "ldr x24, [x25, #0xb0]\n"
+    "smlal2 v20.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x0, x10]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v18.4s, v25.4h, v2.4h\n"
+    "ldr x0, [x25, #0xb8]\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "ldr x15, [x25, #0xc0]\n"
+    "smlal v11.4s, v23.4h, v2.4h\n"
+    "ldr x9, [x25, #0xc8]\n"
+    "smlal2 v8.4s, v23.8h, v2.8h\n"
+    "ldr x27, [x25, #0xd0]\n"
+    "smlal v10.4s, v31.4h, v2.4h\n"
+    "ldr x28, [x25, #0xd8]\n"
+    "smlal2 v9.4s, v31.8h, v2.8h\n"
+    "ldr d2, [x3, #0x38]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "ldr q6, [x2, #0x0]\n"
+    "smlal2 v20.4s, v25.8h, v3.8h\n"
+    "ldr d25, [x7, x10]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "ldr x12, [x25, #0xe0]\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "ldr q21, [x5, #0x0]\n"
+    "smlal v11.4s, v31.4h, v3.4h\n"
+    "ldr q17, [x2, #0x10]\n"
+    "add x2, x2, #0x20\n"
+    "smlal2 v8.4s, v31.8h, v3.8h\n"
+    "ldr q14, [x5, #0x10]\n"
+    "add x5, x5, #0x20\n"
+    "smlal v10.4s, v30.4h, v3.4h\n"
+    "smlal2 v9.4s, v30.8h, v3.8h\n"
+    "ldr d3, [x3, #0x40]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v20.4s, v24.8h, v4.8h\n"
+    "ldr d24, [x26, x10]\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v18.4s, v27.4h, v4.4h\n"
+    "ldr x7, [x25, #0xe8]\n"
+    "smlal2 v5.4s, v27.8h, v4.8h\n"
+    "ldr d27, [x23, x10]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v11.4s, v30.4h, v4.4h\n"
+    "ldr x26, [x25, #0xf0]\n"
+    "smlal2 v8.4s, v30.8h, v4.8h\n"
+    "smlal v10.4s, v26.4h, v4.4h\n"
+    "smlal2 v9.4s, v26.8h, v4.8h\n"
+    "ldr d4, [x3, #0x48]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v29.4h, v0.4h\n"
+    "smlal2 v20.4s, v29.8h, v0.8h\n"
+    "smlal v18.4s, v28.4h, v0.4h\n"
+    "smlal2 v5.4s, v28.8h, v0.8h\n"
+    "smlal v11.4s, v22.4h, v0.4h\n"
+    "smlal2 v8.4s, v22.8h, v0.8h\n"
+    "smlal v10.4s, v25.4h, v0.4h\n"
+    "smlal2 v9.4s, v25.8h, v0.8h\n"
+    "ldr d0, [x3, #0x50]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x22, x10]\n"
+    "ssubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v18.4s, v23.4h, v1.4h\n"
+    "ldr x23, [x25, #0xf8]\n"
+    "smlal2 v5.4s, v23.8h, v1.8h\n"
+    "smlal v11.4s, v25.4h, v1.4h\n"
+    "smlal2 v8.4s, v25.8h, v1.8h\n"
+    "smlal v10.4s, v24.4h, v1.4h\n"
+    "smlal2 v9.4s, v24.8h, v1.8h\n"
+    "ldr d1, [x3, #0x58]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v23.4h, v2.4h\n"
+    "smlal2 v20.4s, v23.8h, v2.8h\n"
+    "ldr d23, [x20, x10]\n"
+    "ssubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v18.4s, v31.4h, v2.4h\n"
+    "ldr x22, [x25, #0x100]\n"
+    "smlal2 v5.4s, v31.8h, v2.8h\n"
+    "smlal v11.4s, v24.4h, v2.4h\n"
+    "smlal2 v8.4s, v24.8h, v2.8h\n"
+    "smlal v10.4s, v27.4h, v2.4h\n"
+    "smlal2 v9.4s, v27.8h, v2.8h\n"
+    "ldr d2, [x3, #0x60]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v31.4h, v3.4h\n"
+    "smlal2 v20.4s, v31.8h, v3.8h\n"
+    "ldr d31, [x13, x10]\n"
+    "ssubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v18.4s, v30.4h, v3.4h\n"
+    "ldr x20, [x25, #0x108]\n"
+    "smlal2 v5.4s, v30.8h, v3.8h\n"
+    "smlal v11.4s, v27.4h, v3.4h\n"
+    "smlal2 v8.4s, v27.8h, v3.8h\n"
+    "smlal v10.4s, v23.4h, v3.4h\n"
+    "smlal2 v9.4s, v23.8h, v3.8h\n"
+    "ldr d3, [x3, #0x68]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v30.4h, v4.4h\n"
+    "smlal2 v20.4s, v30.8h, v4.8h\n"
+    "ldr d30, [x21, x10]\n"
+    "ssubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v18.4s, v26.4h, v4.4h\n"
+    "ldr x13, [x25, #0x110]\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "ldr d26, [x14, x10]\n"
+    "ssubl v26.8h, v26.8b, v7.8b\n"
+    "smlal v11.4s, v23.4h, v4.4h\n"
+    "ldr x21, [x25, #0x118]\n"
+    "smlal2 v8.4s, v23.8h, v4.8h\n"
+    "smlal v10.4s, v28.4h, v4.4h\n"
+    "smlal2 v9.4s, v28.8h, v4.8h\n"
+    "ldr d4, [x3, #0x70]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v22.4h, v0.4h\n"
+    "smlal2 v20.4s, v22.8h, v0.8h\n"
+    "ldr d22, [x0, x10]\n"
+    "ssubl v22.8h, v22.8b, v7.8b\n"
+    "smlal v18.4s, v25.4h, v0.4h\n"
+    "smlal2 v5.4s, v25.8h, v0.8h\n"
+    "smlal v11.4s, v31.4h, v0.4h\n"
+    "smlal2 v8.4s, v31.8h, v0.8h\n"
+    "smlal v10.4s, v30.4h, v0.4h\n"
+    "smlal2 v9.4s, v30.8h, v0.8h\n"
+    "ldr d0, [x3, #0x78]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v25.4h, v1.4h\n"
+    "smlal2 v20.4s, v25.8h, v1.8h\n"
+    "ldr d25, [x11, x10]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v18.4s, v24.4h, v1.4h\n"
+    "smlal2 v5.4s, v24.8h, v1.8h\n"
+    "smlal v11.4s, v30.4h, v1.4h\n"
+    "smlal2 v8.4s, v30.8h, v1.8h\n"
+    "smlal v10.4s, v26.4h, v1.4h\n"
+    "smlal2 v9.4s, v26.8h, v1.8h\n"
+    "ldr d1, [x3, #0x80]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "smlal2 v20.4s, v24.8h, v2.8h\n"
+    "ldr d24, [x24, x10]\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v18.4s, v27.4h, v2.4h\n"
+    "smlal2 v5.4s, v27.8h, v2.8h\n"
+    "smlal v11.4s, v26.4h, v2.4h\n"
+    "smlal2 v8.4s, v26.8h, v2.8h\n"
+    "smlal v10.4s, v25.4h, v2.4h\n"
+    "smlal2 v9.4s, v25.8h, v2.8h\n"
+    "ldr d2, [x3, #0x88]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "ldr d27, [x15, x10]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v18.4s, v23.4h, v3.4h\n"
+    "smlal2 v5.4s, v23.8h, v3.8h\n"
+    "smlal v11.4s, v25.4h, v3.4h\n"
+    "smlal2 v8.4s, v25.8h, v3.8h\n"
+    "smlal v10.4s, v24.4h, v3.4h\n"
+    "smlal2 v9.4s, v24.8h, v3.8h\n"
+    "ldr d3, [x3, #0x90]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v23.4h, v4.4h\n"
+    "smlal2 v20.4s, v23.8h, v4.8h\n"
+    "ldr d23, [x9, x10]\n"
+    "ssubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v18.4s, v28.4h, v4.4h\n"
+    "smlal2 v5.4s, v28.8h, v4.8h\n"
+    "ldr d28, [x12, x10]\n"
+    "ssubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v11.4s, v24.4h, v4.4h\n"
+    "smlal2 v8.4s, v24.8h, v4.8h\n"
+    "smlal v10.4s, v22.4h, v4.4h\n"
+    "smlal2 v9.4s, v22.8h, v4.8h\n"
+    "ldr d4, [x3, #0x98]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ldr d31, [x27, x10]\n"
+    "ssubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v18.4s, v30.4h, v0.4h\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "smlal v11.4s, v27.4h, v0.4h\n"
+    "smlal2 v8.4s, v27.8h, v0.8h\n"
+    "smlal v10.4s, v23.4h, v0.4h\n"
+    "smlal2 v9.4s, v23.8h, v0.8h\n"
+    "ldr d0, [x3, #0xa0]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "smlal2 v20.4s, v30.8h, v1.8h\n"
+    "ldr d30, [x28, x10]\n"
+    "ssubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v18.4s, v26.4h, v1.4h\n"
+    "smlal2 v5.4s, v26.8h, v1.8h\n"
+    "smlal v11.4s, v23.4h, v1.4h\n"
+    "smlal2 v8.4s, v23.8h, v1.8h\n"
+    "smlal v10.4s, v31.4h, v1.4h\n"
+    "smlal2 v9.4s, v31.8h, v1.8h\n"
+    "ldr d1, [x3, #0xa8]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v26.4h, v2.4h\n"
+    "smlal2 v20.4s, v26.8h, v2.8h\n"
+    "ldr d26, [x7, x10]\n"
+    "ssubl v26.8h, v26.8b, v7.8b\n"
+    "smlal v18.4s, v25.4h, v2.4h\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "smlal v11.4s, v31.4h, v2.4h\n"
+    "smlal2 v8.4s, v31.8h, v2.8h\n"
+    "smlal v10.4s, v30.4h, v2.4h\n"
+    "smlal2 v9.4s, v30.8h, v2.8h\n"
+    "ldr d2, [x3, #0xb0]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "smlal2 v20.4s, v25.8h, v3.8h\n"
+    "ldr d25, [x26, x10]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "smlal v11.4s, v30.4h, v3.4h\n"
+    "smlal2 v8.4s, v30.8h, v3.8h\n"
+    "smlal v10.4s, v28.4h, v3.4h\n"
+    "smlal2 v9.4s, v28.8h, v3.8h\n"
+    "ldr d3, [x3, #0xb8]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v20.4s, v24.8h, v4.8h\n"
+    "ldr d24, [x23, x10]\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v18.4s, v22.4h, v4.4h\n"
+    "smlal2 v5.4s, v22.8h, v4.8h\n"
+    "smlal v11.4s, v28.4h, v4.4h\n"
+    "smlal2 v8.4s, v28.8h, v4.8h\n"
+    "smlal v10.4s, v26.4h, v4.4h\n"
+    "smlal2 v9.4s, v26.8h, v4.8h\n"
+    "ldr d4, [x3, #0xc0]\n"
+    "add x3, x3, #0xc8\n"
+    "smlal v15.4s, v27.4h, v0.4h\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "smlal2 v20.4s, v27.8h, v0.8h\n"
+    "ldr d27, [x22, x10]\n"
+    "smlal v18.4s, v23.4h, v0.4h\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal2 v5.4s, v23.8h, v0.8h\n"
+    "smlal v11.4s, v25.4h, v0.4h\n"
+    "smlal2 v8.4s, v25.8h, v0.8h\n"
+    "ldr d25, [x20, x10]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v10.4s, v24.4h, v0.4h\n"
+    "smlal2 v9.4s, v24.8h, v0.8h\n"
+    "smlal v15.4s, v23.4h, v1.4h\n"
+    "smlal2 v20.4s, v23.8h, v1.8h\n"
+    "smlal v18.4s, v31.4h, v1.4h\n"
+    "smlal2 v5.4s, v31.8h, v1.8h\n"
+    "smlal v11.4s, v24.4h, v1.4h\n"
+    "smlal2 v8.4s, v24.8h, v1.8h\n"
+    "ldr d24, [x13, x10]\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v27.4h, v1.4h\n"
+    "smlal2 v9.4s, v27.8h, v1.8h\n"
+    "smlal v15.4s, v31.4h, v2.4h\n"
+    "smlal2 v20.4s, v31.8h, v2.8h\n"
+    "smlal v18.4s, v30.4h, v2.4h\n"
+    "smlal2 v5.4s, v30.8h, v2.8h\n"
+    "smlal v11.4s, v27.4h, v2.4h\n"
+    "smlal2 v8.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x21, x10]\n"
+    "add x10, x10, #0x8\n"
+    "smlal v10.4s, v25.4h, v2.4h\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal2 v9.4s, v25.8h, v2.8h\n"
+    "smlal v15.4s, v30.4h, v3.4h\n"
+    "smlal2 v20.4s, v30.8h, v3.8h\n"
+    "smlal v18.4s, v28.4h, v3.4h\n"
+    "smlal2 v5.4s, v28.8h, v3.8h\n"
+    "smlal v11.4s, v25.4h, v3.4h\n"
+    "smlal2 v8.4s, v25.8h, v3.8h\n"
+    "smlal v10.4s, v24.4h, v3.4h\n"
+    "smlal2 v9.4s, v24.8h, v3.8h\n"
+    "smlal v15.4s, v28.4h, v4.4h\n"
+    "smlal2 v20.4s, v28.8h, v4.8h\n"
+    "smlal v18.4s, v26.4h, v4.4h\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "smlal v11.4s, v24.4h, v4.4h\n"
+    "smlal2 v8.4s, v24.8h, v4.8h\n"
+    "smlal v10.4s, v27.4h, v4.4h\n"
+    "smlal2 v9.4s, v27.8h, v4.8h\n"
+    "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+    "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+    "and v1.16b, v15.16b, v21.16b\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "and v29.16b, v20.16b, v14.16b\n"
+    "and v3.16b, v18.16b, v21.16b\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "and v2.16b, v5.16b, v14.16b\n"
+    "sqrdmulh v11.4s, v11.4s, v6.4s\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "sqadd v15.4s, v15.4s, v1.4s\n"
+    "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+    "and v0.16b, v11.16b, v21.16b\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "srshl v15.4s, v15.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v29.4s\n"
+    "sqadd v18.4s, v18.4s, v3.4s\n"
+    "sqadd v5.4s, v5.4s, v2.4s\n"
+    "and v27.16b, v8.16b, v14.16b\n"
+    "sshr v27.4s, v27.4s, #0x1f\n"
+    "add v15.4s, v15.4s, v19.4s\n"
+    "srshl v20.4s, v20.4s, v14.4s\n"
+    "srshl v18.4s, v18.4s, v21.4s\n"
+    "srshl v5.4s, v5.4s, v14.4s\n"
+    "smin v15.4s, v15.4s, v12.4s\n"
+    "add v20.4s, v20.4s, v19.4s\n"
+    "add v18.4s, v18.4s, v19.4s\n"
+    "smax v15.4s, v15.4s, v16.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smin v18.4s, v18.4s, v12.4s\n"
+    "add v5.4s, v5.4s, v19.4s\n"
+    "smax v20.4s, v20.4s, v16.4s\n"
+    "smax v18.4s, v18.4s, v16.4s\n"
+    "smin v5.4s, v5.4s, v12.4s\n"
+    "uzp1 v15.16b, v15.16b, v20.16b\n"
+    "sqadd v11.4s, v11.4s, v0.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "str d15, [x17, x1]\n"
+    "smax v5.4s, v5.4s, v16.4s\n"
+    "sqadd v8.4s, v8.4s, v27.4s\n"
+    "srshl v11.4s, v11.4s, v21.4s\n"
+    "and v30.16b, v10.16b, v21.16b\n"
+    "sshr v30.4s, v30.4s, #0x1f\n"
+    "uzp1 v18.16b, v18.16b, v5.16b\n"
+    "add v11.4s, v11.4s, v19.4s\n"
+    "srshl v8.4s, v8.4s, v14.4s\n"
+    "uzp1 v18.16b, v18.16b, v18.16b\n"
+    "str d18, [x16, x1]\n"
+    "smin v11.4s, v11.4s, v12.4s\n"
+    "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+    "add v8.4s, v8.4s, v19.4s\n"
+    "sqadd v10.4s, v10.4s, v30.4s\n"
+    "smax v11.4s, v11.4s, v16.4s\n"
+    "smin v8.4s, v8.4s, v12.4s\n"
+    "and v6.16b, v9.16b, v14.16b\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "smax v8.4s, v8.4s, v16.4s\n"
+    "srshl v10.4s, v10.4s, v21.4s\n"
+    "uzp1 v11.16b, v11.16b, v8.16b\n"
+    "add v10.4s, v10.4s, v19.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "str d11, [x6, x1]\n"
+    "smin v10.4s, v10.4s, v12.4s\n"
+    "sqadd v9.4s, v9.4s, v6.4s\n"
+    "smax v10.4s, v10.4s, v16.4s\n"
+    "srshl v9.4s, v9.4s, v14.4s\n"
+    "add v9.4s, v9.4s, v19.4s\n"
+    "smin v9.4s, v9.4s, v12.4s\n"
+    "smax v9.4s, v9.4s, v16.4s\n"
+    "uzp1 v10.16b, v10.16b, v9.16b\n"
+    "uzp1 v10.16b, v10.16b, v10.16b\n"
+    "str d10, [x8, x1]\n"
+    "add x1, x1, #0x8\n"
+    "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q15, [x12, #0x0]\n"
+    "mov v18.16b, v15.16b\n"
+    "ldr q20, [x12, #0x10]\n"
+    "add x12, x12, #0x20\n"
+    "mov v11.16b, v15.16b\n"
+    "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v10.16b, v15.16b\n"
+    "ldr d0, [x3, #0x0]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "mov v5.16b, v20.16b\n"
+    "ldr d1, [x3, #0x8]\n"
+    "mov v8.16b, v20.16b\n"
+    "ldr d2, [x3, #0x10]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "mov v9.16b, v20.16b\n"
+    "ldr d3, [x3, #0x18]\n"
+    "ldr d4, [x3, #0x20]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "ldp x28, x27, [x25, #0x0]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "ldp x26, x13, [x25, #0x10]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "ldp x24, x23, [x25, #0x20]\n"
+    "ldp x22, x21, [x25, #0x30]\n"
+    "ldp x20, x0, [x25, #0x40]\n"
+    "ldr d31, [x28, x10]\n"
+    "ssubl v31.8h, v31.8b, v7.8b\n"
+    "ldr d30, [x27, x10]\n"
+    "ldr d29, [x26, x10]\n"
+    "ssubl v30.8h, v30.8b, v7.8b\n"
+    "ldr d28, [x13, x10]\n"
+    "ldr d27, [x24, x10]\n"
+    "ssubl v29.8h, v29.8b, v7.8b\n"
+    "ldr d23, [x23, x10]\n"
+    "ssubl v28.8h, v28.8b, v7.8b\n"
+    "ldr d25, [x22, x10]\n"
+    "ldr d24, [x21, x10]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "ldr d26, [x20, x10]\n"
+    "ssubl v23.8h, v23.8b, v7.8b\n"
+    "ldr d22, [x0, x10]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "ssubl v26.8h, v26.8b, v7.8b\n"
+    "ssubl v22.8h, v22.8b, v7.8b\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "ldr x20, [x25, #0x50]\n"
+    "tst x4, #0x7\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ldr x28, [x25, #0x58]\n"
+    "smlal v18.4s, v30.4h, v0.4h\n"
+    "ldr x0, [x25, #0x60]\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "ldr d31, [x20, x10]\n"
+    "ssubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v11.4s, v29.4h, v0.4h\n"
+    "ldr x7, [x25, #0x68]\n"
+    "smlal2 v8.4s, v29.8h, v0.8h\n"
+    "ldr x26, [x25, #0x70]\n"
+    "smlal v10.4s, v28.4h, v0.4h\n"
+    "ldr x23, [x25, #0x78]\n"
+    "smlal2 v9.4s, v28.8h, v0.8h\n"
+    "ldr d0, [x3, #0x28]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "ldr x20, [x25, #0x80]\n"
+    "smlal2 v20.4s, v30.8h, v1.8h\n"
+    "ldr d30, [x28, x10]\n"
+    "ssubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v18.4s, v27.4h, v1.4h\n"
+    "ldr x22, [x25, #0x88]\n"
+    "smlal2 v5.4s, v27.8h, v1.8h\n"
+    "ldr x13, [x25, #0x90]\n"
+    "smlal v11.4s, v28.4h, v1.4h\n"
+    "ldr x21, [x25, #0x98]\n"
+    "smlal2 v8.4s, v28.8h, v1.8h\n"
+    "ldr x14, [x25, #0xa0]\n"
+    "smlal v10.4s, v23.4h, v1.4h\n"
+    "ldr x11, [x25, #0xa8]\n"
+    "smlal2 v9.4s, v23.8h, v1.8h\n"
+    "ldr d1, [x3, #0x30]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v27.4h, v2.4h\n"
+    "ldr x24, [x25, #0xb0]\n"
+    "smlal2 v20.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x0, x10]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v18.4s, v25.4h, v2.4h\n"
+    "ldr x0, [x25, #0xb8]\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "ldr x15, [x25, #0xc0]\n"
+    "smlal v11.4s, v23.4h, v2.4h\n"
+    "ldr x9, [x25, #0xc8]\n"
+    "smlal2 v8.4s, v23.8h, v2.8h\n"
+    "ldr x27, [x25, #0xd0]\n"
+    "smlal v10.4s, v31.4h, v2.4h\n"
+    "ldr x28, [x25, #0xd8]\n"
+    "smlal2 v9.4s, v31.8h, v2.8h\n"
+    "ldr d2, [x3, #0x38]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "ldr x12, [x25, #0xe0]\n"
+    "smlal2 v20.4s, v25.8h, v3.8h\n"
+    "ldr d25, [x7, x10]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "ldr x7, [x25, #0xe8]\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "ldr q6, [x2, #0x0]\n"
+    "smlal v11.4s, v31.4h, v3.4h\n"
+    "ldr q21, [x5, #0x0]\n"
+    "smlal2 v8.4s, v31.8h, v3.8h\n"
+    "ldr q17, [x2, #0x10]\n"
+    "add x2, x2, #0x20\n"
+    "smlal v10.4s, v30.4h, v3.4h\n"
+    "ldr q14, [x5, #0x10]\n"
+    "add x5, x5, #0x20\n"
+    "smlal2 v9.4s, v30.8h, v3.8h\n"
+    "ldr d3, [x3, #0x40]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v20.4s, v24.8h, v4.8h\n"
+    "ldr d24, [x26, x10]\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v18.4s, v27.4h, v4.4h\n"
+    "ldr x26, [x25, #0xf0]\n"
+    "smlal2 v5.4s, v27.8h, v4.8h\n"
+    "ldr d27, [x23, x10]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v11.4s, v30.4h, v4.4h\n"
+    "ldr x23, [x25, #0xf8]\n"
+    "smlal2 v8.4s, v30.8h, v4.8h\n"
+    "smlal v10.4s, v26.4h, v4.4h\n"
+    "smlal2 v9.4s, v26.8h, v4.8h\n"
+    "ldr d4, [x3, #0x48]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v29.4h, v0.4h\n"
+    "smlal2 v20.4s, v29.8h, v0.8h\n"
+    "smlal v18.4s, v28.4h, v0.4h\n"
+    "smlal2 v5.4s, v28.8h, v0.8h\n"
+    "smlal v11.4s, v22.4h, v0.4h\n"
+    "smlal2 v8.4s, v22.8h, v0.8h\n"
+    "smlal v10.4s, v25.4h, v0.4h\n"
+    "smlal2 v9.4s, v25.8h, v0.8h\n"
+    "ldr d0, [x3, #0x50]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x22, x10]\n"
+    "ssubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v18.4s, v23.4h, v1.4h\n"
+    "ldr x22, [x25, #0x100]\n"
+    "smlal2 v5.4s, v23.8h, v1.8h\n"
+    "smlal v11.4s, v25.4h, v1.4h\n"
+    "smlal2 v8.4s, v25.8h, v1.8h\n"
+    "smlal v10.4s, v24.4h, v1.4h\n"
+    "smlal2 v9.4s, v24.8h, v1.8h\n"
+    "ldr d1, [x3, #0x58]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v23.4h, v2.4h\n"
+    "smlal2 v20.4s, v23.8h, v2.8h\n"
+    "ldr d23, [x20, x10]\n"
+    "ssubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v18.4s, v31.4h, v2.4h\n"
+    "ldr x20, [x25, #0x108]\n"
+    "smlal2 v5.4s, v31.8h, v2.8h\n"
+    "smlal v11.4s, v24.4h, v2.4h\n"
+    "smlal2 v8.4s, v24.8h, v2.8h\n"
+    "smlal v10.4s, v27.4h, v2.4h\n"
+    "smlal2 v9.4s, v27.8h, v2.8h\n"
+    "ldr d2, [x3, #0x60]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v31.4h, v3.4h\n"
+    "smlal2 v20.4s, v31.8h, v3.8h\n"
+    "ldr d31, [x13, x10]\n"
+    "ssubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v18.4s, v30.4h, v3.4h\n"
+    "ldr x13, [x25, #0x110]\n"
+    "smlal2 v5.4s, v30.8h, v3.8h\n"
+    "smlal v11.4s, v27.4h, v3.4h\n"
+    "smlal2 v8.4s, v27.8h, v3.8h\n"
+    "smlal v10.4s, v23.4h, v3.4h\n"
+    "smlal2 v9.4s, v23.8h, v3.8h\n"
+    "ldr d3, [x3, #0x68]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v30.4h, v4.4h\n"
+    "smlal2 v20.4s, v30.8h, v4.8h\n"
+    "ldr d30, [x21, x10]\n"
+    "ssubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v18.4s, v26.4h, v4.4h\n"
+    "ldr x21, [x25, #0x118]\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "ldr d26, [x14, x10]\n"
+    "ssubl v26.8h, v26.8b, v7.8b\n"
+    "smlal v11.4s, v23.4h, v4.4h\n"
+    "smlal2 v8.4s, v23.8h, v4.8h\n"
+    "smlal v10.4s, v28.4h, v4.4h\n"
+    "smlal2 v9.4s, v28.8h, v4.8h\n"
+    "ldr d4, [x3, #0x70]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v22.4h, v0.4h\n"
+    "smlal2 v20.4s, v22.8h, v0.8h\n"
+    "ldr d22, [x0, x10]\n"
+    "ssubl v22.8h, v22.8b, v7.8b\n"
+    "smlal v18.4s, v25.4h, v0.4h\n"
+    "smlal2 v5.4s, v25.8h, v0.8h\n"
+    "smlal v11.4s, v31.4h, v0.4h\n"
+    "smlal2 v8.4s, v31.8h, v0.8h\n"
+    "smlal v10.4s, v30.4h, v0.4h\n"
+    "smlal2 v9.4s, v30.8h, v0.8h\n"
+    "ldr d0, [x3, #0x78]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v25.4h, v1.4h\n"
+    "smlal2 v20.4s, v25.8h, v1.8h\n"
+    "ldr d25, [x11, x10]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v18.4s, v24.4h, v1.4h\n"
+    "smlal2 v5.4s, v24.8h, v1.8h\n"
+    "smlal v11.4s, v30.4h, v1.4h\n"
+    "smlal2 v8.4s, v30.8h, v1.8h\n"
+    "smlal v10.4s, v26.4h, v1.4h\n"
+    "smlal2 v9.4s, v26.8h, v1.8h\n"
+    "ldr d1, [x3, #0x80]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "smlal2 v20.4s, v24.8h, v2.8h\n"
+    "ldr d24, [x24, x10]\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v18.4s, v27.4h, v2.4h\n"
+    "smlal2 v5.4s, v27.8h, v2.8h\n"
+    "smlal v11.4s, v26.4h, v2.4h\n"
+    "smlal2 v8.4s, v26.8h, v2.8h\n"
+    "smlal v10.4s, v25.4h, v2.4h\n"
+    "smlal2 v9.4s, v25.8h, v2.8h\n"
+    "ldr d2, [x3, #0x88]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "ldr d27, [x15, x10]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v18.4s, v23.4h, v3.4h\n"
+    "smlal2 v5.4s, v23.8h, v3.8h\n"
+    "smlal v11.4s, v25.4h, v3.4h\n"
+    "smlal2 v8.4s, v25.8h, v3.8h\n"
+    "smlal v10.4s, v24.4h, v3.4h\n"
+    "smlal2 v9.4s, v24.8h, v3.8h\n"
+    "ldr d3, [x3, #0x90]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v23.4h, v4.4h\n"
+    "smlal2 v20.4s, v23.8h, v4.8h\n"
+    "ldr d23, [x9, x10]\n"
+    "ssubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v18.4s, v28.4h, v4.4h\n"
+    "smlal2 v5.4s, v28.8h, v4.8h\n"
+    "ldr d28, [x12, x10]\n"
+    "ssubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v11.4s, v24.4h, v4.4h\n"
+    "smlal2 v8.4s, v24.8h, v4.8h\n"
+    "smlal v10.4s, v22.4h, v4.4h\n"
+    "smlal2 v9.4s, v22.8h, v4.8h\n"
+    "ldr d4, [x3, #0x98]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ldr d31, [x27, x10]\n"
+    "ssubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v18.4s, v30.4h, v0.4h\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "smlal v11.4s, v27.4h, v0.4h\n"
+    "smlal2 v8.4s, v27.8h, v0.8h\n"
+    "smlal v10.4s, v23.4h, v0.4h\n"
+    "smlal2 v9.4s, v23.8h, v0.8h\n"
+    "ldr d0, [x3, #0xa0]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "smlal2 v20.4s, v30.8h, v1.8h\n"
+    "ldr d30, [x28, x10]\n"
+    "ssubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v18.4s, v26.4h, v1.4h\n"
+    "smlal2 v5.4s, v26.8h, v1.8h\n"
+    "smlal v11.4s, v23.4h, v1.4h\n"
+    "smlal2 v8.4s, v23.8h, v1.8h\n"
+    "smlal v10.4s, v31.4h, v1.4h\n"
+    "smlal2 v9.4s, v31.8h, v1.8h\n"
+    "ldr d1, [x3, #0xa8]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v26.4h, v2.4h\n"
+    "smlal2 v20.4s, v26.8h, v2.8h\n"
+    "ldr d26, [x7, x10]\n"
+    "ssubl v26.8h, v26.8b, v7.8b\n"
+    "smlal v18.4s, v25.4h, v2.4h\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "smlal v11.4s, v31.4h, v2.4h\n"
+    "smlal2 v8.4s, v31.8h, v2.8h\n"
+    "smlal v10.4s, v30.4h, v2.4h\n"
+    "smlal2 v9.4s, v30.8h, v2.8h\n"
+    "ldr d2, [x3, #0xb0]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "smlal2 v20.4s, v25.8h, v3.8h\n"
+    "ldr d25, [x26, x10]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "smlal v11.4s, v30.4h, v3.4h\n"
+    "smlal2 v8.4s, v30.8h, v3.8h\n"
+    "smlal v10.4s, v28.4h, v3.4h\n"
+    "smlal2 v9.4s, v28.8h, v3.8h\n"
+    "ldr d3, [x3, #0xb8]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v20.4s, v24.8h, v4.8h\n"
+    "ldr d24, [x23, x10]\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v18.4s, v22.4h, v4.4h\n"
+    "smlal2 v5.4s, v22.8h, v4.8h\n"
+    "smlal v11.4s, v28.4h, v4.4h\n"
+    "smlal2 v8.4s, v28.8h, v4.8h\n"
+    "smlal v10.4s, v26.4h, v4.4h\n"
+    "smlal2 v9.4s, v26.8h, v4.8h\n"
+    "ldr d4, [x3, #0xc0]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v27.4h, v0.4h\n"
+    "smlal2 v20.4s, v27.8h, v0.8h\n"
+    "ldr d27, [x22, x10]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v18.4s, v23.4h, v0.4h\n"
+    "smlal2 v5.4s, v23.8h, v0.8h\n"
+    "smlal v11.4s, v25.4h, v0.4h\n"
+    "smlal2 v8.4s, v25.8h, v0.8h\n"
+    "ldr d25, [x20, x10]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v10.4s, v24.4h, v0.4h\n"
+    "smlal2 v9.4s, v24.8h, v0.8h\n"
+    "smlal v15.4s, v23.4h, v1.4h\n"
+    "smlal2 v20.4s, v23.8h, v1.8h\n"
+    "smlal v18.4s, v31.4h, v1.4h\n"
+    "smlal2 v5.4s, v31.8h, v1.8h\n"
+    "smlal v11.4s, v24.4h, v1.4h\n"
+    "smlal2 v8.4s, v24.8h, v1.8h\n"
+    "ldr d24, [x13, x10]\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v27.4h, v1.4h\n"
+    "smlal2 v9.4s, v27.8h, v1.8h\n"
+    "smlal v15.4s, v31.4h, v2.4h\n"
+    "smlal2 v20.4s, v31.8h, v2.8h\n"
+    "smlal v18.4s, v30.4h, v2.4h\n"
+    "smlal2 v5.4s, v30.8h, v2.8h\n"
+    "smlal v11.4s, v27.4h, v2.4h\n"
+    "smlal2 v8.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x21, x10]\n"
+    "add x10, x10, #0x8\n"
+    "smlal v10.4s, v25.4h, v2.4h\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal2 v9.4s, v25.8h, v2.8h\n"
+    "smlal v15.4s, v30.4h, v3.4h\n"
+    "smlal2 v20.4s, v30.8h, v3.8h\n"
+    "smlal v18.4s, v28.4h, v3.4h\n"
+    "smlal2 v5.4s, v28.8h, v3.8h\n"
+    "smlal v11.4s, v25.4h, v3.4h\n"
+    "smlal2 v8.4s, v25.8h, v3.8h\n"
+    "smlal v10.4s, v24.4h, v3.4h\n"
+    "smlal2 v9.4s, v24.8h, v3.8h\n"
+    "smlal v15.4s, v28.4h, v4.4h\n"
+    "smlal2 v20.4s, v28.8h, v4.8h\n"
+    "smlal v18.4s, v26.4h, v4.4h\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "smlal v11.4s, v24.4h, v4.4h\n"
+    "smlal2 v8.4s, v24.8h, v4.8h\n"
+    "smlal v10.4s, v27.4h, v4.4h\n"
+    "smlal2 v9.4s, v27.8h, v4.8h\n"
+    "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+    "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+    "and v1.16b, v15.16b, v21.16b\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "and v29.16b, v20.16b, v14.16b\n"
+    "and v3.16b, v18.16b, v21.16b\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "and v2.16b, v5.16b, v14.16b\n"
+    "sqrdmulh v11.4s, v11.4s, v6.4s\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "sqadd v15.4s, v15.4s, v1.4s\n"
+    "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+    "and v0.16b, v11.16b, v21.16b\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "srshl v15.4s, v15.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v29.4s\n"
+    "sqadd v18.4s, v18.4s, v3.4s\n"
+    "sqadd v5.4s, v5.4s, v2.4s\n"
+    "and v27.16b, v8.16b, v14.16b\n"
+    "sshr v27.4s, v27.4s, #0x1f\n"
+    "add v15.4s, v15.4s, v19.4s\n"
+    "srshl v20.4s, v20.4s, v14.4s\n"
+    "srshl v18.4s, v18.4s, v21.4s\n"
+    "srshl v5.4s, v5.4s, v14.4s\n"
+    "smin v15.4s, v15.4s, v12.4s\n"
+    "add v20.4s, v20.4s, v19.4s\n"
+    "add v18.4s, v18.4s, v19.4s\n"
+    "smax v15.4s, v15.4s, v16.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smin v18.4s, v18.4s, v12.4s\n"
+    "add v5.4s, v5.4s, v19.4s\n"
+    "smax v20.4s, v20.4s, v16.4s\n"
+    "smax v18.4s, v18.4s, v16.4s\n"
+    "smin v5.4s, v5.4s, v12.4s\n"
+    "uzp1 v15.16b, v15.16b, v20.16b\n"
+    "sqadd v11.4s, v11.4s, v0.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "str d15, [x17, x1]\n"
+    "smax v5.4s, v5.4s, v16.4s\n"
+    "sqadd v8.4s, v8.4s, v27.4s\n"
+    "srshl v11.4s, v11.4s, v21.4s\n"
+    "and v30.16b, v10.16b, v21.16b\n"
+    "sshr v30.4s, v30.4s, #0x1f\n"
+    "uzp1 v18.16b, v18.16b, v5.16b\n"
+    "add v11.4s, v11.4s, v19.4s\n"
+    "srshl v8.4s, v8.4s, v14.4s\n"
+    "uzp1 v18.16b, v18.16b, v18.16b\n"
+    "str d18, [x16, x1]\n"
+    "smin v11.4s, v11.4s, v12.4s\n"
+    "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+    "add v8.4s, v8.4s, v19.4s\n"
+    "sqadd v10.4s, v10.4s, v30.4s\n"
+    "smax v11.4s, v11.4s, v16.4s\n"
+    "smin v8.4s, v8.4s, v12.4s\n"
+    "and v6.16b, v9.16b, v14.16b\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "smax v8.4s, v8.4s, v16.4s\n"
+    "srshl v10.4s, v10.4s, v21.4s\n"
+    "uzp1 v11.16b, v11.16b, v8.16b\n"
+    "add v10.4s, v10.4s, v19.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "str d11, [x6, x1]\n"
+    "smin v10.4s, v10.4s, v12.4s\n"
+    "sqadd v9.4s, v9.4s, v6.4s\n"
+    "smax v10.4s, v10.4s, v16.4s\n"
+    "srshl v9.4s, v9.4s, v14.4s\n"
+    "add v9.4s, v9.4s, v19.4s\n"
+    "smin v9.4s, v9.4s, v12.4s\n"
+    "smax v9.4s, v9.4s, v16.4s\n"
+    "uzp1 v10.16b, v10.16b, v9.16b\n"
+    "uzp1 v10.16b, v10.16b, v10.16b\n"
+    "str d10, [x8, x1]\n"
+    "add x1, x1, #0x8\n"
+    "beq 124f\n"
+    "add x3, x3, #0xc8\n"
+    "3:"  // Oddments
+    "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "tbz x4, #2, 5f\n"
+    "ld1 { v15.4s }, [x12], #0x10\n"
+    "tbz x4, #1, 4f\n"
+    "ld1 { v20.d }[0], [x12], #0x8\n"
+    "tbz x4, #0, 7f\n"
+    "ld1 { v20.s }[2], [x12]\n"
+    "b 7f\n"
+    "4:"  // Oddments: Load bias: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 7f\n"
+    "ld1 { v20.s }[0], [x12]\n"
+    "b 7f\n"
+    "5:"  // Oddments: Load bias: Bit 2: Unset
+    "tbz x4, #1, 6f\n"
+    "ld1 { v15.d }[0], [x12], #0x8\n"
+    "tbz x4, #0, 7f\n"
+    "ld1 { v15.s }[2], [x12]\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 7f\n"
+    "ld1 { v15.s }[0], [x12]\n"
+    "7:"  // Oddments: Load bias: Bit 2: End
+    "mov v18.16b, v15.16b\n"
+    "ldr d0, [x3, #0x0]\n"
+    "mov v5.16b, v20.16b\n"
+    "ldr d1, [x3, #0x8]\n"
+    "mov v11.16b, v15.16b\n"
+    "ldr d2, [x3, #0x10]\n"
+    "mov v8.16b, v20.16b\n"
+    "ldr d3, [x3, #0x18]\n"
+    "mov v10.16b, v15.16b\n"
+    "ldr d4, [x3, #0x20]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "mov v9.16b, v20.16b\n"
+    "ldp x28, x27, [x25, #0x0]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "ldp x26, x13, [x25, #0x10]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "ldp x24, x23, [x25, #0x20]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "ldp x22, x21, [x25, #0x30]\n"
+    "ldp x20, x0, [x25, #0x40]\n"
+    "add x28, x28, x10\n"
+    "add x27, x27, x10\n"
+    "add x26, x26, x10\n"
+    "add x13, x13, x10\n"
+    "add x24, x24, x10\n"
+    "add x23, x23, x10\n"
+    "add x22, x22, x10\n"
+    "add x21, x21, x10\n"
+    "add x20, x20, x10\n"
+    "add x0, x0, x10\n"
+    "tbz x4, #2, 9f\n"
+    "ld1 { v31.s }[0], [x28], #0x4\n"
+    "ld1 { v30.s }[0], [x27], #0x4\n"
+    "ld1 { v29.s }[0], [x26], #0x4\n"
+    "ld1 { v28.s }[0], [x13], #0x4\n"
+    "ld1 { v27.s }[0], [x24], #0x4\n"
+    "ld1 { v23.s }[0], [x23], #0x4\n"
+    "ld1 { v25.s }[0], [x22], #0x4\n"
+    "ld1 { v24.s }[0], [x21], #0x4\n"
+    "ld1 { v26.s }[0], [x20], #0x4\n"
+    "ld1 { v22.s }[0], [x0], #0x4\n"
+    "tbz x4, #1, 8f\n"
+    "ld1 { v31.h }[2], [x28], #0x2\n"
+    "ld1 { v30.h }[2], [x27], #0x2\n"
+    "ld1 { v29.h }[2], [x26], #0x2\n"
+    "ld1 { v28.h }[2], [x13], #0x2\n"
+    "ld1 { v27.h }[2], [x24], #0x2\n"
+    "ld1 { v23.h }[2], [x23], #0x2\n"
+    "ld1 { v25.h }[2], [x22], #0x2\n"
+    "ld1 { v24.h }[2], [x21], #0x2\n"
+    "ld1 { v26.h }[2], [x20], #0x2\n"
+    "ld1 { v22.h }[2], [x0], #0x2\n"
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[6], [x28]\n"
+    "ld1 { v30.b }[6], [x27]\n"
+    "ld1 { v29.b }[6], [x26]\n"
+    "ld1 { v28.b }[6], [x13]\n"
+    "ld1 { v27.b }[6], [x24]\n"
+    "ld1 { v23.b }[6], [x23]\n"
+    "ld1 { v25.b }[6], [x22]\n"
+    "ld1 { v24.b }[6], [x21]\n"
+    "ld1 { v26.b }[6], [x20]\n"
+    "ld1 { v22.b }[6], [x0]\n"
+    "b 11f\n"
+    "8:"  // Oddments: Initial loads: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[4], [x28]\n"
+    "ld1 { v30.b }[4], [x27]\n"
+    "ld1 { v29.b }[4], [x26]\n"
+    "ld1 { v28.b }[4], [x13]\n"
+    "ld1 { v27.b }[4], [x24]\n"
+    "ld1 { v23.b }[4], [x23]\n"
+    "ld1 { v25.b }[4], [x22]\n"
+    "ld1 { v24.b }[4], [x21]\n"
+    "ld1 { v26.b }[4], [x20]\n"
+    "ld1 { v22.b }[4], [x0]\n"
+    "b 11f\n"
+    "9:"  // Oddments: Initial loads: Bit 2: Unset
+    "tbz x4, #1, 10f\n"
+    "ld1 { v31.h }[0], [x28], #0x2\n"
+    "ld1 { v30.h }[0], [x27], #0x2\n"
+    "ld1 { v29.h }[0], [x26], #0x2\n"
+    "ld1 { v28.h }[0], [x13], #0x2\n"
+    "ld1 { v27.h }[0], [x24], #0x2\n"
+    "ld1 { v23.h }[0], [x23], #0x2\n"
+    "ld1 { v25.h }[0], [x22], #0x2\n"
+    "ld1 { v24.h }[0], [x21], #0x2\n"
+    "ld1 { v26.h }[0], [x20], #0x2\n"
+    "ld1 { v22.h }[0], [x0], #0x2\n"
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[2], [x28]\n"
+    "ld1 { v30.b }[2], [x27]\n"
+    "ld1 { v29.b }[2], [x26]\n"
+    "ld1 { v28.b }[2], [x13]\n"
+    "ld1 { v27.b }[2], [x24]\n"
+    "ld1 { v23.b }[2], [x23]\n"
+    "ld1 { v25.b }[2], [x22]\n"
+    "ld1 { v24.b }[2], [x21]\n"
+    "ld1 { v26.b }[2], [x20]\n"
+    "ld1 { v22.b }[2], [x0]\n"
+    "b 11f\n"
+    "10:"  // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[0], [x28]\n"
+    "ld1 { v30.b }[0], [x27]\n"
+    "ld1 { v29.b }[0], [x26]\n"
+    "ld1 { v28.b }[0], [x13]\n"
+    "ld1 { v27.b }[0], [x24]\n"
+    "ld1 { v23.b }[0], [x23]\n"
+    "ld1 { v25.b }[0], [x22]\n"
+    "ld1 { v24.b }[0], [x21]\n"
+    "ld1 { v26.b }[0], [x20]\n"
+    "ld1 { v22.b }[0], [x0]\n"
+    "11:"  // Oddments: Initial loads: Bit 2: End
+    "ldr x20, [x25, #0x50]\n"
+    "ssubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "ssubl v30.8h, v30.8b, v7.8b\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ssubl v29.8h, v29.8b, v7.8b\n"
+    "ssubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v18.4s, v30.4h, v0.4h\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "ssubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v11.4s, v29.4h, v0.4h\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal2 v8.4s, v29.8h, v0.8h\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v28.4h, v0.4h\n"
+    "ssubl v26.8h, v26.8b, v7.8b\n"
+    "smlal2 v9.4s, v28.8h, v0.8h\n"
+    "ssubl v22.8h, v22.8b, v7.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "smlal2 v20.4s, v30.8h, v1.8h\n"
+    "add x20, x20, x10\n"
+    "smlal v18.4s, v27.4h, v1.4h\n"
+    "smlal2 v5.4s, v27.8h, v1.8h\n"
+    "smlal v11.4s, v28.4h, v1.4h\n"
+    "smlal2 v8.4s, v28.8h, v1.8h\n"
+    "smlal v10.4s, v23.4h, v1.4h\n"
+    "smlal2 v9.4s, v23.8h, v1.8h\n"
+    "smlal v15.4s, v27.4h, v2.4h\n"
+    "smlal2 v20.4s, v27.8h, v2.8h\n"
+    "smlal v18.4s, v25.4h, v2.4h\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "smlal v11.4s, v23.4h, v2.4h\n"
+    "smlal2 v8.4s, v23.8h, v2.8h\n"
+    "tbz x4, #2, 13f\n"
+    "ld1 { v31.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 12f\n"
+    "ld1 { v31.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 15f\n"
+    "ld1 { v31.b }[6], [x20]\n"
+    "b 15f\n"
+    "12:"  // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 15f\n"
+    "ld1 { v31.b }[4], [x20]\n"
+    "b 15f\n"
+    "13:"  // Oddments: Load (1, 3): Bit 2: Unset
+    "tbz x4, #1, 14f\n"
+    "ld1 { v31.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 15f\n"
+    "ld1 { v31.b }[2], [x20]\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 15f\n"
+    "ld1 { v31.b }[0], [x20]\n"
+    "15:"  // Oddments: Load (1, 3): Bit 2: End
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "ldr x28, [x25, #0x58]\n"
+    "ssubl v31.8h, v31.8b, v7.8b\n"
+    "smlal2 v20.4s, v25.8h, v3.8h\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "add x28, x28, x10\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "smlal v10.4s, v31.4h, v2.4h\n"
+    "smlal2 v9.4s, v31.8h, v2.8h\n"
+    "smlal v11.4s, v31.4h, v3.4h\n"
+    "smlal2 v8.4s, v31.8h, v3.8h\n"
+    "tbz x4, #2, 17f\n"
+    "ld1 { v30.s }[0], [x28], #0x4\n"
+    "tbz x4, #1, 16f\n"
+    "ld1 { v30.h }[2], [x28], #0x2\n"
+    "tbz x4, #0, 19f\n"
+    "ld1 { v30.b }[6], [x28]\n"
+    "b 19f\n"
+    "16:"  // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 19f\n"
+    "ld1 { v30.b }[4], [x28]\n"
+    "b 19f\n"
+    "17:"  // Oddments: Load (1, 4): Bit 2: Unset
+    "tbz x4, #1, 18f\n"
+    "ld1 { v30.h }[0], [x28], #0x2\n"
+    "tbz x4, #0, 19f\n"
+    "ld1 { v30.b }[2], [x28]\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 19f\n"
+    "ld1 { v30.b }[0], [x28]\n"
+    "19:"  // Oddments: Load (1, 4): Bit 2: End
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "ldr x0, [x25, #0x60]\n"
+    "ssubl v30.8h, v30.8b, v7.8b\n"
+    "smlal2 v20.4s, v24.8h, v4.8h\n"
+    "add x0, x0, x10\n"
+    "smlal v10.4s, v30.4h, v3.4h\n"
+    "smlal2 v9.4s, v30.8h, v3.8h\n"
+    "tbz x4, #2, 21f\n"
+    "ld1 { v27.s }[0], [x0], #0x4\n"
+    "tbz x4, #1, 20f\n"
+    "ld1 { v27.h }[2], [x0], #0x2\n"
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[6], [x0]\n"
+    "b 23f\n"
+    "20:"  // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[4], [x0]\n"
+    "b 23f\n"
+    "21:"  // Oddments: Load (0, 5): Bit 2: Unset
+    "tbz x4, #1, 22f\n"
+    "ld1 { v27.h }[0], [x0], #0x2\n"
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[2], [x0]\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[0], [x0]\n"
+    "23:"  // Oddments: Load (0, 5): Bit 2: End
+    "smlal v11.4s, v30.4h, v4.4h\n"
+    "ldr d0, [x3, #0x28]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal2 v8.4s, v30.8h, v4.8h\n"
+    "ldr x7, [x25, #0x68]\n"
+    "smlal v10.4s, v26.4h, v4.4h\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "smlal2 v9.4s, v26.8h, v4.8h\n"
+    "add x7, x7, x10\n"
+    "smlal v18.4s, v27.4h, v4.4h\n"
+    "smlal2 v5.4s, v27.8h, v4.8h\n"
+    "smlal v15.4s, v29.4h, v0.4h\n"
+    "smlal2 v20.4s, v29.8h, v0.8h\n"
+    "smlal v18.4s, v28.4h, v0.4h\n"
+    "smlal2 v5.4s, v28.8h, v0.8h\n"
+    "smlal v11.4s, v22.4h, v0.4h\n"
+    "smlal2 v8.4s, v22.8h, v0.8h\n"
+    "tbz x4, #2, 25f\n"
+    "ld1 { v25.s }[0], [x7], #0x4\n"
+    "tbz x4, #1, 24f\n"
+    "ld1 { v25.h }[2], [x7], #0x2\n"
+    "tbz x4, #0, 27f\n"
+    "ld1 { v25.b }[6], [x7]\n"
+    "b 27f\n"
+    "24:"  // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 27f\n"
+    "ld1 { v25.b }[4], [x7]\n"
+    "b 27f\n"
+    "25:"  // Oddments: Load (2, 1): Bit 2: Unset
+    "tbz x4, #1, 26f\n"
+    "ld1 { v25.h }[0], [x7], #0x2\n"
+    "tbz x4, #0, 27f\n"
+    "ld1 { v25.b }[2], [x7]\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 27f\n"
+    "ld1 { v25.b }[0], [x7]\n"
+    "27:"  // Oddments: Load (2, 1): Bit 2: End
+    "ldr d1, [x3, #0x30]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v10.4s, v25.4h, v0.4h\n"
+    "ldr x26, [x25, #0x70]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "smlal2 v9.4s, v25.8h, v0.8h\n"
+    "add x26, x26, x10\n"
+    "smlal v15.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "smlal v18.4s, v23.4h, v1.4h\n"
+    "smlal2 v5.4s, v23.8h, v1.8h\n"
+    "smlal v11.4s, v25.4h, v1.4h\n"
+    "smlal2 v8.4s, v25.8h, v1.8h\n"
+    "tbz x4, #2, 29f\n"
+    "ld1 { v24.s }[0], [x26], #0x4\n"
+    "tbz x4, #1, 28f\n"
+    "ld1 { v24.h }[2], [x26], #0x2\n"
+    "tbz x4, #0, 31f\n"
+    "ld1 { v24.b }[6], [x26]\n"
+    "b 31f\n"
+    "28:"  // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 31f\n"
+    "ld1 { v24.b }[4], [x26]\n"
+    "b 31f\n"
+    "29:"  // Oddments: Load (2, 2): Bit 2: Unset
+    "tbz x4, #1, 30f\n"
+    "ld1 { v24.h }[0], [x26], #0x2\n"
+    "tbz x4, #0, 31f\n"
+    "ld1 { v24.b }[2], [x26]\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 31f\n"
+    "ld1 { v24.b }[0], [x26]\n"
+    "31:"  // Oddments: Load (2, 2): Bit 2: End
+    "ldr d2, [x3, #0x38]\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v24.4h, v1.4h\n"
+    "ldr x23, [x25, #0x78]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "smlal2 v9.4s, v24.8h, v1.8h\n"
+    "add x23, x23, x10\n"
+    "smlal v15.4s, v23.4h, v2.4h\n"
+    "smlal2 v20.4s, v23.8h, v2.8h\n"
+    "smlal v18.4s, v31.4h, v2.4h\n"
+    "smlal2 v5.4s, v31.8h, v2.8h\n"
+    "smlal v11.4s, v24.4h, v2.4h\n"
+    "smlal2 v8.4s, v24.8h, v2.8h\n"
+    "tbz x4, #2, 33f\n"
+    "ld1 { v27.s }[0], [x23], #0x4\n"
+    "tbz x4, #1, 32f\n"
+    "ld1 { v27.h }[2], [x23], #0x2\n"
+    "tbz x4, #0, 35f\n"
+    "ld1 { v27.b }[6], [x23]\n"
+    "b 35f\n"
+    "32:"  // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 35f\n"
+    "ld1 { v27.b }[4], [x23]\n"
+    "b 35f\n"
+    "33:"  // Oddments: Load (2, 3): Bit 2: Unset
+    "tbz x4, #1, 34f\n"
+    "ld1 { v27.h }[0], [x23], #0x2\n"
+    "tbz x4, #0, 35f\n"
+    "ld1 { v27.b }[2], [x23]\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 35f\n"
+    "ld1 { v27.b }[0], [x23]\n"
+    "35:"  // Oddments: Load (2, 3): Bit 2: End
+    "ldr d3, [x3, #0x40]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v10.4s, v27.4h, v2.4h\n"
+    "ldr x20, [x25, #0x80]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "smlal2 v9.4s, v27.8h, v2.8h\n"
+    "add x20, x20, x10\n"
+    "smlal v15.4s, v31.4h, v3.4h\n"
+    "smlal2 v20.4s, v31.8h, v3.8h\n"
+    "smlal v18.4s, v30.4h, v3.4h\n"
+    "smlal2 v5.4s, v30.8h, v3.8h\n"
+    "smlal v11.4s, v27.4h, v3.4h\n"
+    "smlal2 v8.4s, v27.8h, v3.8h\n"
+    "tbz x4, #2, 37f\n"
+    "ld1 { v23.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 36f\n"
+    "ld1 { v23.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 39f\n"
+    "ld1 { v23.b }[6], [x20]\n"
+    "b 39f\n"
+    "36:"  // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 39f\n"
+    "ld1 { v23.b }[4], [x20]\n"
+    "b 39f\n"
+    "37:"  // Oddments: Load (2, 4): Bit 2: Unset
+    "tbz x4, #1, 38f\n"
+    "ld1 { v23.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 39f\n"
+    "ld1 { v23.b }[2], [x20]\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 39f\n"
+    "ld1 { v23.b }[0], [x20]\n"
+    "39:"  // Oddments: Load (2, 4): Bit 2: End
+    "ldr d4, [x3, #0x48]\n"
+    "ssubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v10.4s, v23.4h, v3.4h\n"
+    "ldr x22, [x25, #0x88]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "smlal2 v9.4s, v23.8h, v3.8h\n"
+    "add x22, x22, x10\n"
+    "smlal v15.4s, v30.4h, v4.4h\n"
+    "smlal2 v20.4s, v30.8h, v4.8h\n"
+    "smlal v18.4s, v26.4h, v4.4h\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "smlal v11.4s, v23.4h, v4.4h\n"
+    "smlal2 v8.4s, v23.8h, v4.8h\n"
+    "tbz x4, #2, 41f\n"
+    "ld1 { v28.s }[0], [x22], #0x4\n"
+    "tbz x4, #1, 40f\n"
+    "ld1 { v28.h }[2], [x22], #0x2\n"
+    "tbz x4, #0, 43f\n"
+    "ld1 { v28.b }[6], [x22]\n"
+    "b 43f\n"
+    "40:"  // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 43f\n"
+    "ld1 { v28.b }[4], [x22]\n"
+    "b 43f\n"
+    "41:"  // Oddments: Load (2, 5): Bit 2: Unset
+    "tbz x4, #1, 42f\n"
+    "ld1 { v28.h }[0], [x22], #0x2\n"
+    "tbz x4, #0, 43f\n"
+    "ld1 { v28.b }[2], [x22]\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 43f\n"
+    "ld1 { v28.b }[0], [x22]\n"
+    "43:"  // Oddments: Load (2, 5): Bit 2: End
+    "ldr d0, [x3, #0x50]\n"
+    "ssubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v10.4s, v28.4h, v4.4h\n"
+    "ldr x13, [x25, #0x90]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "smlal2 v9.4s, v28.8h, v4.8h\n"
+    "add x13, x13, x10\n"
+    "smlal v15.4s, v22.4h, v0.4h\n"
+    "smlal2 v20.4s, v22.8h, v0.8h\n"
+    "smlal v18.4s, v25.4h, v0.4h\n"
+    "smlal2 v5.4s, v25.8h, v0.8h\n"
+    "tbz x4, #2, 45f\n"
+    "ld1 { v31.s }[0], [x13], #0x4\n"
+    "tbz x4, #1, 44f\n"
+    "ld1 { v31.h }[2], [x13], #0x2\n"
+    "tbz x4, #0, 47f\n"
+    "ld1 { v31.b }[6], [x13]\n"
+    "b 47f\n"
+    "44:"  // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 47f\n"
+    "ld1 { v31.b }[4], [x13]\n"
+    "b 47f\n"
+    "45:"  // Oddments: Load (3, 0): Bit 2: Unset
+    "tbz x4, #1, 46f\n"
+    "ld1 { v31.h }[0], [x13], #0x2\n"
+    "tbz x4, #0, 47f\n"
+    "ld1 { v31.b }[2], [x13]\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 47f\n"
+    "ld1 { v31.b }[0], [x13]\n"
+    "47:"  // Oddments: Load (3, 0): Bit 2: End
+    "ldr x21, [x25, #0x98]\n"
+    "ssubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v11.4s, v31.4h, v0.4h\n"
+    "smlal2 v8.4s, v31.8h, v0.8h\n"
+    "add x21, x21, x10\n"
+    "tbz x4, #2, 49f\n"
+    "ld1 { v30.s }[0], [x21], #0x4\n"
+    "tbz x4, #1, 48f\n"
+    "ld1 { v30.h }[2], [x21], #0x2\n"
+    "tbz x4, #0, 51f\n"
+    "ld1 { v30.b }[6], [x21]\n"
+    "b 51f\n"
+    "48:"  // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 51f\n"
+    "ld1 { v30.b }[4], [x21]\n"
+    "b 51f\n"
+    "49:"  // Oddments: Load (3, 1): Bit 2: Unset
+    "tbz x4, #1, 50f\n"
+    "ld1 { v30.h }[0], [x21], #0x2\n"
+    "tbz x4, #0, 51f\n"
+    "ld1 { v30.b }[2], [x21]\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 51f\n"
+    "ld1 { v30.b }[0], [x21]\n"
+    "51:"  // Oddments: Load (3, 1): Bit 2: End
+    "ldr d1, [x3, #0x58]\n"
+    "ssubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v10.4s, v30.4h, v0.4h\n"
+    "ldr x14, [x25, #0xa0]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "smlal2 v9.4s, v30.8h, v0.8h\n"
+    "add x14, x14, x10\n"
+    "smlal v15.4s, v25.4h, v1.4h\n"
+    "smlal2 v20.4s, v25.8h, v1.8h\n"
+    "smlal v18.4s, v24.4h, v1.4h\n"
+    "smlal2 v5.4s, v24.8h, v1.8h\n"
+    "smlal v11.4s, v30.4h, v1.4h\n"
+    "smlal2 v8.4s, v30.8h, v1.8h\n"
+    "tbz x4, #2, 53f\n"
+    "ld1 { v26.s }[0], [x14], #0x4\n"
+    "tbz x4, #1, 52f\n"
+    "ld1 { v26.h }[2], [x14], #0x2\n"
+    "tbz x4, #0, 55f\n"
+    "ld1 { v26.b }[6], [x14]\n"
+    "b 55f\n"
+    "52:"  // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 55f\n"
+    "ld1 { v26.b }[4], [x14]\n"
+    "b 55f\n"
+    "53:"  // Oddments: Load (3, 2): Bit 2: Unset
+    "tbz x4, #1, 54f\n"
+    "ld1 { v26.h }[0], [x14], #0x2\n"
+    "tbz x4, #0, 55f\n"
+    "ld1 { v26.b }[2], [x14]\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 55f\n"
+    "ld1 { v26.b }[0], [x14]\n"
+    "55:"  // Oddments: Load (3, 2): Bit 2: End
+    "ldr d2, [x3, #0x60]\n"
+    "ssubl v26.8h, v26.8b, v7.8b\n"
+    "smlal v10.4s, v26.4h, v1.4h\n"
+    "ldr x11, [x25, #0xa8]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "smlal2 v9.4s, v26.8h, v1.8h\n"
+    "add x11, x11, x10\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "smlal2 v20.4s, v24.8h, v2.8h\n"
+    "smlal v18.4s, v27.4h, v2.4h\n"
+    "smlal2 v5.4s, v27.8h, v2.8h\n"
+    "smlal v11.4s, v26.4h, v2.4h\n"
+    "smlal2 v8.4s, v26.8h, v2.8h\n"
+    "tbz x4, #2, 57f\n"
+    "ld1 { v25.s }[0], [x11], #0x4\n"
+    "tbz x4, #1, 56f\n"
+    "ld1 { v25.h }[2], [x11], #0x2\n"
+    "tbz x4, #0, 59f\n"
+    "ld1 { v25.b }[6], [x11]\n"
+    "b 59f\n"
+    "56:"  // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 59f\n"
+    "ld1 { v25.b }[4], [x11]\n"
+    "b 59f\n"
+    "57:"  // Oddments: Load (3, 3): Bit 2: Unset
+    "tbz x4, #1, 58f\n"
+    "ld1 { v25.h }[0], [x11], #0x2\n"
+    "tbz x4, #0, 59f\n"
+    "ld1 { v25.b }[2], [x11]\n"
+    "b 59f\n"
+    "58:"  // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 59f\n"
+    "ld1 { v25.b }[0], [x11]\n"
+    "59:"  // Oddments: Load (3, 3): Bit 2: End
+    "ldr d3, [x3, #0x68]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v10.4s, v25.4h, v2.4h\n"
+    "ldr x24, [x25, #0xb0]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "smlal2 v9.4s, v25.8h, v2.8h\n"
+    "add x24, x24, x10\n"
+    "smlal v15.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "smlal v18.4s, v23.4h, v3.4h\n"
+    "smlal2 v5.4s, v23.8h, v3.8h\n"
+    "smlal v11.4s, v25.4h, v3.4h\n"
+    "smlal2 v8.4s, v25.8h, v3.8h\n"
+    "tbz x4, #2, 61f\n"
+    "ld1 { v24.s }[0], [x24], #0x4\n"
+    "tbz x4, #1, 60f\n"
+    "ld1 { v24.h }[2], [x24], #0x2\n"
+    "tbz x4, #0, 63f\n"
+    "ld1 { v24.b }[6], [x24]\n"
+    "b 63f\n"
+    "60:"  // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 63f\n"
+    "ld1 { v24.b }[4], [x24]\n"
+    "b 63f\n"
+    "61:"  // Oddments: Load (3, 4): Bit 2: Unset
+    "tbz x4, #1, 62f\n"
+    "ld1 { v24.h }[0], [x24], #0x2\n"
+    "tbz x4, #0, 63f\n"
+    "ld1 { v24.b }[2], [x24]\n"
+    "b 63f\n"
+    "62:"  // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 63f\n"
+    "ld1 { v24.b }[0], [x24]\n"
+    "63:"  // Oddments: Load (3, 4): Bit 2: End
+    "ldr d4, [x3, #0x70]\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v24.4h, v3.4h\n"
+    "ldr x0, [x25, #0xb8]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "smlal2 v9.4s, v24.8h, v3.8h\n"
+    "add x0, x0, x10\n"
+    "smlal v15.4s, v23.4h, v4.4h\n"
+    "smlal2 v20.4s, v23.8h, v4.8h\n"
+    "smlal v18.4s, v28.4h, v4.4h\n"
+    "smlal2 v5.4s, v28.8h, v4.8h\n"
+    "smlal v11.4s, v24.4h, v4.4h\n"
+    "smlal2 v8.4s, v24.8h, v4.8h\n"
+    "tbz x4, #2, 65f\n"
+    "ld1 { v22.s }[0], [x0], #0x4\n"
+    "tbz x4, #1, 64f\n"
+    "ld1 { v22.h }[2], [x0], #0x2\n"
+    "tbz x4, #0, 67f\n"
+    "ld1 { v22.b }[6], [x0]\n"
+    "b 67f\n"
+    "64:"  // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 67f\n"
+    "ld1 { v22.b }[4], [x0]\n"
+    "b 67f\n"
+    "65:"  // Oddments: Load (3, 5): Bit 2: Unset
+    "tbz x4, #1, 66f\n"
+    "ld1 { v22.h }[0], [x0], #0x2\n"
+    "tbz x4, #0, 67f\n"
+    "ld1 { v22.b }[2], [x0]\n"
+    "b 67f\n"
+    "66:"  // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 67f\n"
+    "ld1 { v22.b }[0], [x0]\n"
+    "67:"  // Oddments: Load (3, 5): Bit 2: End
+    "ldr d0, [x3, #0x78]\n"
+    "ssubl v22.8h, v22.8b, v7.8b\n"
+    "smlal v10.4s, v22.4h, v4.4h\n"
+    "ldr x15, [x25, #0xc0]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "smlal2 v9.4s, v22.8h, v4.8h\n"
+    "add x15, x15, x10\n"
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "smlal v18.4s, v30.4h, v0.4h\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "tbz x4, #2, 69f\n"
+    "ld1 { v27.s }[0], [x15], #0x4\n"
+    "tbz x4, #1, 68f\n"
+    "ld1 { v27.h }[2], [x15], #0x2\n"
+    "tbz x4, #0, 71f\n"
+    "ld1 { v27.b }[6], [x15]\n"
+    "b 71f\n"
+    "68:"  // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 71f\n"
+    "ld1 { v27.b }[4], [x15]\n"
+    "b 71f\n"
+    "69:"  // Oddments: Load (4, 0): Bit 2: Unset
+    "tbz x4, #1, 70f\n"
+    "ld1 { v27.h }[0], [x15], #0x2\n"
+    "tbz x4, #0, 71f\n"
+    "ld1 { v27.b }[2], [x15]\n"
+    "b 71f\n"
+    "70:"  // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 71f\n"
+    "ld1 { v27.b }[0], [x15]\n"
+    "71:"  // Oddments: Load (4, 0): Bit 2: End
+    "ldr x9, [x25, #0xc8]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v11.4s, v27.4h, v0.4h\n"
+    "smlal2 v8.4s, v27.8h, v0.8h\n"
+    "add x9, x9, x10\n"
+    "tbz x4, #2, 73f\n"
+    "ld1 { v23.s }[0], [x9], #0x4\n"
+    "tbz x4, #1, 72f\n"
+    "ld1 { v23.h }[2], [x9], #0x2\n"
+    "tbz x4, #0, 75f\n"
+    "ld1 { v23.b }[6], [x9]\n"
+    "b 75f\n"
+    "72:"  // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 75f\n"
+    "ld1 { v23.b }[4], [x9]\n"
+    "b 75f\n"
+    "73:"  // Oddments: Load (4, 1): Bit 2: Unset
+    "tbz x4, #1, 74f\n"
+    "ld1 { v23.h }[0], [x9], #0x2\n"
+    "tbz x4, #0, 75f\n"
+    "ld1 { v23.b }[2], [x9]\n"
+    "b 75f\n"
+    "74:"  // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 75f\n"
+    "ld1 { v23.b }[0], [x9]\n"
+    "75:"  // Oddments: Load (4, 1): Bit 2: End
+    "ldr d1, [x3, #0x80]\n"
+    "ssubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v10.4s, v23.4h, v0.4h\n"
+    "ldr x27, [x25, #0xd0]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "smlal2 v9.4s, v23.8h, v0.8h\n"
+    "add x27, x27, x10\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "smlal2 v20.4s, v30.8h, v1.8h\n"
+    "smlal v18.4s, v26.4h, v1.4h\n"
+    "smlal2 v5.4s, v26.8h, v1.8h\n"
+    "smlal v11.4s, v23.4h, v1.4h\n"
+    "smlal2 v8.4s, v23.8h, v1.8h\n"
+    "tbz x4, #2, 77f\n"
+    "ld1 { v31.s }[0], [x27], #0x4\n"
+    "tbz x4, #1, 76f\n"
+    "ld1 { v31.h }[2], [x27], #0x2\n"
+    "tbz x4, #0, 79f\n"
+    "ld1 { v31.b }[6], [x27]\n"
+    "b 79f\n"
+    "76:"  // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 79f\n"
+    "ld1 { v31.b }[4], [x27]\n"
+    "b 79f\n"
+    "77:"  // Oddments: Load (4, 2): Bit 2: Unset
+    "tbz x4, #1, 78f\n"
+    "ld1 { v31.h }[0], [x27], #0x2\n"
+    "tbz x4, #0, 79f\n"
+    "ld1 { v31.b }[2], [x27]\n"
+    "b 79f\n"
+    "78:"  // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 79f\n"
+    "ld1 { v31.b }[0], [x27]\n"
+    "79:"  // Oddments: Load (4, 2): Bit 2: End
+    "ldr d2, [x3, #0x88]\n"
+    "ssubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v10.4s, v31.4h, v1.4h\n"
+    "ldr x28, [x25, #0xd8]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "smlal2 v9.4s, v31.8h, v1.8h\n"
+    "add x28, x28, x10\n"
+    "smlal v15.4s, v26.4h, v2.4h\n"
+    "smlal2 v20.4s, v26.8h, v2.8h\n"
+    "smlal v18.4s, v25.4h, v2.4h\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "smlal v11.4s, v31.4h, v2.4h\n"
+    "smlal2 v8.4s, v31.8h, v2.8h\n"
+    "tbz x4, #2, 81f\n"
+    "ld1 { v30.s }[0], [x28], #0x4\n"
+    "tbz x4, #1, 80f\n"
+    "ld1 { v30.h }[2], [x28], #0x2\n"
+    "tbz x4, #0, 83f\n"
+    "ld1 { v30.b }[6], [x28]\n"
+    "b 83f\n"
+    "80:"  // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 83f\n"
+    "ld1 { v30.b }[4], [x28]\n"
+    "b 83f\n"
+    "81:"  // Oddments: Load (4, 3): Bit 2: Unset
+    "tbz x4, #1, 82f\n"
+    "ld1 { v30.h }[0], [x28], #0x2\n"
+    "tbz x4, #0, 83f\n"
+    "ld1 { v30.b }[2], [x28]\n"
+    "b 83f\n"
+    "82:"  // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 83f\n"
+    "ld1 { v30.b }[0], [x28]\n"
+    "83:"  // Oddments: Load (4, 3): Bit 2: End
+    "ldr d3, [x3, #0x90]\n"
+    "ssubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v10.4s, v30.4h, v2.4h\n"
+    "ldr x12, [x25, #0xe0]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "smlal2 v9.4s, v30.8h, v2.8h\n"
+    "add x12, x12, x10\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "smlal2 v20.4s, v25.8h, v3.8h\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "smlal v11.4s, v30.4h, v3.4h\n"
+    "smlal2 v8.4s, v30.8h, v3.8h\n"
+    "tbz x4, #2, 85f\n"
+    "ld1 { v28.s }[0], [x12], #0x4\n"
+    "tbz x4, #1, 84f\n"
+    "ld1 { v28.h }[2], [x12], #0x2\n"
+    "tbz x4, #0, 87f\n"
+    "ld1 { v28.b }[6], [x12]\n"
+    "b 87f\n"
+    "84:"  // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 87f\n"
+    "ld1 { v28.b }[4], [x12]\n"
+    "b 87f\n"
+    "85:"  // Oddments: Load (4, 4): Bit 2: Unset
+    "tbz x4, #1, 86f\n"
+    "ld1 { v28.h }[0], [x12], #0x2\n"
+    "tbz x4, #0, 87f\n"
+    "ld1 { v28.b }[2], [x12]\n"
+    "b 87f\n"
+    "86:"  // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 87f\n"
+    "ld1 { v28.b }[0], [x12]\n"
+    "87:"  // Oddments: Load (4, 4): Bit 2: End
+    "ldr d4, [x3, #0x98]\n"
+    "ssubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v10.4s, v28.4h, v3.4h\n"
+    "ldr x7, [x25, #0xe8]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "smlal2 v9.4s, v28.8h, v3.8h\n"
+    "add x7, x7, x10\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v20.4s, v24.8h, v4.8h\n"
+    "smlal v18.4s, v22.4h, v4.4h\n"
+    "smlal2 v5.4s, v22.8h, v4.8h\n"
+    "smlal v11.4s, v28.4h, v4.4h\n"
+    "smlal2 v8.4s, v28.8h, v4.8h\n"
+    "tbz x4, #2, 89f\n"
+    "ld1 { v26.s }[0], [x7], #0x4\n"
+    "tbz x4, #1, 88f\n"
+    "ld1 { v26.h }[2], [x7], #0x2\n"
+    "tbz x4, #0, 91f\n"
+    "ld1 { v26.b }[6], [x7]\n"
+    "b 91f\n"
+    "88:"  // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 91f\n"
+    "ld1 { v26.b }[4], [x7]\n"
+    "b 91f\n"
+    "89:"  // Oddments: Load (4, 5): Bit 2: Unset
+    "tbz x4, #1, 90f\n"
+    "ld1 { v26.h }[0], [x7], #0x2\n"
+    "tbz x4, #0, 91f\n"
+    "ld1 { v26.b }[2], [x7]\n"
+    "b 91f\n"
+    "90:"  // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 91f\n"
+    "ld1 { v26.b }[0], [x7]\n"
+    "91:"  // Oddments: Load (4, 5): Bit 2: End
+    "ldr d0, [x3, #0xa0]\n"
+    "ssubl v26.8h, v26.8b, v7.8b\n"
+    "smlal v10.4s, v26.4h, v4.4h\n"
+    "ldr x26, [x25, #0xf0]\n"
+    "ssubl v0.8h, v0.8b, v13.8b\n"
+    "smlal2 v9.4s, v26.8h, v4.8h\n"
+    "add x26, x26, x10\n"
+    "smlal v15.4s, v27.4h, v0.4h\n"
+    "smlal2 v20.4s, v27.8h, v0.8h\n"
+    "smlal v18.4s, v23.4h, v0.4h\n"
+    "smlal2 v5.4s, v23.8h, v0.8h\n"
+    "tbz x4, #2, 93f\n"
+    "ld1 { v25.s }[0], [x26], #0x4\n"
+    "tbz x4, #1, 92f\n"
+    "ld1 { v25.h }[2], [x26], #0x2\n"
+    "tbz x4, #0, 95f\n"
+    "ld1 { v25.b }[6], [x26]\n"
+    "b 95f\n"
+    "92:"  // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 95f\n"
+    "ld1 { v25.b }[4], [x26]\n"
+    "b 95f\n"
+    "93:"  // Oddments: Load (5, 0): Bit 2: Unset
+    "tbz x4, #1, 94f\n"
+    "ld1 { v25.h }[0], [x26], #0x2\n"
+    "tbz x4, #0, 95f\n"
+    "ld1 { v25.b }[2], [x26]\n"
+    "b 95f\n"
+    "94:"  // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 95f\n"
+    "ld1 { v25.b }[0], [x26]\n"
+    "95:"  // Oddments: Load (5, 0): Bit 2: End
+    "ldr x23, [x25, #0xf8]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v11.4s, v25.4h, v0.4h\n"
+    "smlal2 v8.4s, v25.8h, v0.8h\n"
+    "add x23, x23, x10\n"
+    "tbz x4, #2, 97f\n"
+    "ld1 { v24.s }[0], [x23], #0x4\n"
+    "tbz x4, #1, 96f\n"
+    "ld1 { v24.h }[2], [x23], #0x2\n"
+    "tbz x4, #0, 99f\n"
+    "ld1 { v24.b }[6], [x23]\n"
+    "b 99f\n"
+    "96:"  // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 99f\n"
+    "ld1 { v24.b }[4], [x23]\n"
+    "b 99f\n"
+    "97:"  // Oddments: Load (5, 1): Bit 2: Unset
+    "tbz x4, #1, 98f\n"
+    "ld1 { v24.h }[0], [x23], #0x2\n"
+    "tbz x4, #0, 99f\n"
+    "ld1 { v24.b }[2], [x23]\n"
+    "b 99f\n"
+    "98:"  // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 99f\n"
+    "ld1 { v24.b }[0], [x23]\n"
+    "99:"  // Oddments: Load (5, 1): Bit 2: End
+    "ldr d1, [x3, #0xa8]\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v24.4h, v0.4h\n"
+    "ldr x22, [x25, #0x100]\n"
+    "ssubl v1.8h, v1.8b, v13.8b\n"
+    "smlal2 v9.4s, v24.8h, v0.8h\n"
+    "add x22, x22, x10\n"
+    "smlal v15.4s, v23.4h, v1.4h\n"
+    "smlal2 v20.4s, v23.8h, v1.8h\n"
+    "smlal v18.4s, v31.4h, v1.4h\n"
+    "smlal2 v5.4s, v31.8h, v1.8h\n"
+    "smlal v11.4s, v24.4h, v1.4h\n"
+    "smlal2 v8.4s, v24.8h, v1.8h\n"
+    "tbz x4, #2, 101f\n"
+    "ld1 { v27.s }[0], [x22], #0x4\n"
+    "tbz x4, #1, 100f\n"
+    "ld1 { v27.h }[2], [x22], #0x2\n"
+    "tbz x4, #0, 103f\n"
+    "ld1 { v27.b }[6], [x22]\n"
+    "b 103f\n"
+    "100:"  // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 103f\n"
+    "ld1 { v27.b }[4], [x22]\n"
+    "b 103f\n"
+    "101:"  // Oddments: Load (5, 2): Bit 2: Unset
+    "tbz x4, #1, 102f\n"
+    "ld1 { v27.h }[0], [x22], #0x2\n"
+    "tbz x4, #0, 103f\n"
+    "ld1 { v27.b }[2], [x22]\n"
+    "b 103f\n"
+    "102:"  // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 103f\n"
+    "ld1 { v27.b }[0], [x22]\n"
+    "103:"  // Oddments: Load (5, 2): Bit 2: End
+    "ldr d2, [x3, #0xb0]\n"
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v10.4s, v27.4h, v1.4h\n"
+    "ldr x20, [x25, #0x108]\n"
+    "ssubl v2.8h, v2.8b, v13.8b\n"
+    "smlal2 v9.4s, v27.8h, v1.8h\n"
+    "add x20, x20, x10\n"
+    "smlal v15.4s, v31.4h, v2.4h\n"
+    "smlal2 v20.4s, v31.8h, v2.8h\n"
+    "smlal v18.4s, v30.4h, v2.4h\n"
+    "smlal2 v5.4s, v30.8h, v2.8h\n"
+    "smlal v11.4s, v27.4h, v2.4h\n"
+    "smlal2 v8.4s, v27.8h, v2.8h\n"
+    "tbz x4, #2, 105f\n"
+    "ld1 { v25.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 104f\n"
+    "ld1 { v25.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 107f\n"
+    "ld1 { v25.b }[6], [x20]\n"
+    "b 107f\n"
+    "104:"  // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 107f\n"
+    "ld1 { v25.b }[4], [x20]\n"
+    "b 107f\n"
+    "105:"  // Oddments: Load (5, 3): Bit 2: Unset
+    "tbz x4, #1, 106f\n"
+    "ld1 { v25.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 107f\n"
+    "ld1 { v25.b }[2], [x20]\n"
+    "b 107f\n"
+    "106:"  // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 107f\n"
+    "ld1 { v25.b }[0], [x20]\n"
+    "107:"  // Oddments: Load (5, 3): Bit 2: End
+    "ldr d3, [x3, #0xb8]\n"
+    "ssubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v10.4s, v25.4h, v2.4h\n"
+    "ldr x13, [x25, #0x110]\n"
+    "ssubl v3.8h, v3.8b, v13.8b\n"
+    "smlal2 v9.4s, v25.8h, v2.8h\n"
+    "add x13, x13, x10\n"
+    "smlal v15.4s, v30.4h, v3.4h\n"
+    "smlal2 v20.4s, v30.8h, v3.8h\n"
+    "smlal v18.4s, v28.4h, v3.4h\n"
+    "smlal2 v5.4s, v28.8h, v3.8h\n"
+    "smlal v11.4s, v25.4h, v3.4h\n"
+    "smlal2 v8.4s, v25.8h, v3.8h\n"
+    "tbz x4, #2, 109f\n"
+    "ld1 { v24.s }[0], [x13], #0x4\n"
+    "tbz x4, #1, 108f\n"
+    "ld1 { v24.h }[2], [x13], #0x2\n"
+    "tbz x4, #0, 111f\n"
+    "ld1 { v24.b }[6], [x13]\n"
+    "b 111f\n"
+    "108:"  // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 111f\n"
+    "ld1 { v24.b }[4], [x13]\n"
+    "b 111f\n"
+    "109:"  // Oddments: Load (5, 4): Bit 2: Unset
+    "tbz x4, #1, 110f\n"
+    "ld1 { v24.h }[0], [x13], #0x2\n"
+    "tbz x4, #0, 111f\n"
+    "ld1 { v24.b }[2], [x13]\n"
+    "b 111f\n"
+    "110:"  // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 111f\n"
+    "ld1 { v24.b }[0], [x13]\n"
+    "111:"  // Oddments: Load (5, 4): Bit 2: End
+    "ldr d4, [x3, #0xc0]\n"
+    "ssubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v24.4h, v3.4h\n"
+    "ldr x21, [x25, #0x118]\n"
+    "ssubl v4.8h, v4.8b, v13.8b\n"
+    "smlal2 v9.4s, v24.8h, v3.8h\n"
+    "add x21, x21, x10\n"
+    "smlal v15.4s, v28.4h, v4.4h\n"
+    "smlal2 v20.4s, v28.8h, v4.8h\n"
+    "smlal v18.4s, v26.4h, v4.4h\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "smlal v11.4s, v24.4h, v4.4h\n"
+    "smlal2 v8.4s, v24.8h, v4.8h\n"
+    "tbz x4, #2, 113f\n"
+    "ld1 { v27.s }[0], [x21], #0x4\n"
+    "tbz x4, #1, 112f\n"
+    "ld1 { v27.h }[2], [x21], #0x2\n"
+    "tbz x4, #0, 115f\n"
+    "ld1 { v27.b }[6], [x21]\n"
+    "b 115f\n"
+    "112:"  // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 115f\n"
+    "ld1 { v27.b }[4], [x21]\n"
+    "b 115f\n"
+    "113:"  // Oddments: Load (5, 5): Bit 2: Unset
+    "tbz x4, #1, 114f\n"
+    "ld1 { v27.h }[0], [x21], #0x2\n"
+    "tbz x4, #0, 115f\n"
+    "ld1 { v27.b }[2], [x21]\n"
+    "b 115f\n"
+    "114:"  // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 115f\n"
+    "ld1 { v27.b }[0], [x21]\n"
+    "115:"  // Oddments: Load (5, 5): Bit 2: End
+    "ssubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v10.4s, v27.4h, v4.4h\n"
+    "smlal2 v9.4s, v27.8h, v4.8h\n"
+    "tbz x4, #2, 117f\n"
+    "ld1 { v6.4s }, [x2], #0x10\n"
+    "ld1 { v21.4s }, [x5], #0x10\n"
+    "tbz x4, #1, 116f\n"
+    "ld1 { v17.d }[0], [x2], #0x8\n"
+    "ld1 { v14.d }[0], [x5], #0x8\n"
+    "tbz x4, #0, 119f\n"
+    "ld1 { v17.s }[2], [x2]\n"
+    "ld1 { v14.s }[2], [x5]\n"
+    "b 119f\n"
+    "116:"  // Oddments: Load requant params: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 119f\n"
+    "ld1 { v17.s }[0], [x2]\n"
+    "ld1 { v14.s }[0], [x5]\n"
+    "b 119f\n"
+    "117:"  // Oddments: Load requant params: Bit 2: Unset
+    "tbz x4, #1, 118f\n"
+    "ld1 { v6.d }[0], [x2], #0x8\n"
+    "ld1 { v21.d }[0], [x5], #0x8\n"
+    "tbz x4, #0, 119f\n"
+    "ld1 { v6.s }[2], [x2]\n"
+    "ld1 { v21.s }[2], [x5]\n"
+    "b 119f\n"
+    "118:"  // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 119f\n"
+    "ld1 { v6.s }[0], [x2]\n"
+    "ld1 { v21.s }[0], [x5]\n"
+    "119:"  // Oddments: Load requant params: Bit 2: End
+    "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+    "add x17, x17, x1\n"
+    "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+    "add x16, x16, x1\n"
+    "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+    "add x6, x6, x1\n"
+    "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+    "add x8, x8, x1\n"
+    "sqrdmulh v11.4s, v11.4s, v6.4s\n"
+    "and v1.16b, v15.16b, v21.16b\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "and v29.16b, v20.16b, v14.16b\n"
+    "and v3.16b, v18.16b, v21.16b\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "and v2.16b, v5.16b, v14.16b\n"
+    "and v0.16b, v11.16b, v21.16b\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "sqadd v15.4s, v15.4s, v1.4s\n"
+    "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+    "sqadd v20.4s, v20.4s, v29.4s\n"
+    "sqadd v18.4s, v18.4s, v3.4s\n"
+    "srshl v15.4s, v15.4s, v21.4s\n"
+    "sqadd v5.4s, v5.4s, v2.4s\n"
+    "srshl v20.4s, v20.4s, v14.4s\n"
+    "srshl v18.4s, v18.4s, v21.4s\n"
+    "add v15.4s, v15.4s, v19.4s\n"
+    "srshl v5.4s, v5.4s, v14.4s\n"
+    "add v20.4s, v20.4s, v19.4s\n"
+    "smin v15.4s, v15.4s, v12.4s\n"
+    "add v18.4s, v18.4s, v19.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v15.4s, v15.4s, v16.4s\n"
+    "smin v18.4s, v18.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v16.4s\n"
+    "add v5.4s, v5.4s, v19.4s\n"
+    "smax v18.4s, v18.4s, v16.4s\n"
+    "uzp1 v15.16b, v15.16b, v20.16b\n"
+    "smin v5.4s, v5.4s, v12.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "sqadd v11.4s, v11.4s, v0.4s\n"
+    "smax v5.4s, v5.4s, v16.4s\n"
+    "and v27.16b, v8.16b, v14.16b\n"
+    "sshr v27.4s, v27.4s, #0x1f\n"
+    "uzp1 v18.16b, v18.16b, v5.16b\n"
+    "srshl v11.4s, v11.4s, v21.4s\n"
+    "and v30.16b, v10.16b, v21.16b\n"
+    "sshr v30.4s, v30.4s, #0x1f\n"
+    "uzp1 v18.16b, v18.16b, v18.16b\n"
+    "add v11.4s, v11.4s, v19.4s\n"
+    "sqadd v8.4s, v8.4s, v27.4s\n"
+    "and v6.16b, v9.16b, v14.16b\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "smin v11.4s, v11.4s, v12.4s\n"
+    "srshl v8.4s, v8.4s, v14.4s\n"
+    "sqadd v10.4s, v10.4s, v30.4s\n"
+    "smax v11.4s, v11.4s, v16.4s\n"
+    "add v8.4s, v8.4s, v19.4s\n"
+    "srshl v10.4s, v10.4s, v21.4s\n"
+    "sqadd v9.4s, v9.4s, v6.4s\n"
+    "smin v8.4s, v8.4s, v12.4s\n"
+    "add v10.4s, v10.4s, v19.4s\n"
+    "srshl v9.4s, v9.4s, v14.4s\n"
+    "smax v8.4s, v8.4s, v16.4s\n"
+    "smin v10.4s, v10.4s, v12.4s\n"
+    "uzp1 v11.16b, v11.16b, v8.16b\n"
+    "add v9.4s, v9.4s, v19.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "smax v10.4s, v10.4s, v16.4s\n"
+    "smin v9.4s, v9.4s, v12.4s\n"
+    "smax v9.4s, v9.4s, v16.4s\n"
+    "uzp1 v10.16b, v10.16b, v9.16b\n"
+    "uzp1 v10.16b, v10.16b, v10.16b\n"
+    "tbz x4, #2, 121f\n"
+    "st1 { v15.s }[0], [x17], #0x4\n"
+    "st1 { v18.s }[0], [x16], #0x4\n"
+    "st1 { v11.s }[0], [x6], #0x4\n"
+    "st1 { v10.s }[0], [x8], #0x4\n"
+    "tbz x4, #1, 120f\n"
+    "st1 { v15.h }[2], [x17], #0x2\n"
+    "st1 { v18.h }[2], [x16], #0x2\n"
+    "st1 { v11.h }[2], [x6], #0x2\n"
+    "st1 { v10.h }[2], [x8], #0x2\n"
+    "tbz x4, #0, 123f\n"
+    "st1 { v15.b }[6], [x17], #0x1\n"
+    "st1 { v18.b }[6], [x16], #0x1\n"
+    "st1 { v11.b }[6], [x6], #0x1\n"
+    "st1 { v10.b }[6], [x8], #0x1\n"
+    "b 123f\n"
+    "120:"  // Oddments: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 123f\n"
+    "st1 { v15.b }[4], [x17], #0x1\n"
+    "st1 { v18.b }[4], [x16], #0x1\n"
+    "st1 { v11.b }[4], [x6], #0x1\n"
+    "st1 { v10.b }[4], [x8], #0x1\n"
+    "b 123f\n"
+    "121:"  // Oddments: Bit 2: Unset
+    "tbz x4, #1, 122f\n"
+    "st1 { v15.h }[0], [x17], #0x2\n"
+    "st1 { v18.h }[0], [x16], #0x2\n"
+    "st1 { v11.h }[0], [x6], #0x2\n"
+    "st1 { v10.h }[0], [x8], #0x2\n"
+    "tbz x4, #0, 123f\n"
+    "st1 { v15.b }[2], [x17], #0x1\n"
+    "st1 { v18.b }[2], [x16], #0x1\n"
+    "st1 { v11.b }[2], [x6], #0x1\n"
+    "st1 { v10.b }[2], [x8], #0x1\n"
+    "b 123f\n"
+    "122:"  // Oddments: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 123f\n"
+    "st1 { v15.b }[0], [x17], #0x1\n"
+    "st1 { v18.b }[0], [x16], #0x1\n"
+    "st1 { v11.b }[0], [x6], #0x1\n"
+    "st1 { v10.b }[0], [x8], #0x1\n"
+    "123:"  // Oddments: Bit 2: End
+
+    "124:"  // End
+
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000..4e845cc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
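+// Strategy descriptor: exposes the element types, vector-length category and
+// output tile size that the depthfirst driver template needs in order to
+// instantiate this kernel.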
+struct a64_s8q_nhwc_generic_output9_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int n_output_points = 9;
+
+  kern_type kernel = a64_s8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+  a64_s8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..ad5545a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
+  const int8_t *const *const inptrs,
+  int8_t *const *const outptrs,
+  const void *params,
+  const arm_gemm::Requantize32& qp,
+  const unsigned int n_points,
+  const unsigned int n_channels
+)
+{
+  __asm__ __volatile__(
+    "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ld1r { v12.4s }, [x19]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+    "ld1r { v11.4s }, [x20]\n"
+    "ld1r { v10.16b }, [x19]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v9.16b }, [x20]\n"
+    "ld1r { v8.4s }, [x19]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+    "ld1r { v7.4s }, [x20]\n"
+    "ld1r { v6.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+    "mov x11, #0x0\n"
+    "ld1r { v5.4s }, [x19]\n"
+    "lsr x10, %x[n_channels], #0x2\n"
+    "cbz x10, 6f\n"
+    "1:"  // Channel loop
+    "movi v27.4s, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "lsl x19, x11, #0x2\n"
+    "ldr q27, [%x[bias], x19]\n"
+    "2:"  // Channel loop: Load bias: Done
+    "mov v26.16b, v27.16b\n"
+    "ldr s16, [%x[params]], #0x4\n"
+    "mov x20, %x[inptrs]\n"
+    "mov v25.16b, v27.16b\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "subs x19, %x[n_points], #0x1\n"
+    "mov v24.16b, v27.16b\n"
+    "ldr s4, [x9, x11]\n"
+    "mov v23.16b, v27.16b\n"
+    "mov v22.16b, v27.16b\n"
+    "ldr s3, [x28, x11]\n"
+    "mov v21.16b, v27.16b\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "mov v20.16b, v27.16b\n"
+    "ldr s2, [x27, x11]\n"
+    "mov v19.16b, v27.16b\n"
+    "ssubl v16.8h, v16.8b, v9.8b\n"
+    "ldr s1, [x26, x11]\n"
+    "ssubl v4.8h, v4.8b, v10.8b\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "ssubl v3.8h, v3.8b, v10.8b\n"
+    "ldr s0, [x25, x11]\n"
+    "ssubl v2.8h, v2.8b, v10.8b\n"
+    "ssubl v1.8h, v1.8b, v10.8b\n"
+    "ldr s31, [x24, x11]\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "ssubl v0.8h, v0.8b, v10.8b\n"
+    "ldr s30, [x23, x11]\n"
+    "ldr s29, [x22, x11]\n"
+    "ssubl v31.8h, v31.8b, v10.8b\n"
+    "ldr x21, [x20], #0x8\n"
+    "ssubl v30.8h, v30.8b, v10.8b\n"
+    "ldr s28, [x21, x11]\n"
+    "ssubl v29.8h, v29.8b, v10.8b\n"
+    "ssubl v28.8h, v28.8b, v10.8b\n"
+    "ble 4f\n"
+    "3:"  // Channel loop: Planar loop
+    "smlal v27.4s, v4.4h, v16.4h\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "subs x19, x19, #0x1\n"
+    "smlal v26.4s, v3.4h, v16.4h\n"
+    "ldr s4, [x9, x11]\n"
+    "smlal v25.4s, v2.4h, v16.4h\n"
+    "smlal v24.4s, v1.4h, v16.4h\n"
+    "ldr s3, [x28, x11]\n"
+    "smlal v23.4s, v0.4h, v16.4h\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "smlal v22.4s, v31.4h, v16.4h\n"
+    "smlal v21.4s, v30.4h, v16.4h\n"
+    "ldr s2, [x27, x11]\n"
+    "smlal v20.4s, v29.4h, v16.4h\n"
+    "smlal v19.4s, v28.4h, v16.4h\n"
+    "ldr s16, [%x[params]], #0x4\n"
+    "ssubl v4.8h, v4.8b, v10.8b\n"
+    "ldr s1, [x26, x11]\n"
+    "ssubl v3.8h, v3.8b, v10.8b\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "ssubl v2.8h, v2.8b, v10.8b\n"
+    "ldr s0, [x25, x11]\n"
+    "ssubl v16.8h, v16.8b, v9.8b\n"
+    "ssubl v1.8h, v1.8b, v10.8b\n"
+    "ldr s31, [x24, x11]\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "ssubl v0.8h, v0.8b, v10.8b\n"
+    "ldr s30, [x23, x11]\n"
+    "ldr s29, [x22, x11]\n"
+    "ssubl v31.8h, v31.8b, v10.8b\n"
+    "ldr x21, [x20], #0x8\n"
+    "ssubl v30.8h, v30.8b, v10.8b\n"
+    "ldr s28, [x21, x11]\n"
+    "ssubl v29.8h, v29.8b, v10.8b\n"
+    "ssubl v28.8h, v28.8b, v10.8b\n"
+    "bgt 3b\n"
+    "4:"  // Channel loop: Planar tail
+    "smlal v27.4s, v4.4h, v16.4h\n"
+    "smlal v26.4s, v3.4h, v16.4h\n"
+    "smlal v25.4s, v2.4h, v16.4h\n"
+    "smlal v24.4s, v1.4h, v16.4h\n"
+    "smlal v23.4s, v0.4h, v16.4h\n"
+    "smlal v22.4s, v31.4h, v16.4h\n"
+    "smlal v21.4s, v30.4h, v16.4h\n"
+    "smlal v20.4s, v29.4h, v16.4h\n"
+    "smlal v19.4s, v28.4h, v16.4h\n"
+    "cbz %x[rq_mul_ptr], 5f\n"
+    "lsl x19, x11, #0x2\n"
+    "ldr q6, [%x[rq_mul_ptr], x19]\n"
+    "ldr q5, [%x[rq_right_shift_ptr], x19]\n"
+    "cbz %x[rq_left_shift_ptr], 5f\n"
+    "ldr q7, [%x[rq_left_shift_ptr], x19]\n"
+    "5:"  // Channel loop: Load quantisation parameters: Done
+    "sshl v27.4s, v27.4s, v7.4s\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    "sshl v26.4s, v26.4s, v7.4s\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "sshl v25.4s, v25.4s, v7.4s\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "sqrdmulh v25.4s, v25.4s, v6.4s\n"
+    "sshl v24.4s, v24.4s, v7.4s\n"
+    "and v16.16b, v27.16b, v5.16b\n"
+    "and v18.16b, v26.16b, v5.16b\n"
+    "and v17.16b, v25.16b, v5.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "sqadd v26.4s, v26.4s, v18.4s\n"
+    "sqadd v25.4s, v25.4s, v17.4s\n"
+    "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+    "srshl v27.4s, v27.4s, v5.4s\n"
+    "srshl v26.4s, v26.4s, v5.4s\n"
+    "srshl v25.4s, v25.4s, v5.4s\n"
+    "and v16.16b, v24.16b, v5.16b\n"
+    "add v27.4s, v27.4s, v8.4s\n"
+    "add v26.4s, v26.4s, v8.4s\n"
+    "add v25.4s, v25.4s, v8.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v27.4s, v27.4s, v12.4s\n"
+    "smax v26.4s, v26.4s, v12.4s\n"
+    "sqadd v24.4s, v24.4s, v16.4s\n"
+    "smin v27.4s, v27.4s, v11.4s\n"
+    "smin v26.4s, v26.4s, v11.4s\n"
+    "smax v25.4s, v25.4s, v12.4s\n"
+    "srshl v24.4s, v24.4s, v5.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "smin v25.4s, v25.4s, v11.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s27, [x27, x11]\n"
+    "add v24.4s, v24.4s, v8.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x26, x11]\n"
+    "smax v24.4s, v24.4s, v12.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x25, x11]\n"
+    "sshl v23.4s, v23.4s, v7.4s\n"
+    "sshl v22.4s, v22.4s, v7.4s\n"
+    "smin v24.4s, v24.4s, v11.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v6.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sshl v21.4s, v21.4s, v7.4s\n"
+    "and v17.16b, v23.16b, v5.16b\n"
+    "and v16.16b, v22.16b, v5.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x24, x11]\n"
+    "sqadd v23.4s, v23.4s, v17.4s\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "and v16.16b, v21.16b, v5.16b\n"
+    "sshl v20.4s, v20.4s, v7.4s\n"
+    "sshl v19.4s, v19.4s, v7.4s\n"
+    "srshl v23.4s, v23.4s, v5.4s\n"
+    "srshl v22.4s, v22.4s, v5.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v20.4s, v20.4s, v6.4s\n"
+    "add v23.4s, v23.4s, v8.4s\n"
+    "add v22.4s, v22.4s, v8.4s\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "and v17.16b, v20.16b, v5.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v6.4s\n"
+    "smax v23.4s, v23.4s, v12.4s\n"
+    "srshl v21.4s, v21.4s, v5.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v19.16b, v5.16b\n"
+    "smin v23.4s, v23.4s, v11.4s\n"
+    "add v21.4s, v21.4s, v8.4s\n"
+    "sqadd v20.4s, v20.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "smax v21.4s, v21.4s, v12.4s\n"
+    "srshl v20.4s, v20.4s, v5.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "smin v21.4s, v21.4s, v11.4s\n"
+    "add v20.4s, v20.4s, v8.4s\n"
+    "srshl v19.4s, v19.4s, v5.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x23, x11]\n"
+    "add v19.4s, v19.4s, v8.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x22, x11]\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x21, x11]\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x20, x11]\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x19, x11]\n"
+    "add x11, x11, #0x4\n"
+    "cmp x11, x10, LSL #2\n"
+    "blt 1b\n"
+    "6:"  // Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 24f\n"
+    "movi v27.4s, #0x0\n"
+    "cbz %x[bias], 9f\n"
+    "add x19, %x[bias], x11, LSL #2\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ld1 { v27.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v27.s }[2], [x19], #0x4\n"
+    "b 8f\n"
+    "7:"  // Oddments: Load bias: Bit 1: Unset
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v27.s }[0], [x19], #0x4\n"
+    "8:"  // Oddments: Load bias: Bit 1: End
+
+    "9:"  // Oddments: Load bias: Done
+    "mov v26.16b, v27.16b\n"
+    "ldr s16, [%x[params]], #0x4\n"
+    "mov x20, %x[inptrs]\n"
+    "mov v25.16b, v27.16b\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "add x9, x9, x11\n"
+    "mov v24.16b, v27.16b\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "mov v23.16b, v27.16b\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "mov v22.16b, v27.16b\n"
+    "add x28, x28, x11\n"
+    "mov v21.16b, v27.16b\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "mov v20.16b, v27.16b\n"
+    "add x27, x27, x11\n"
+    "mov v19.16b, v27.16b\n"
+    "ldr x21, [x20], #0x8\n"
+    "ssubl v16.8h, v16.8b, v9.8b\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ldr h4, [x9], #0x2\n"
+    "ldr h3, [x28], #0x2\n"
+    "ldr h2, [x27], #0x2\n"
+    "ldr h1, [x26], #0x2\n"
+    "ldr h0, [x25], #0x2\n"
+    "ldr h31, [x24], #0x2\n"
+    "ldr h30, [x23], #0x2\n"
+    "ldr h29, [x22], #0x2\n"
+    "ldr h28, [x21], #0x2\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v4.b }[2], [x9], #0x1\n"
+    "ld1 { v3.b }[2], [x28], #0x1\n"
+    "ld1 { v2.b }[2], [x27], #0x1\n"
+    "ld1 { v1.b }[2], [x26], #0x1\n"
+    "ld1 { v0.b }[2], [x25], #0x1\n"
+    "ld1 { v31.b }[2], [x24], #0x1\n"
+    "ld1 { v30.b }[2], [x23], #0x1\n"
+    "ld1 { v29.b }[2], [x22], #0x1\n"
+    "ld1 { v28.b }[2], [x21], #0x1\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load: Bit 1: Unset
+    "tbz %x[n_channels], #0, 11f\n"
+    "ldr b4, [x9], #0x1\n"
+    "ldr b3, [x28], #0x1\n"
+    "ldr b2, [x27], #0x1\n"
+    "ldr b1, [x26], #0x1\n"
+    "ldr b0, [x25], #0x1\n"
+    "ldr b31, [x24], #0x1\n"
+    "ldr b30, [x23], #0x1\n"
+    "ldr b29, [x22], #0x1\n"
+    "ldr b28, [x21], #0x1\n"
+    "11:"  // Oddments: Load: Bit 1: End
+    "ssubl v4.8h, v4.8b, v10.8b\n"
+    "subs x19, %x[n_points], #0x1\n"
+    "ssubl v3.8h, v3.8b, v10.8b\n"
+    "ssubl v2.8h, v2.8b, v10.8b\n"
+    "ssubl v1.8h, v1.8b, v10.8b\n"
+    "ssubl v0.8h, v0.8b, v10.8b\n"
+    "ssubl v31.8h, v31.8b, v10.8b\n"
+    "ssubl v30.8h, v30.8b, v10.8b\n"
+    "ssubl v29.8h, v29.8b, v10.8b\n"
+    "ssubl v28.8h, v28.8b, v10.8b\n"
+    "ble 15f\n"
+    "12:"  // Oddments: Planar loop
+    "smlal v27.4s, v4.4h, v16.4h\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "add x9, x9, x11\n"
+    "smlal v26.4s, v3.4h, v16.4h\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "smlal v25.4s, v2.4h, v16.4h\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "smlal v24.4s, v1.4h, v16.4h\n"
+    "add x28, x28, x11\n"
+    "smlal v23.4s, v0.4h, v16.4h\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "smlal v22.4s, v31.4h, v16.4h\n"
+    "add x27, x27, x11\n"
+    "smlal v21.4s, v30.4h, v16.4h\n"
+    "ldr x21, [x20], #0x8\n"
+    "smlal v20.4s, v29.4h, v16.4h\n"
+    "add x26, x26, x11\n"
+    "smlal v19.4s, v28.4h, v16.4h\n"
+    "ldr s16, [%x[params]], #0x4\n"
+    "add x25, x25, x11\n"
+    "ssubl v16.8h, v16.8b, v9.8b\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr h4, [x9], #0x2\n"
+    "ldr h3, [x28], #0x2\n"
+    "ldr h2, [x27], #0x2\n"
+    "ldr h1, [x26], #0x2\n"
+    "ldr h0, [x25], #0x2\n"
+    "ldr h31, [x24], #0x2\n"
+    "ldr h30, [x23], #0x2\n"
+    "ldr h29, [x22], #0x2\n"
+    "ldr h28, [x21], #0x2\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v4.b }[2], [x9], #0x1\n"
+    "ld1 { v3.b }[2], [x28], #0x1\n"
+    "ld1 { v2.b }[2], [x27], #0x1\n"
+    "ld1 { v1.b }[2], [x26], #0x1\n"
+    "ld1 { v0.b }[2], [x25], #0x1\n"
+    "ld1 { v31.b }[2], [x24], #0x1\n"
+    "ld1 { v30.b }[2], [x23], #0x1\n"
+    "ld1 { v29.b }[2], [x22], #0x1\n"
+    "ld1 { v28.b }[2], [x21], #0x1\n"
+    "b 14f\n"
+    "13:"  // Oddments: Planar loop: Load: Bit 1: Unset
+    "tbz %x[n_channels], #0, 14f\n"
+    "ldr b4, [x9], #0x1\n"
+    "ldr b3, [x28], #0x1\n"
+    "ldr b2, [x27], #0x1\n"
+    "ldr b1, [x26], #0x1\n"
+    "ldr b0, [x25], #0x1\n"
+    "ldr b31, [x24], #0x1\n"
+    "ldr b30, [x23], #0x1\n"
+    "ldr b29, [x22], #0x1\n"
+    "ldr b28, [x21], #0x1\n"
+    "14:"  // Oddments: Planar loop: Load: Bit 1: End
+    "ssubl v4.8h, v4.8b, v10.8b\n"
+    "subs x19, x19, #0x1\n"
+    "ssubl v3.8h, v3.8b, v10.8b\n"
+    "ssubl v2.8h, v2.8b, v10.8b\n"
+    "ssubl v1.8h, v1.8b, v10.8b\n"
+    "ssubl v0.8h, v0.8b, v10.8b\n"
+    "ssubl v31.8h, v31.8b, v10.8b\n"
+    "ssubl v30.8h, v30.8b, v10.8b\n"
+    "ssubl v29.8h, v29.8b, v10.8b\n"
+    "ssubl v28.8h, v28.8b, v10.8b\n"
+    "bgt 12b\n"
+    "15:"  // Oddments: Planar tail
+    "smlal v27.4s, v4.4h, v16.4h\n"
+    "smlal v26.4s, v3.4h, v16.4h\n"
+    "smlal v25.4s, v2.4h, v16.4h\n"
+    "smlal v24.4s, v1.4h, v16.4h\n"
+    "smlal v23.4s, v0.4h, v16.4h\n"
+    "smlal v22.4s, v31.4h, v16.4h\n"
+    "smlal v21.4s, v30.4h, v16.4h\n"
+    "smlal v20.4s, v29.4h, v16.4h\n"
+    "smlal v19.4s, v28.4h, v16.4h\n"
+    "cbz %x[rq_mul_ptr], 21f\n"
+    "add x21, %x[rq_mul_ptr], x11, LSL #2\n"
+    "add x20, %x[rq_right_shift_ptr], x11, LSL #2\n"
+    "add x19, %x[rq_left_shift_ptr], x11, LSL #2\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v6.d }[0], [x21], #0x8\n"
+    "ld1 { v5.d }[0], [x20], #0x8\n"
+    "cbz %x[rq_left_shift_ptr], 16f\n"
+    "ld1 { v7.d }[0], [x19], #0x8\n"
+    "16:"  // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v6.s }[2], [x21], #0x4\n"
+    "ld1 { v5.s }[2], [x20], #0x4\n"
+    "cbz %x[rq_left_shift_ptr], 17f\n"
+    "ld1 { v7.s }[2], [x19], #0x4\n"
+    "17:"  // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+    "b 20f\n"
+    "18:"  // Oddments: Load quantisation parameters: Bit 1: Unset
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v6.s }[0], [x21], #0x4\n"
+    "ld1 { v5.s }[0], [x20], #0x4\n"
+    "cbz %x[rq_left_shift_ptr], 19f\n"
+    "ld1 { v7.s }[0], [x19], #0x4\n"
+    "19:"  // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+
+    "20:"  // Oddments: Load quantisation parameters: Bit 1: End
+
+    "21:"  // Oddments: Load quantisation parameters: Done
+    "sshl v27.4s, v27.4s, v7.4s\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    "add x27, x27, x11\n"
+    "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "sshl v26.4s, v26.4s, v7.4s\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    "add x26, x26, x11\n"
+    "sshl v25.4s, v25.4s, v7.4s\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    "sshl v24.4s, v24.4s, v7.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x25, x25, x11\n"
+    "and v16.16b, v27.16b, v5.16b\n"
+    "add x24, x24, x11\n"
+    "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+    "add x23, x23, x11\n"
+    "sqrdmulh v25.4s, v25.4s, v6.4s\n"
+    "add x22, x22, x11\n"
+    "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+    "add x21, x21, x11\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add x20, x20, x11\n"
+    "and v18.16b, v26.16b, v5.16b\n"
+    "add x19, x19, x11\n"
+    "and v17.16b, v25.16b, v5.16b\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v24.16b, v5.16b\n"
+    "srshl v27.4s, v27.4s, v5.4s\n"
+    "sqadd v26.4s, v26.4s, v18.4s\n"
+    "sqadd v25.4s, v25.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v27.4s, v27.4s, v8.4s\n"
+    "srshl v26.4s, v26.4s, v5.4s\n"
+    "srshl v25.4s, v25.4s, v5.4s\n"
+    "sqadd v24.4s, v24.4s, v16.4s\n"
+    "smax v27.4s, v27.4s, v12.4s\n"
+    "add v26.4s, v26.4s, v8.4s\n"
+    "add v25.4s, v25.4s, v8.4s\n"
+    "srshl v24.4s, v24.4s, v5.4s\n"
+    "smin v27.4s, v27.4s, v11.4s\n"
+    "smax v26.4s, v26.4s, v12.4s\n"
+    "smax v25.4s, v25.4s, v12.4s\n"
+    "add v24.4s, v24.4s, v8.4s\n"
+    "smin v26.4s, v26.4s, v11.4s\n"
+    "smin v25.4s, v25.4s, v11.4s\n"
+    "smax v24.4s, v24.4s, v12.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "smin v24.4s, v24.4s, v11.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sshl v23.4s, v23.4s, v7.4s\n"
+    "sshl v22.4s, v22.4s, v7.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v6.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+    "sshl v21.4s, v21.4s, v7.4s\n"
+    "sshl v20.4s, v20.4s, v7.4s\n"
+    "and v17.16b, v23.16b, v5.16b\n"
+    "and v16.16b, v22.16b, v5.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v20.4s, v20.4s, v6.4s\n"
+    "sqadd v23.4s, v23.4s, v17.4s\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "and v16.16b, v21.16b, v5.16b\n"
+    "and v17.16b, v20.16b, v5.16b\n"
+    "srshl v23.4s, v23.4s, v5.4s\n"
+    "srshl v22.4s, v22.4s, v5.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "add v23.4s, v23.4s, v8.4s\n"
+    "add v22.4s, v22.4s, v8.4s\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "sqadd v20.4s, v20.4s, v17.4s\n"
+    "smax v23.4s, v23.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "srshl v21.4s, v21.4s, v5.4s\n"
+    "srshl v20.4s, v20.4s, v5.4s\n"
+    "smin v23.4s, v23.4s, v11.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "add v21.4s, v21.4s, v8.4s\n"
+    "add v20.4s, v20.4s, v8.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smax v21.4s, v21.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smin v21.4s, v21.4s, v11.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "sshl v19.4s, v19.4s, v7.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v6.4s\n"
+    "and v16.16b, v19.16b, v5.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "srshl v19.4s, v19.4s, v5.4s\n"
+    "add v19.4s, v19.4s, v8.4s\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "st1 { v27.h }[0], [x27], #0x2\n"
+    "st1 { v26.h }[0], [x26], #0x2\n"
+    "st1 { v25.h }[0], [x25], #0x2\n"
+    "st1 { v24.h }[0], [x24], #0x2\n"
+    "st1 { v23.h }[0], [x23], #0x2\n"
+    "st1 { v22.h }[0], [x22], #0x2\n"
+    "st1 { v21.h }[0], [x21], #0x2\n"
+    "st1 { v20.h }[0], [x20], #0x2\n"
+    "st1 { v19.h }[0], [x19], #0x2\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "st1 { v27.b }[2], [x27], #0x1\n"
+    "st1 { v26.b }[2], [x26], #0x1\n"
+    "st1 { v25.b }[2], [x25], #0x1\n"
+    "st1 { v24.b }[2], [x24], #0x1\n"
+    "st1 { v23.b }[2], [x23], #0x1\n"
+    "st1 { v22.b }[2], [x22], #0x1\n"
+    "st1 { v21.b }[2], [x21], #0x1\n"
+    "st1 { v20.b }[2], [x20], #0x1\n"
+    "st1 { v19.b }[2], [x19], #0x1\n"
+    "b 23f\n"
+    "22:"  // Oddments: Store: Bit 1: Unset
+    "tbz %x[n_channels], #0, 23f\n"
+    "st1 { v27.b }[0], [x27], #0x1\n"
+    "st1 { v26.b }[0], [x26], #0x1\n"
+    "st1 { v25.b }[0], [x25], #0x1\n"
+    "st1 { v24.b }[0], [x24], #0x1\n"
+    "st1 { v23.b }[0], [x23], #0x1\n"
+    "st1 { v22.b }[0], [x22], #0x1\n"
+    "st1 { v21.b }[0], [x21], #0x1\n"
+    "st1 { v20.b }[0], [x20], #0x1\n"
+    "st1 { v19.b }[0], [x19], #0x1\n"
+    "23:"  // Oddments: Store: Bit 1: End
+
+    "24:"  // End
+
+    : [params] "+&r" (params)
+    : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
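
The store path in the kernel above is the standard fixed-point requantisation: each 32-bit accumulator is left-shifted (sshl by v7), multiplied by the quantised multiplier with rounding-doubling-high semantics (sqrdmulh by v6), rounding-right-shifted (the and/sshr/sqadd fixup followed by srshl by v5, which holds negative shift amounts), offset by c_offset (v8), clamped to [minval, maxval] (v12/v11), and narrowed to bytes (uzp1). A scalar model of one lane follows; it is a sketch that simplifies the negative-value rounding fixup and ignores saturation of the intermediate left shift.

    // Scalar model of the per-lane requantisation sequence above.
    // Simplified sketch: the NEON code's and/sshr/sqadd fixup rounds
    // negative values slightly differently, and sshl/sqrdmulh saturate.
    #include <algorithm>
    #include <cstdint>

    int8_t requantize_lane(int32_t acc, int32_t mul, int left_shift,
                           int right_shift, int32_t c_offset,
                           int32_t minval, int32_t maxval)
    {
      // sshl by v7 (lane stays 32-bit; saturation ignored in this sketch)
      int32_t shifted = static_cast<int32_t>(static_cast<uint32_t>(acc) << left_shift);
      // sqrdmulh by v6: rounding doubling high multiply, (a*b + 2^30) >> 31
      int64_t v = (static_cast<int64_t>(shifted) * mul + (int64_t(1) << 30)) >> 31;
      if (right_shift > 0)  // srshl by v5: rounding arithmetic right shift
        v = (v + (int64_t(1) << (right_shift - 1))) >> right_shift;
      v += c_offset;                                                 // add v8
      v = std::min<int64_t>(std::max<int64_t>(v, minval), maxval);   // smax/smin
      return static_cast<int8_t>(v);                                 // uzp1 narrowing
    }
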
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000..b9fef4f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 4;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 9;
+  constexpr static unsigned int input_col_quads = 1;
+
+  kern_type kernel = a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+
+  a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
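
The geometry constants above are mutually consistent: a K-tap kernel at stride S producing an O-point output tile consumes K + (O - 1) * S input points per axis, so this strategy needs 3 + (2 - 1) * 2 = 5 input rows and 3 + (4 - 1) * 2 = 9 input columns, matching input_rows and input_cols. A compile-time sketch of that relation, usable as a sanity check against any of these strategy structs:

    // Receptive-field relation for a depthfirst tile:
    //   input = kernel + (output - 1) * stride, per spatial axis.
    template <class Strategy>
    constexpr bool tile_geometry_consistent()
    {
      return Strategy::input_rows == Strategy::kernel_rows
                 + (Strategy::output_rows - 1) * Strategy::stride_rows
          && Strategy::input_cols == Strategy::kernel_cols
                 + (Strategy::output_cols - 1) * Strategy::stride_cols;
    }
    // e.g.
    // static_assert(tile_geometry_consistent<
    //     a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(), "");
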
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..2fb6d35
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+  const int8_t *const *const inptrs,
+  int8_t *const *const outptrs,
+  const void *params,
+  unsigned int n_output_channels,
+  const arm_gemm::Requantize32& qp
+)
+{
+  __asm__ __volatile__(
+    "movi v5.16b, #0x1\n"
+    "ldr x22, [%x[inptrs], #0x0]\n"
+    "add SP, SP, #-0x80\n"
+    "ushr v5.4s, v5.4s, #0x8\n"
+    "ldr x20, [%x[inptrs], #0x8]\n"
+    "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "movi v26.4s, #0x0\n"
+    "ldr x19, [%x[inptrs], #0x10]\n"
+    "mov x11, #0x0\n"
+    "movi v1.4s, #0x0\n"
+    "ld1 { v15.16b }, [x22]\n"
+    "mov x10, #0x0\n"
+    "movi v22.4s, #0x0\n"
+    "ld1 { v29.16b }, [x20]\n"
+    "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "movi v25.4s, #0x0\n"
+    "ld1 { v0.16b }, [x19]\n"
+    "add x28, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "movi v13.4s, #0x0\n"
+    "ldr x20, [%x[inptrs], #0x18]\n"
+    "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "mov v20.16b, v15.16b\n"
+    "ldr x19, [%x[inptrs], #0x20]\n"
+    "cmp %x[n_channels], #0x4\n"
+    "ext v20.16b, v20.16b, v20.16b, #0x2\n"
+    "ld1r { v4.4s }, [x21]\n"
+    "mov v17.16b, v15.16b\n"
+    "ld1 { v2.16b }, [x20]\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+    "ld1 { v7.16b }, [x19]\n"
+    "mov v23.16b, v15.16b\n"
+    "ldp x26, x25, [%x[outptrs], #0x0]\n"
+    "ext v23.16b, v23.16b, v23.16b, #0x6\n"
+    "ldp x24, x23, [%x[outptrs], #0x10]\n"
+    "mov v18.16b, v29.16b\n"
+    "ldp x22, x21, [%x[outptrs], #0x20]\n"
+    "zip1 v15.4s, v15.4s, v17.4s\n"
+    "ldp x20, x19, [%x[outptrs], #0x30]\n"
+    "ext v18.16b, v18.16b, v18.16b, #0x2\n"
+    "ld1r { v14.4s }, [x9]\n"
+    "zip1 v20.4s, v20.4s, v23.4s\n"
+    "ld1r { v27.4s }, [x28]\n"
+    "zip1 v15.4s, v15.4s, v20.4s\n"
+    "ld1r { v23.4s }, [x27]\n"
+    "mov v17.16b, v29.16b\n"
+    "ldr q6, [%x[params], #0x0]\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+    "ldr q8, [%x[params], #0x10]\n"
+    "mov v11.16b, v29.16b\n"
+    "ldr q9, [%x[params], #0x20]\n"
+    "ext v11.16b, v11.16b, v11.16b, #0x6\n"
+    "ldr q10, [%x[params], #0x30]\n"
+    "add %x[params], %x[params], #0x40\n"
+    "zip1 v29.4s, v29.4s, v17.4s\n"
+    "mov v12.16b, v0.16b\n"
+    "ext v12.16b, v12.16b, v12.16b, #0x2\n"
+    "zip1 v18.4s, v18.4s, v11.4s\n"
+    "zip1 v29.4s, v29.4s, v18.4s\n"
+    "mov v17.16b, v0.16b\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+    "mov v11.16b, v0.16b\n"
+    "ext v11.16b, v11.16b, v11.16b, #0x6\n"
+    "mov v18.16b, v2.16b\n"
+    "zip1 v0.4s, v0.4s, v17.4s\n"
+    "ext v18.16b, v18.16b, v18.16b, #0x2\n"
+    "zip1 v12.4s, v12.4s, v11.4s\n"
+    "zip1 v0.4s, v0.4s, v12.4s\n"
+    "mov v17.16b, v2.16b\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+    "mov v19.16b, v2.16b\n"
+    "ext v19.16b, v19.16b, v19.16b, #0x6\n"
+    "mov v28.16b, v7.16b\n"
+    "zip1 v2.4s, v2.4s, v17.4s\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+    "zip1 v18.4s, v18.4s, v19.4s\n"
+    "zip1 v2.4s, v2.4s, v18.4s\n"
+    "mov v18.16b, v7.16b\n"
+    "ext v18.16b, v18.16b, v18.16b, #0x4\n"
+    "mov v21.16b, v7.16b\n"
+    "ext v21.16b, v21.16b, v21.16b, #0x6\n"
+    "movi v30.4s, #0x0\n"
+    "zip1 v7.4s, v7.4s, v18.4s\n"
+    "movi v3.4s, #0x0\n"
+    "zip1 v28.4s, v28.4s, v21.4s\n"
+    "zip1 v7.4s, v7.4s, v28.4s\n"
+    "movi v12.4s, #0x0\n"
+    "movi v11.4s, #0x0\n"
+    "movi v19.4s, #0x0\n"
+    "movi v21.4s, #0x0\n"
+    "movi v17.4s, #0x0\n"
+    "movi v16.4s, #0x0\n"
+    "movi v28.4s, #0x0\n"
+    "movi v18.4s, #0x0\n"
+    "movi v20.4s, #0x0\n"
+    "movi v24.4s, #0x0\n"
+    "movi v31.4s, #0x0\n"
+    ".inst 0x4f8fe0ba  // sdot v26.4s, v5.16b, v15.4b[0]\n"
+    ".inst 0x4fafe0a1  // sdot v1.4s, v5.16b, v15.4b[1]\n"
+    ".inst 0x4f8fe8b6  // sdot v22.4s, v5.16b, v15.4b[2]\n"
+    ".inst 0x4fafe8b9  // sdot v25.4s, v5.16b, v15.4b[3]\n"
+    ".inst 0x4f9de0ad  // sdot v13.4s, v5.16b, v29.4b[0]\n"
+    ".inst 0x4fbde0be  // sdot v30.4s, v5.16b, v29.4b[1]\n"
+    ".inst 0x4f9de8a3  // sdot v3.4s, v5.16b, v29.4b[2]\n"
+    ".inst 0x4fbde8ac  // sdot v12.4s, v5.16b, v29.4b[3]\n"
+    ".inst 0x4f80e0ab  // sdot v11.4s, v5.16b, v0.4b[0]\n"
+    ".inst 0x4fa0e0b3  // sdot v19.4s, v5.16b, v0.4b[1]\n"
+    ".inst 0x4f80e8b5  // sdot v21.4s, v5.16b, v0.4b[2]\n"
+    ".inst 0x4fa0e8b1  // sdot v17.4s, v5.16b, v0.4b[3]\n"
+    ".inst 0x4f82e0b0  // sdot v16.4s, v5.16b, v2.4b[0]\n"
+    ".inst 0x4fa2e0bc  // sdot v28.4s, v5.16b, v2.4b[1]\n"
+    ".inst 0x4f82e8b2  // sdot v18.4s, v5.16b, v2.4b[2]\n"
+    ".inst 0x4fa2e8b4  // sdot v20.4s, v5.16b, v2.4b[3]\n"
+    ".inst 0x4f87e0b8  // sdot v24.4s, v5.16b, v7.4b[0]\n"
+    ".inst 0x4fa7e0bf  // sdot v31.4s, v5.16b, v7.4b[1]\n"
+    "mov v26.16b, v26.16b\n"
+    "mov v1.16b, v1.16b\n"
+    "mov v22.16b, v22.16b\n"
+    "mov v25.16b, v25.16b\n"
+    "add v26.4s, v26.4s, v13.4s\n"
+    "movi v13.4s, #0x0\n"
+    ".inst 0x4f87e8ad  // sdot v13.4s, v5.16b, v7.4b[2]\n"
+    "add v1.4s, v1.4s, v30.4s\n"
+    "movi v30.4s, #0x0\n"
+    ".inst 0x4fa7e8be  // sdot v30.4s, v5.16b, v7.4b[3]\n"
+    "add v22.4s, v22.4s, v3.4s\n"
+    "add v25.4s, v25.4s, v12.4s\n"
+    "add v26.4s, v26.4s, v11.4s\n"
+    "add v1.4s, v1.4s, v19.4s\n"
+    "add v22.4s, v22.4s, v21.4s\n"
+    "add v25.4s, v25.4s, v17.4s\n"
+    "mov v11.16b, v11.16b\n"
+    "mov v3.16b, v19.16b\n"
+    "mov v19.16b, v21.16b\n"
+    "mov v21.16b, v17.16b\n"
+    "add v11.4s, v11.4s, v16.4s\n"
+    "add v3.4s, v3.4s, v28.4s\n"
+    "add v19.4s, v19.4s, v18.4s\n"
+    "add v21.4s, v21.4s, v20.4s\n"
+    "add v11.4s, v11.4s, v24.4s\n"
+    "add v3.4s, v3.4s, v31.4s\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "add v21.4s, v21.4s, v30.4s\n"
+    "neg v4.4s, v4.4s\n"
+    "mul v26.4s, v26.4s, v4.4s\n"
+    "str q26, [SP, #0x0]\n"
+    "mul v1.4s, v1.4s, v4.4s\n"
+    "mul v22.4s, v22.4s, v4.4s\n"
+    "str q1, [SP, #0x10]\n"
+    "mul v25.4s, v25.4s, v4.4s\n"
+    "mul v11.4s, v11.4s, v4.4s\n"
+    "str q22, [SP, #0x20]\n"
+    "mul v3.4s, v3.4s, v4.4s\n"
+    "str q25, [SP, #0x30]\n"
+    "mul v19.4s, v19.4s, v4.4s\n"
+    "mul v21.4s, v21.4s, v4.4s\n"
+    "str q11, [SP, #0x40]\n"
+    "add v26.4s, v26.4s, v6.4s\n"
+    "str q3, [SP, #0x50]\n"
+    "add v1.4s, v1.4s, v6.4s\n"
+    "str q19, [SP, #0x60]\n"
+    "add v22.4s, v22.4s, v6.4s\n"
+    "add v25.4s, v25.4s, v6.4s\n"
+    "str q21, [SP, #0x70]\n"
+    "add v11.4s, v11.4s, v6.4s\n"
+    "add v3.4s, v3.4s, v6.4s\n"
+    "add v19.4s, v19.4s, v6.4s\n"
+    "add v21.4s, v21.4s, v6.4s\n"
+    "ble 2f\n"
+    "1:"  // Loop
+    ".inst 0x4f8fe11a  // sdot v26.4s, v8.16b, v15.4b[0]\n"
+    "ldr q20, [%x[params], #0x0]\n"
+    "add x11, x11, #0x10\n"
+    ".inst 0x4fafe101  // sdot v1.4s, v8.16b, v15.4b[1]\n"
+    "ldr q4, [%x[params], #0x10]\n"
+    "sub %x[n_channels], %x[n_channels], #0x4\n"
+    ".inst 0x4f8fe916  // sdot v22.4s, v8.16b, v15.4b[2]\n"
+    "ldr q6, [%x[params], #0x20]\n"
+    "cmp %x[n_channels], #0x4\n"
+    ".inst 0x4fafe919  // sdot v25.4s, v8.16b, v15.4b[3]\n"
+    ".inst 0x4f80e10b  // sdot v11.4s, v8.16b, v0.4b[0]\n"
+    ".inst 0x4fa0e103  // sdot v3.4s, v8.16b, v0.4b[1]\n"
+    ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
+    ".inst 0x4fa0e915  // sdot v21.4s, v8.16b, v0.4b[3]\n"
+    "ldr q8, [%x[params], #0x30]\n"
+    ".inst 0x4f9de13a  // sdot v26.4s, v9.16b, v29.4b[0]\n"
+    ".inst 0x4fbde121  // sdot v1.4s, v9.16b, v29.4b[1]\n"
+    ".inst 0x4f9de936  // sdot v22.4s, v9.16b, v29.4b[2]\n"
+    ".inst 0x4fbde939  // sdot v25.4s, v9.16b, v29.4b[3]\n"
+    ".inst 0x4f82e12b  // sdot v11.4s, v9.16b, v2.4b[0]\n"
+    ".inst 0x4fa2e123  // sdot v3.4s, v9.16b, v2.4b[1]\n"
+    ".inst 0x4f82e933  // sdot v19.4s, v9.16b, v2.4b[2]\n"
+    ".inst 0x4fa2e935  // sdot v21.4s, v9.16b, v2.4b[3]\n"
+    "ldr q9, [%x[params], #0x40]\n"
+    ".inst 0x4f80e15a  // sdot v26.4s, v10.16b, v0.4b[0]\n"
+    ".inst 0x4fa0e141  // sdot v1.4s, v10.16b, v0.4b[1]\n"
+    ".inst 0x4f80e956  // sdot v22.4s, v10.16b, v0.4b[2]\n"
+    ".inst 0x4fa0e959  // sdot v25.4s, v10.16b, v0.4b[3]\n"
+    ".inst 0x4f87e14b  // sdot v11.4s, v10.16b, v7.4b[0]\n"
+    ".inst 0x4fa7e143  // sdot v3.4s, v10.16b, v7.4b[1]\n"
+    ".inst 0x4f87e953  // sdot v19.4s, v10.16b, v7.4b[2]\n"
+    ".inst 0x4fa7e955  // sdot v21.4s, v10.16b, v7.4b[3]\n"
+    "ldr q10, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+    "sqrdmulh v1.4s, v1.4s, v20.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+    "sqrdmulh v11.4s, v11.4s, v20.4s\n"
+    "and v30.16b, v26.16b, v4.16b\n"
+    "and v17.16b, v1.16b, v4.16b\n"
+    "and v16.16b, v22.16b, v4.16b\n"
+    "sshr v30.4s, v30.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v26.4s, v26.4s, v30.4s\n"
+    "sqadd v1.4s, v1.4s, v17.4s\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "and v16.16b, v25.16b, v4.16b\n"
+    "srshl v26.4s, v26.4s, v4.4s\n"
+    "srshl v1.4s, v1.4s, v4.4s\n"
+    "srshl v22.4s, v22.4s, v4.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v26.4s, v26.4s, v14.4s\n"
+    "add v1.4s, v1.4s, v14.4s\n"
+    "add v22.4s, v22.4s, v14.4s\n"
+    "smin v26.4s, v26.4s, v23.4s\n"
+    "smin v1.4s, v1.4s, v23.4s\n"
+    "smin v22.4s, v22.4s, v23.4s\n"
+    "smax v26.4s, v26.4s, v27.4s\n"
+    "smax v1.4s, v1.4s, v27.4s\n"
+    "smax v22.4s, v22.4s, v27.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v1.16b, v1.16b, v1.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x26, x10]\n"
+    "uzp1 v1.16b, v1.16b, v1.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "ldr q26, [SP, #0x0]\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "str s1, [x25, x10]\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "ldr q1, [SP, #0x10]\n"
+    "and v16.16b, v11.16b, v4.16b\n"
+    "str s22, [x24, x10]\n"
+    "sqrdmulh v3.4s, v3.4s, v20.4s\n"
+    "ldr q22, [SP, #0x20]\n"
+    "srshl v25.4s, v25.4s, v4.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+    "and v17.16b, v3.16b, v4.16b\n"
+    "add v25.4s, v25.4s, v14.4s\n"
+    "sqadd v11.4s, v11.4s, v16.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v25.4s, v25.4s, v23.4s\n"
+    "and v16.16b, v19.16b, v4.16b\n"
+    "srshl v11.4s, v11.4s, v4.4s\n"
+    "smax v25.4s, v25.4s, v27.4s\n"
+    "sqadd v3.4s, v3.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v11.4s, v11.4s, v14.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x23, x10]\n"
+    "smin v11.4s, v11.4s, v23.4s\n"
+    "srshl v3.4s, v3.4s, v4.4s\n"
+    "ldr q25, [SP, #0x30]\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+    "smax v11.4s, v11.4s, v27.4s\n"
+    "add v3.4s, v3.4s, v14.4s\n"
+    "srshl v19.4s, v19.4s, v4.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "smin v3.4s, v3.4s, v23.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "str s11, [x22, x10]\n"
+    "smax v3.4s, v3.4s, v27.4s\n"
+    "add v19.4s, v19.4s, v14.4s\n"
+    "ldr q11, [SP, #0x40]\n"
+    "and v16.16b, v21.16b, v4.16b\n"
+    "add v26.4s, v26.4s, v6.4s\n"
+    "uzp1 v3.16b, v3.16b, v3.16b\n"
+    "smin v19.4s, v19.4s, v23.4s\n"
+    "uzp1 v3.16b, v3.16b, v3.16b\n"
+    "str s3, [x21, x10]\n"
+    "smax v19.4s, v19.4s, v27.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr q3, [SP, #0x50]\n"
+    "add v1.4s, v1.4s, v6.4s\n"
+    "add v22.4s, v22.4s, v6.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x20, x10]\n"
+    "add v25.4s, v25.4s, v6.4s\n"
+    "add v11.4s, v11.4s, v6.4s\n"
+    "ldr q19, [SP, #0x60]\n"
+    "srshl v21.4s, v21.4s, v4.4s\n"
+    "add v3.4s, v3.4s, v6.4s\n"
+    "add v21.4s, v21.4s, v14.4s\n"
+    "add v19.4s, v19.4s, v6.4s\n"
+    "smin v21.4s, v21.4s, v23.4s\n"
+    "smax v21.4s, v21.4s, v27.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x19, x10]\n"
+    "add x10, x10, #0x4\n"
+    "ldr q21, [SP, #0x70]\n"
+    "add v21.4s, v21.4s, v6.4s\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    ".inst 0x4f8fe11a  // sdot v26.4s, v8.16b, v15.4b[0]\n"
+    "ldr q20, [%x[params], #0x0]\n"
+    "add x26, x26, x10\n"
+    ".inst 0x4fafe101  // sdot v1.4s, v8.16b, v15.4b[1]\n"
+    "ldr q4, [%x[params], #0x10]\n"
+    "add x25, x25, x10\n"
+    ".inst 0x4f8fe916  // sdot v22.4s, v8.16b, v15.4b[2]\n"
+    "add x24, x24, x10\n"
+    ".inst 0x4fafe919  // sdot v25.4s, v8.16b, v15.4b[3]\n"
+    "add x23, x23, x10\n"
+    ".inst 0x4f80e10b  // sdot v11.4s, v8.16b, v0.4b[0]\n"
+    "add x22, x22, x10\n"
+    ".inst 0x4fa0e103  // sdot v3.4s, v8.16b, v0.4b[1]\n"
+    "add x21, x21, x10\n"
+    ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
+    "add x20, x20, x10\n"
+    ".inst 0x4fa0e915  // sdot v21.4s, v8.16b, v0.4b[3]\n"
+    "add x19, x19, x10\n"
+    ".inst 0x4f9de13a  // sdot v26.4s, v9.16b, v29.4b[0]\n"
+    "cmp %x[n_channels], #0x4\n"
+    ".inst 0x4fbde121  // sdot v1.4s, v9.16b, v29.4b[1]\n"
+    "add %x[params], %x[params], #0x20\n"
+    ".inst 0x4f9de936  // sdot v22.4s, v9.16b, v29.4b[2]\n"
+    ".inst 0x4fbde939  // sdot v25.4s, v9.16b, v29.4b[3]\n"
+    ".inst 0x4f82e12b  // sdot v11.4s, v9.16b, v2.4b[0]\n"
+    ".inst 0x4fa2e123  // sdot v3.4s, v9.16b, v2.4b[1]\n"
+    ".inst 0x4f82e933  // sdot v19.4s, v9.16b, v2.4b[2]\n"
+    ".inst 0x4fa2e935  // sdot v21.4s, v9.16b, v2.4b[3]\n"
+    ".inst 0x4f80e15a  // sdot v26.4s, v10.16b, v0.4b[0]\n"
+    ".inst 0x4fa0e141  // sdot v1.4s, v10.16b, v0.4b[1]\n"
+    ".inst 0x4f80e956  // sdot v22.4s, v10.16b, v0.4b[2]\n"
+    ".inst 0x4fa0e959  // sdot v25.4s, v10.16b, v0.4b[3]\n"
+    ".inst 0x4f87e14b  // sdot v11.4s, v10.16b, v7.4b[0]\n"
+    ".inst 0x4fa7e143  // sdot v3.4s, v10.16b, v7.4b[1]\n"
+    ".inst 0x4f87e953  // sdot v19.4s, v10.16b, v7.4b[2]\n"
+    ".inst 0x4fa7e955  // sdot v21.4s, v10.16b, v7.4b[3]\n"
+    "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+    "sqrdmulh v1.4s, v1.4s, v20.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+    "and v30.16b, v26.16b, v4.16b\n"
+    "and v17.16b, v1.16b, v4.16b\n"
+    "and v16.16b, v22.16b, v4.16b\n"
+    "sshr v30.4s, v30.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v26.4s, v26.4s, v30.4s\n"
+    "sqadd v1.4s, v1.4s, v17.4s\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "and v16.16b, v25.16b, v4.16b\n"
+    "srshl v26.4s, v26.4s, v4.4s\n"
+    "srshl v1.4s, v1.4s, v4.4s\n"
+    "srshl v22.4s, v22.4s, v4.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v26.4s, v26.4s, v14.4s\n"
+    "add v1.4s, v1.4s, v14.4s\n"
+    "add v22.4s, v22.4s, v14.4s\n"
+    "smin v26.4s, v26.4s, v23.4s\n"
+    "smin v1.4s, v1.4s, v23.4s\n"
+    "smin v22.4s, v22.4s, v23.4s\n"
+    "smax v26.4s, v26.4s, v27.4s\n"
+    "smax v1.4s, v1.4s, v27.4s\n"
+    "smax v22.4s, v22.4s, v27.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v1.16b, v1.16b, v1.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v1.16b, v1.16b, v1.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqrdmulh v11.4s, v11.4s, v20.4s\n"
+    "sqrdmulh v3.4s, v3.4s, v20.4s\n"
+    "srshl v25.4s, v25.4s, v4.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+    "and v16.16b, v11.16b, v4.16b\n"
+    "and v17.16b, v3.16b, v4.16b\n"
+    "add v25.4s, v25.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v25.4s, v25.4s, v23.4s\n"
+    "sqadd v11.4s, v11.4s, v16.4s\n"
+    "sqadd v3.4s, v3.4s, v17.4s\n"
+    "smax v25.4s, v25.4s, v27.4s\n"
+    "and v16.16b, v19.16b, v4.16b\n"
+    "srshl v11.4s, v11.4s, v4.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "srshl v3.4s, v3.4s, v4.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v11.4s, v11.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v3.4s, v3.4s, v14.4s\n"
+    "smin v11.4s, v11.4s, v23.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smin v3.4s, v3.4s, v23.4s\n"
+    "smax v11.4s, v11.4s, v27.4s\n"
+    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+    "smax v3.4s, v3.4s, v27.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "srshl v19.4s, v19.4s, v4.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "uzp1 v3.16b, v3.16b, v3.16b\n"
+    "and v16.16b, v21.16b, v4.16b\n"
+    "uzp1 v3.16b, v3.16b, v3.16b\n"
+    "add v19.4s, v19.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v19.4s, v19.4s, v23.4s\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "smax v19.4s, v19.4s, v27.4s\n"
+    "srshl v21.4s, v21.4s, v4.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "add v21.4s, v21.4s, v14.4s\n"
+    "smin v21.4s, v21.4s, v23.4s\n"
+    "smax v21.4s, v21.4s, v27.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "blt 3f\n"
+    "str s26, [x26, #0x0]\n"
+    "str s1, [x25, #0x0]\n"
+    "str s22, [x24, #0x0]\n"
+    "str s25, [x23, #0x0]\n"
+    "str s11, [x22, #0x0]\n"
+    "str s3, [x21, #0x0]\n"
+    "str s19, [x20, #0x0]\n"
+    "str s21, [x19, #0x0]\n"
+    "b 4f\n"
+    "3:"  // Tail: Oddments
+    "st1 { v26.b }[0], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v1.b }[0], [x25], #0x1\n"
+    "st1 { v22.b }[0], [x24], #0x1\n"
+    "st1 { v25.b }[0], [x23], #0x1\n"
+    "st1 { v11.b }[0], [x22], #0x1\n"
+    "st1 { v3.b }[0], [x21], #0x1\n"
+    "st1 { v19.b }[0], [x20], #0x1\n"
+    "st1 { v21.b }[0], [x19], #0x1\n"
+    "beq 4f\n"
+    "st1 { v26.b }[1], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v1.b }[1], [x25], #0x1\n"
+    "st1 { v22.b }[1], [x24], #0x1\n"
+    "st1 { v25.b }[1], [x23], #0x1\n"
+    "st1 { v11.b }[1], [x22], #0x1\n"
+    "st1 { v3.b }[1], [x21], #0x1\n"
+    "st1 { v19.b }[1], [x20], #0x1\n"
+    "st1 { v21.b }[1], [x19], #0x1\n"
+    "beq 4f\n"
+    "st1 { v26.b }[2], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v1.b }[2], [x25], #0x1\n"
+    "st1 { v22.b }[2], [x24], #0x1\n"
+    "st1 { v25.b }[2], [x23], #0x1\n"
+    "st1 { v11.b }[2], [x22], #0x1\n"
+    "st1 { v3.b }[2], [x21], #0x1\n"
+    "st1 { v19.b }[2], [x20], #0x1\n"
+    "st1 { v21.b }[2], [x19], #0x1\n"
+    "beq 4f\n"
+    "st1 { v26.b }[3], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v1.b }[3], [x25], #0x1\n"
+    "st1 { v22.b }[3], [x24], #0x1\n"
+    "st1 { v25.b }[3], [x23], #0x1\n"
+    "st1 { v11.b }[3], [x22], #0x1\n"
+    "st1 { v3.b }[3], [x21], #0x1\n"
+    "st1 { v19.b }[3], [x20], #0x1\n"
+    "st1 { v21.b }[3], [x19], #0x1\n"
+    "4:"  // Tail: End
+    "add SP, SP, #0x80\n"
+    : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
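
The long sdot preamble at the top of this kernel computes the weight-offset correction, not the convolution itself: dotting each packed input quad against a vector of ones yields a per-output-point input sum, which is then scaled by -b_offset (neg v4 followed by mul) and folded into the bias loaded from params (add with v6). Note that v5 is built as 0x00010101 per 32-bit lane (movi #0x1 then ushr by 8), apparently so each sdot sums only three bytes of a quad, consistent with the three taps per row of a 3x3 kernel. In the quantised domain, the sum Σ(x - a_off)(w - b_off) expands so that the -b_off·Σx term can be hoisted out exactly like this; the a_off·Σw and constant terms are assumed to be baked into the packed bias at weight-pack time. A scalar sketch:

    // Scalar sketch of the correction the sdot preamble performs.
    // Assumption (consistent with, but not shown by, the packed-parameter
    // format): packed_bias already absorbs the a_offset and constant terms.
    #include <cstdint>

    int32_t corrected_accumulator(const int8_t *x, const int8_t *w, int n,
                                  int32_t b_offset, int32_t packed_bias)
    {
      int32_t dot = 0, sum_x = 0;
      for (int i = 0; i < n; i++) {
        dot   += int32_t(x[i]) * int32_t(w[i]);  // the sdot convolution proper
        sum_x += x[i];                           // sdot against a vector of ones
      }
      // (x - a_off).(w - b_off) == x.w - b_off*sum(x) + terms folded into bias
      return packed_bias + dot - b_offset * sum_x;
    }
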
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000..9a3eed4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 8;
+  constexpr static unsigned int input_cols = 6;
+  constexpr static unsigned int input_col_quads = 1;
+
+  kern_type kernel = a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+
+  a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
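
As with the 3x3/s2 variant, "with_multiplier" in these packed kernels denotes a depth multiplier greater than one: each input channel feeds several output channels, which is also why the implementation signature takes n_output_channels rather than a plain channel count. The conventional index mapping is sketched below; the helper is illustrative, not a library API.

    // Standard depthwise depth-multiplier indexing (illustrative helper,
    // not a library API): output channel o is driven by input channel o / m
    // using weight set o % m, for depth multiplier m >= 1.
    struct DepthMultiplierMap {
      unsigned int multiplier;
      unsigned int input_channel(unsigned int out_ch) const { return out_ch / multiplier; }
      unsigned int weight_set(unsigned int out_ch) const { return out_ch % multiplier; }
    };
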
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..95ad78c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,662 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+  const int8_t *const *const inptrs,
+  int8_t *const *const outptrs,
+  const void *params,
+  unsigned int n_output_channels,
+  const arm_gemm::Requantize32& qp
+)
+{
+  __asm__ __volatile__(
+    "movi v15.16b, #0x1\n"
+    "ldr x21, [%x[inptrs], #0x0]\n"
+    "add SP, SP, #-0x80\n"
+    "movi v14.4s, #0x1\n"
+    "ldr x20, [%x[inptrs], #0x8]\n"
+    "add x22, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "movi v28.4s, #0x0\n"
+    "ldr x19, [%x[inptrs], #0x10]\n"
+    "mov x11, #0x0\n"
+    "movi v27.4s, #0x0\n"
+    "ld1 { v13.16b }, [x21]\n"
+    "mov x10, #0x0\n"
+    "movi v26.4s, #0x0\n"
+    "ld1 { v12.16b }, [x20]\n"
+    "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "movi v25.4s, #0x0\n"
+    "ld1 { v7.16b }, [x19]\n"
+    "add x28, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "movi v24.4s, #0x0\n"
+    "ldr x21, [%x[inptrs], #0x18]\n"
+    "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "mov v18.16b, v13.16b\n"
+    "ldr x20, [%x[inptrs], #0x20]\n"
+    "cmp %x[n_channels], #0x4\n"
+    "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+    "ldr x19, [%x[inptrs], #0x28]\n"
+    "mov v17.16b, v12.16b\n"
+    "ld1 { v6.16b }, [x21]\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+    "ld1 { v5.16b }, [x20]\n"
+    "mov v16.16b, v7.16b\n"
+    "ld1 { v4.16b }, [x19]\n"
+    "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+    "ldr x20, [%x[inptrs], #0x30]\n"
+    "zip1 v13.2d, v13.2d, v18.2d\n"
+    "ldr x19, [%x[inptrs], #0x38]\n"
+    "zip1 v12.2d, v12.2d, v17.2d\n"
+    "ld1r { v3.4s }, [x22]\n"
+    "mov v18.16b, v6.16b\n"
+    "ld1 { v2.16b }, [x20]\n"
+    "zip1 v7.2d, v7.2d, v16.2d\n"
+    "ld1 { v1.16b }, [x19]\n"
+    "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+    "ldp x26, x25, [%x[outptrs], #0x0]\n"
+    "mov v17.16b, v5.16b\n"
+    "ldp x24, x23, [%x[outptrs], #0x10]\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+    "ldp x22, x21, [%x[outptrs], #0x20]\n"
+    "mov v16.16b, v4.16b\n"
+    "ldp x20, x19, [%x[outptrs], #0x30]\n"
+    "zip1 v6.2d, v6.2d, v18.2d\n"
+    "ld1r { v0.4s }, [x9]\n"
+    "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+    "ld1r { v31.4s }, [x28]\n"
+    "zip1 v5.2d, v5.2d, v17.2d\n"
+    "ld1r { v30.4s }, [x27]\n"
+    "mov v17.16b, v2.16b\n"
+    "ldr q29, [%x[params], #0x0]\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+    "ldr q8, [%x[params], #0x10]\n"
+    "zip1 v4.2d, v4.2d, v16.2d\n"
+    "ldr q9, [%x[params], #0x20]\n"
+    "mov v16.16b, v1.16b\n"
+    "ldr q10, [%x[params], #0x30]\n"
+    "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+    "ldr q11, [%x[params], #0x40]\n"
+    "add %x[params], %x[params], #0x50\n"
+    "zip1 v2.2d, v2.2d, v17.2d\n"
+    "movi v23.4s, #0x0\n"
+    "movi v22.4s, #0x0\n"
+    "zip1 v1.2d, v1.2d, v16.2d\n"
+    "movi v21.4s, #0x0\n"
+    "movi v18.4s, #0x0\n"
+    "movi v17.4s, #0x0\n"
+    "movi v16.4s, #0x0\n"
+    "movi v20.4s, #0x0\n"
+    "movi v19.4s, #0x0\n"
+    ".inst 0x4f8de1fc  // sdot v28.4s, v15.16b, v13.4b[0]\n"
+    ".inst 0x4f8de9fb  // sdot v27.4s, v15.16b, v13.4b[2]\n"
+    ".inst 0x4f8ce1fa  // sdot v26.4s, v15.16b, v12.4b[0]\n"
+    ".inst 0x4f8ce9f9  // sdot v25.4s, v15.16b, v12.4b[2]\n"
+    ".inst 0x4fade1dc  // sdot v28.4s, v14.16b, v13.4b[1]\n"
+    ".inst 0x4fade9db  // sdot v27.4s, v14.16b, v13.4b[3]\n"
+    ".inst 0x4face1da  // sdot v26.4s, v14.16b, v12.4b[1]\n"
+    ".inst 0x4face9d9  // sdot v25.4s, v14.16b, v12.4b[3]\n"
+    ".inst 0x4f87e1f8  // sdot v24.4s, v15.16b, v7.4b[0]\n"
+    ".inst 0x4f87e9f7  // sdot v23.4s, v15.16b, v7.4b[2]\n"
+    ".inst 0x4f86e1f6  // sdot v22.4s, v15.16b, v6.4b[0]\n"
+    ".inst 0x4f86e9f5  // sdot v21.4s, v15.16b, v6.4b[2]\n"
+    ".inst 0x4fa7e1d8  // sdot v24.4s, v14.16b, v7.4b[1]\n"
+    ".inst 0x4fa7e9d7  // sdot v23.4s, v14.16b, v7.4b[3]\n"
+    ".inst 0x4fa6e1d6  // sdot v22.4s, v14.16b, v6.4b[1]\n"
+    ".inst 0x4fa6e9d5  // sdot v21.4s, v14.16b, v6.4b[3]\n"
+    ".inst 0x4f85e1f2  // sdot v18.4s, v15.16b, v5.4b[0]\n"
+    ".inst 0x4f85e9f1  // sdot v17.4s, v15.16b, v5.4b[2]\n"
+    ".inst 0x4f84e1f0  // sdot v16.4s, v15.16b, v4.4b[0]\n"
+    ".inst 0x4f84e9f4  // sdot v20.4s, v15.16b, v4.4b[2]\n"
+    ".inst 0x4fa5e1d2  // sdot v18.4s, v14.16b, v5.4b[1]\n"
+    ".inst 0x4fa5e9d1  // sdot v17.4s, v14.16b, v5.4b[3]\n"
+    ".inst 0x4fa4e1d0  // sdot v16.4s, v14.16b, v4.4b[1]\n"
+    ".inst 0x4fa4e9d4  // sdot v20.4s, v14.16b, v4.4b[3]\n"
+    ".inst 0x4f82e1f3  // sdot v19.4s, v15.16b, v2.4b[0]\n"
+    "mov v28.16b, v28.16b\n"
+    "mov v27.16b, v27.16b\n"
+    "add v28.4s, v28.4s, v26.4s\n"
+    ".inst 0x4fa2e1d3  // sdot v19.4s, v14.16b, v2.4b[1]\n"
+    "add v27.4s, v27.4s, v25.4s\n"
+    "add v28.4s, v28.4s, v24.4s\n"
+    "mov v26.16b, v26.16b\n"
+    "add v27.4s, v27.4s, v23.4s\n"
+    "add v28.4s, v28.4s, v22.4s\n"
+    "mov v25.16b, v25.16b\n"
+    "add v27.4s, v27.4s, v21.4s\n"
+    "add v28.4s, v28.4s, v18.4s\n"
+    "add v26.4s, v26.4s, v24.4s\n"
+    "add v27.4s, v27.4s, v17.4s\n"
+    "add v25.4s, v25.4s, v23.4s\n"
+    "add v26.4s, v26.4s, v22.4s\n"
+    "mov v24.16b, v24.16b\n"
+    "add v25.4s, v25.4s, v21.4s\n"
+    "add v26.4s, v26.4s, v18.4s\n"
+    "mov v23.16b, v23.16b\n"
+    "add v25.4s, v25.4s, v17.4s\n"
+    "add v26.4s, v26.4s, v16.4s\n"
+    "add v24.4s, v24.4s, v22.4s\n"
+    "add v25.4s, v25.4s, v20.4s\n"
+    "add v23.4s, v23.4s, v21.4s\n"
+    "add v24.4s, v24.4s, v18.4s\n"
+    "mov v22.16b, v22.16b\n"
+    "add v23.4s, v23.4s, v17.4s\n"
+    "add v24.4s, v24.4s, v16.4s\n"
+    "mov v21.16b, v21.16b\n"
+    "add v23.4s, v23.4s, v20.4s\n"
+    "add v24.4s, v24.4s, v19.4s\n"
+    "add v22.4s, v22.4s, v18.4s\n"
+    "movi v18.4s, #0x0\n"
+    ".inst 0x4f82e9f2  // sdot v18.4s, v15.16b, v2.4b[2]\n"
+    "add v21.4s, v21.4s, v17.4s\n"
+    "movi v17.4s, #0x0\n"
+    ".inst 0x4f81e1f1  // sdot v17.4s, v15.16b, v1.4b[0]\n"
+    ".inst 0x4fa2e9d2  // sdot v18.4s, v14.16b, v2.4b[3]\n"
+    "add v22.4s, v22.4s, v16.4s\n"
+    "movi v16.4s, #0x0\n"
+    ".inst 0x4fa1e1d1  // sdot v17.4s, v14.16b, v1.4b[1]\n"
+    ".inst 0x4f81e9f0  // sdot v16.4s, v15.16b, v1.4b[2]\n"
+    "add v23.4s, v23.4s, v18.4s\n"
+    "add v21.4s, v21.4s, v20.4s\n"
+    "add v22.4s, v22.4s, v19.4s\n"
+    ".inst 0x4fa1e9d0  // sdot v16.4s, v14.16b, v1.4b[3]\n"
+    "add v21.4s, v21.4s, v18.4s\n"
+    "add v22.4s, v22.4s, v17.4s\n"
+    "neg v3.4s, v3.4s\n"
+    "add v21.4s, v21.4s, v16.4s\n"
+    "mul v28.4s, v28.4s, v3.4s\n"
+    "str q28, [SP, #0x0]\n"
+    "mul v27.4s, v27.4s, v3.4s\n"
+    "mul v26.4s, v26.4s, v3.4s\n"
+    "str q27, [SP, #0x10]\n"
+    "mul v25.4s, v25.4s, v3.4s\n"
+    "mul v24.4s, v24.4s, v3.4s\n"
+    "str q26, [SP, #0x20]\n"
+    "mul v23.4s, v23.4s, v3.4s\n"
+    "str q25, [SP, #0x30]\n"
+    "mul v22.4s, v22.4s, v3.4s\n"
+    "mul v21.4s, v21.4s, v3.4s\n"
+    "str q24, [SP, #0x40]\n"
+    "add v28.4s, v28.4s, v29.4s\n"
+    "str q23, [SP, #0x50]\n"
+    "add v27.4s, v27.4s, v29.4s\n"
+    "str q22, [SP, #0x60]\n"
+    "add v26.4s, v26.4s, v29.4s\n"
+    "add v25.4s, v25.4s, v29.4s\n"
+    "str q21, [SP, #0x70]\n"
+    "add v24.4s, v24.4s, v29.4s\n"
+    "add v23.4s, v23.4s, v29.4s\n"
+    "add v22.4s, v22.4s, v29.4s\n"
+    "add v21.4s, v21.4s, v29.4s\n"
+    "ble 2f\n"
+    "1:"  // Loop
+    ".inst 0x4f8de11c  // sdot v28.4s, v8.16b, v13.4b[0]\n"
+    "ldr q20, [%x[params], #0x60]\n"
+    "add x11, x11, #0x10\n"
+    ".inst 0x4f8de91b  // sdot v27.4s, v8.16b, v13.4b[2]\n"
+    "ldr q19, [%x[params], #0x70]\n"
+    "sub %x[n_channels], %x[n_channels], #0x4\n"
+    ".inst 0x4f8ce11a  // sdot v26.4s, v8.16b, v12.4b[0]\n"
+    "ldr q29, [%x[params], #0x80]\n"
+    "cmp %x[n_channels], #0x4\n"
+    ".inst 0x4f8ce919  // sdot v25.4s, v8.16b, v12.4b[2]\n"
+    ".inst 0x4f87e118  // sdot v24.4s, v8.16b, v7.4b[0]\n"
+    ".inst 0x4f87e917  // sdot v23.4s, v8.16b, v7.4b[2]\n"
+    ".inst 0x4f86e116  // sdot v22.4s, v8.16b, v6.4b[0]\n"
+    ".inst 0x4f86e915  // sdot v21.4s, v8.16b, v6.4b[2]\n"
+    "ldr q8, [%x[params], #0x0]\n"
+    ".inst 0x4fade13c  // sdot v28.4s, v9.16b, v13.4b[1]\n"
+    ".inst 0x4fade93b  // sdot v27.4s, v9.16b, v13.4b[3]\n"
+    ".inst 0x4face13a  // sdot v26.4s, v9.16b, v12.4b[1]\n"
+    ".inst 0x4face939  // sdot v25.4s, v9.16b, v12.4b[3]\n"
+    ".inst 0x4fa7e138  // sdot v24.4s, v9.16b, v7.4b[1]\n"
+    ".inst 0x4fa7e937  // sdot v23.4s, v9.16b, v7.4b[3]\n"
+    ".inst 0x4fa6e136  // sdot v22.4s, v9.16b, v6.4b[1]\n"
+    ".inst 0x4fa6e935  // sdot v21.4s, v9.16b, v6.4b[3]\n"
+    "ldr q9, [%x[params], #0x10]\n"
+    ".inst 0x4f8ce15c  // sdot v28.4s, v10.16b, v12.4b[0]\n"
+    ".inst 0x4f8ce95b  // sdot v27.4s, v10.16b, v12.4b[2]\n"
+    ".inst 0x4f87e15a  // sdot v26.4s, v10.16b, v7.4b[0]\n"
+    ".inst 0x4f87e959  // sdot v25.4s, v10.16b, v7.4b[2]\n"
+    ".inst 0x4f86e158  // sdot v24.4s, v10.16b, v6.4b[0]\n"
+    ".inst 0x4f86e957  // sdot v23.4s, v10.16b, v6.4b[2]\n"
+    ".inst 0x4f85e156  // sdot v22.4s, v10.16b, v5.4b[0]\n"
+    ".inst 0x4f85e955  // sdot v21.4s, v10.16b, v5.4b[2]\n"
+    "ldr q10, [%x[params], #0x20]\n"
+    ".inst 0x4face17c  // sdot v28.4s, v11.16b, v12.4b[1]\n"
+    ".inst 0x4face97b  // sdot v27.4s, v11.16b, v12.4b[3]\n"
+    ".inst 0x4fa7e17a  // sdot v26.4s, v11.16b, v7.4b[1]\n"
+    ".inst 0x4fa7e979  // sdot v25.4s, v11.16b, v7.4b[3]\n"
+    ".inst 0x4fa6e178  // sdot v24.4s, v11.16b, v6.4b[1]\n"
+    ".inst 0x4fa6e977  // sdot v23.4s, v11.16b, v6.4b[3]\n"
+    ".inst 0x4fa5e176  // sdot v22.4s, v11.16b, v5.4b[1]\n"
+    ".inst 0x4fa5e975  // sdot v21.4s, v11.16b, v5.4b[3]\n"
+    "ldr q11, [%x[params], #0x30]\n"
+    ".inst 0x4f87e11c  // sdot v28.4s, v8.16b, v7.4b[0]\n"
+    ".inst 0x4f87e91b  // sdot v27.4s, v8.16b, v7.4b[2]\n"
+    ".inst 0x4f86e11a  // sdot v26.4s, v8.16b, v6.4b[0]\n"
+    ".inst 0x4f86e919  // sdot v25.4s, v8.16b, v6.4b[2]\n"
+    ".inst 0x4f85e118  // sdot v24.4s, v8.16b, v5.4b[0]\n"
+    ".inst 0x4f85e917  // sdot v23.4s, v8.16b, v5.4b[2]\n"
+    ".inst 0x4f84e116  // sdot v22.4s, v8.16b, v4.4b[0]\n"
+    ".inst 0x4f84e915  // sdot v21.4s, v8.16b, v4.4b[2]\n"
+    "ldr q8, [%x[params], #0x40]\n"
+    ".inst 0x4fa7e13c  // sdot v28.4s, v9.16b, v7.4b[1]\n"
+    ".inst 0x4fa7e93b  // sdot v27.4s, v9.16b, v7.4b[3]\n"
+    ".inst 0x4fa6e13a  // sdot v26.4s, v9.16b, v6.4b[1]\n"
+    ".inst 0x4fa6e939  // sdot v25.4s, v9.16b, v6.4b[3]\n"
+    ".inst 0x4fa5e138  // sdot v24.4s, v9.16b, v5.4b[1]\n"
+    ".inst 0x4fa5e937  // sdot v23.4s, v9.16b, v5.4b[3]\n"
+    ".inst 0x4fa4e136  // sdot v22.4s, v9.16b, v4.4b[1]\n"
+    ".inst 0x4fa4e935  // sdot v21.4s, v9.16b, v4.4b[3]\n"
+    "ldr q9, [%x[params], #0x50]\n"
+    ".inst 0x4f86e15c  // sdot v28.4s, v10.16b, v6.4b[0]\n"
+    ".inst 0x4f86e95b  // sdot v27.4s, v10.16b, v6.4b[2]\n"
+    ".inst 0x4f85e15a  // sdot v26.4s, v10.16b, v5.4b[0]\n"
+    ".inst 0x4f85e959  // sdot v25.4s, v10.16b, v5.4b[2]\n"
+    ".inst 0x4f84e158  // sdot v24.4s, v10.16b, v4.4b[0]\n"
+    ".inst 0x4f84e957  // sdot v23.4s, v10.16b, v4.4b[2]\n"
+    ".inst 0x4f82e156  // sdot v22.4s, v10.16b, v2.4b[0]\n"
+    ".inst 0x4f82e955  // sdot v21.4s, v10.16b, v2.4b[2]\n"
+    "ldr q10, [%x[params], #0xb0]\n"
+    ".inst 0x4fa6e17c  // sdot v28.4s, v11.16b, v6.4b[1]\n"
+    ".inst 0x4fa6e97b  // sdot v27.4s, v11.16b, v6.4b[3]\n"
+    ".inst 0x4fa5e17a  // sdot v26.4s, v11.16b, v5.4b[1]\n"
+    ".inst 0x4fa5e979  // sdot v25.4s, v11.16b, v5.4b[3]\n"
+    ".inst 0x4fa4e178  // sdot v24.4s, v11.16b, v4.4b[1]\n"
+    ".inst 0x4fa4e977  // sdot v23.4s, v11.16b, v4.4b[3]\n"
+    ".inst 0x4fa2e176  // sdot v22.4s, v11.16b, v2.4b[1]\n"
+    ".inst 0x4fa2e975  // sdot v21.4s, v11.16b, v2.4b[3]\n"
+    "ldr q11, [%x[params], #0xc0]\n"
+    ".inst 0x4f85e11c  // sdot v28.4s, v8.16b, v5.4b[0]\n"
+    ".inst 0x4f85e91b  // sdot v27.4s, v8.16b, v5.4b[2]\n"
+    ".inst 0x4f84e11a  // sdot v26.4s, v8.16b, v4.4b[0]\n"
+    ".inst 0x4f84e919  // sdot v25.4s, v8.16b, v4.4b[2]\n"
+    ".inst 0x4f82e118  // sdot v24.4s, v8.16b, v2.4b[0]\n"
+    ".inst 0x4f82e917  // sdot v23.4s, v8.16b, v2.4b[2]\n"
+    ".inst 0x4f81e116  // sdot v22.4s, v8.16b, v1.4b[0]\n"
+    ".inst 0x4f81e915  // sdot v21.4s, v8.16b, v1.4b[2]\n"
+    "ldr q8, [%x[params], #0x90]\n"
+    ".inst 0x4fa5e13c  // sdot v28.4s, v9.16b, v5.4b[1]\n"
+    ".inst 0x4fa5e93b  // sdot v27.4s, v9.16b, v5.4b[3]\n"
+    ".inst 0x4fa4e13a  // sdot v26.4s, v9.16b, v4.4b[1]\n"
+    ".inst 0x4fa4e939  // sdot v25.4s, v9.16b, v4.4b[3]\n"
+    ".inst 0x4fa2e138  // sdot v24.4s, v9.16b, v2.4b[1]\n"
+    ".inst 0x4fa2e937  // sdot v23.4s, v9.16b, v2.4b[3]\n"
+    ".inst 0x4fa1e136  // sdot v22.4s, v9.16b, v1.4b[1]\n"
+    ".inst 0x4fa1e935  // sdot v21.4s, v9.16b, v1.4b[3]\n"
+    "ldr q9, [%x[params], #0xa0]\n"
+    "add %x[params], %x[params], #0xd0\n"
+    "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v20.4s\n"
+    "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+    "sqrdmulh v24.4s, v24.4s, v20.4s\n"
+    "and v18.16b, v28.16b, v19.16b\n"
+    "and v17.16b, v27.16b, v19.16b\n"
+    "and v16.16b, v26.16b, v19.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v28.4s, v28.4s, v18.4s\n"
+    "sqadd v27.4s, v27.4s, v17.4s\n"
+    "sqadd v26.4s, v26.4s, v16.4s\n"
+    "and v16.16b, v25.16b, v19.16b\n"
+    "srshl v28.4s, v28.4s, v19.4s\n"
+    "srshl v27.4s, v27.4s, v19.4s\n"
+    "srshl v26.4s, v26.4s, v19.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v28.4s, v28.4s, v0.4s\n"
+    "add v27.4s, v27.4s, v0.4s\n"
+    "add v26.4s, v26.4s, v0.4s\n"
+    "smin v28.4s, v28.4s, v30.4s\n"
+    "smin v27.4s, v27.4s, v30.4s\n"
+    "smin v26.4s, v26.4s, v30.4s\n"
+    "smax v28.4s, v28.4s, v31.4s\n"
+    "smax v27.4s, v27.4s, v31.4s\n"
+    "smax v26.4s, v26.4s, v31.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s28, [x26, x10]\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "ldr q28, [SP, #0x0]\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "str s27, [x25, x10]\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "ldr q27, [SP, #0x10]\n"
+    "and v16.16b, v24.16b, v19.16b\n"
+    "str s26, [x24, x10]\n"
+    "sqrdmulh v23.4s, v23.4s, v20.4s\n"
+    "ldr q26, [SP, #0x20]\n"
+    "srshl v25.4s, v25.4s, v19.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+    "and v17.16b, v23.16b, v19.16b\n"
+    "add v25.4s, v25.4s, v0.4s\n"
+    "sqadd v24.4s, v24.4s, v16.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v25.4s, v25.4s, v30.4s\n"
+    "and v16.16b, v22.16b, v19.16b\n"
+    "srshl v24.4s, v24.4s, v19.4s\n"
+    "smax v25.4s, v25.4s, v31.4s\n"
+    "sqadd v23.4s, v23.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v0.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x23, x10]\n"
+    "smin v24.4s, v24.4s, v30.4s\n"
+    "srshl v23.4s, v23.4s, v19.4s\n"
+    "ldr q25, [SP, #0x30]\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+    "smax v24.4s, v24.4s, v31.4s\n"
+    "add v23.4s, v23.4s, v0.4s\n"
+    "srshl v22.4s, v22.4s, v19.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "smin v23.4s, v23.4s, v30.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x22, x10]\n"
+    "smax v23.4s, v23.4s, v31.4s\n"
+    "add v22.4s, v22.4s, v0.4s\n"
+    "ldr q24, [SP, #0x40]\n"
+    "and v16.16b, v21.16b, v19.16b\n"
+    "add v28.4s, v28.4s, v29.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smin v22.4s, v22.4s, v30.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x21, x10]\n"
+    "smax v22.4s, v22.4s, v31.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr q23, [SP, #0x50]\n"
+    "add v27.4s, v27.4s, v29.4s\n"
+    "add v26.4s, v26.4s, v29.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x20, x10]\n"
+    "add v25.4s, v25.4s, v29.4s\n"
+    "add v24.4s, v24.4s, v29.4s\n"
+    "ldr q22, [SP, #0x60]\n"
+    "srshl v21.4s, v21.4s, v19.4s\n"
+    "add v23.4s, v23.4s, v29.4s\n"
+    "add v21.4s, v21.4s, v0.4s\n"
+    "add v22.4s, v22.4s, v29.4s\n"
+    "smin v21.4s, v21.4s, v30.4s\n"
+    "smax v21.4s, v21.4s, v31.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x19, x10]\n"
+    "add x10, x10, #0x4\n"
+    "ldr q21, [SP, #0x70]\n"
+    "add v21.4s, v21.4s, v29.4s\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    ".inst 0x4f8de11c  // sdot v28.4s, v8.16b, v13.4b[0]\n"
+    "ldr q20, [%x[params], #0x60]\n"
+    "add x26, x26, x10\n"
+    ".inst 0x4f8de91b  // sdot v27.4s, v8.16b, v13.4b[2]\n"
+    "ldr q19, [%x[params], #0x70]\n"
+    "add x25, x25, x10\n"
+    ".inst 0x4f8ce11a  // sdot v26.4s, v8.16b, v12.4b[0]\n"
+    "add x24, x24, x10\n"
+    ".inst 0x4f8ce919  // sdot v25.4s, v8.16b, v12.4b[2]\n"
+    "add x23, x23, x10\n"
+    ".inst 0x4f87e118  // sdot v24.4s, v8.16b, v7.4b[0]\n"
+    "add x22, x22, x10\n"
+    ".inst 0x4f87e917  // sdot v23.4s, v8.16b, v7.4b[2]\n"
+    "add x21, x21, x10\n"
+    ".inst 0x4f86e116  // sdot v22.4s, v8.16b, v6.4b[0]\n"
+    "add x20, x20, x10\n"
+    ".inst 0x4f86e915  // sdot v21.4s, v8.16b, v6.4b[2]\n"
+    "ldr q8, [%x[params], #0x0]\n"
+    "add x19, x19, x10\n"
+    ".inst 0x4fade13c  // sdot v28.4s, v9.16b, v13.4b[1]\n"
+    "cmp %x[n_channels], #0x4\n"
+    ".inst 0x4fade93b  // sdot v27.4s, v9.16b, v13.4b[3]\n"
+    ".inst 0x4face13a  // sdot v26.4s, v9.16b, v12.4b[1]\n"
+    ".inst 0x4face939  // sdot v25.4s, v9.16b, v12.4b[3]\n"
+    ".inst 0x4fa7e138  // sdot v24.4s, v9.16b, v7.4b[1]\n"
+    ".inst 0x4fa7e937  // sdot v23.4s, v9.16b, v7.4b[3]\n"
+    ".inst 0x4fa6e136  // sdot v22.4s, v9.16b, v6.4b[1]\n"
+    ".inst 0x4fa6e935  // sdot v21.4s, v9.16b, v6.4b[3]\n"
+    "ldr q9, [%x[params], #0x10]\n"
+    ".inst 0x4f8ce15c  // sdot v28.4s, v10.16b, v12.4b[0]\n"
+    ".inst 0x4f8ce95b  // sdot v27.4s, v10.16b, v12.4b[2]\n"
+    ".inst 0x4f87e15a  // sdot v26.4s, v10.16b, v7.4b[0]\n"
+    ".inst 0x4f87e959  // sdot v25.4s, v10.16b, v7.4b[2]\n"
+    ".inst 0x4f86e158  // sdot v24.4s, v10.16b, v6.4b[0]\n"
+    ".inst 0x4f86e957  // sdot v23.4s, v10.16b, v6.4b[2]\n"
+    ".inst 0x4f85e156  // sdot v22.4s, v10.16b, v5.4b[0]\n"
+    ".inst 0x4f85e955  // sdot v21.4s, v10.16b, v5.4b[2]\n"
+    "ldr q10, [%x[params], #0x20]\n"
+    ".inst 0x4face17c  // sdot v28.4s, v11.16b, v12.4b[1]\n"
+    ".inst 0x4face97b  // sdot v27.4s, v11.16b, v12.4b[3]\n"
+    ".inst 0x4fa7e17a  // sdot v26.4s, v11.16b, v7.4b[1]\n"
+    ".inst 0x4fa7e979  // sdot v25.4s, v11.16b, v7.4b[3]\n"
+    ".inst 0x4fa6e178  // sdot v24.4s, v11.16b, v6.4b[1]\n"
+    ".inst 0x4fa6e977  // sdot v23.4s, v11.16b, v6.4b[3]\n"
+    ".inst 0x4fa5e176  // sdot v22.4s, v11.16b, v5.4b[1]\n"
+    ".inst 0x4fa5e975  // sdot v21.4s, v11.16b, v5.4b[3]\n"
+    "ldr q11, [%x[params], #0x30]\n"
+    ".inst 0x4f87e11c  // sdot v28.4s, v8.16b, v7.4b[0]\n"
+    ".inst 0x4f87e91b  // sdot v27.4s, v8.16b, v7.4b[2]\n"
+    ".inst 0x4f86e11a  // sdot v26.4s, v8.16b, v6.4b[0]\n"
+    ".inst 0x4f86e919  // sdot v25.4s, v8.16b, v6.4b[2]\n"
+    ".inst 0x4f85e118  // sdot v24.4s, v8.16b, v5.4b[0]\n"
+    ".inst 0x4f85e917  // sdot v23.4s, v8.16b, v5.4b[2]\n"
+    ".inst 0x4f84e116  // sdot v22.4s, v8.16b, v4.4b[0]\n"
+    ".inst 0x4f84e915  // sdot v21.4s, v8.16b, v4.4b[2]\n"
+    "ldr q8, [%x[params], #0x40]\n"
+    ".inst 0x4fa7e13c  // sdot v28.4s, v9.16b, v7.4b[1]\n"
+    ".inst 0x4fa7e93b  // sdot v27.4s, v9.16b, v7.4b[3]\n"
+    ".inst 0x4fa6e13a  // sdot v26.4s, v9.16b, v6.4b[1]\n"
+    ".inst 0x4fa6e939  // sdot v25.4s, v9.16b, v6.4b[3]\n"
+    ".inst 0x4fa5e138  // sdot v24.4s, v9.16b, v5.4b[1]\n"
+    ".inst 0x4fa5e937  // sdot v23.4s, v9.16b, v5.4b[3]\n"
+    ".inst 0x4fa4e136  // sdot v22.4s, v9.16b, v4.4b[1]\n"
+    ".inst 0x4fa4e935  // sdot v21.4s, v9.16b, v4.4b[3]\n"
+    "ldr q9, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x80\n"
+    ".inst 0x4f86e15c  // sdot v28.4s, v10.16b, v6.4b[0]\n"
+    ".inst 0x4f86e95b  // sdot v27.4s, v10.16b, v6.4b[2]\n"
+    ".inst 0x4f85e15a  // sdot v26.4s, v10.16b, v5.4b[0]\n"
+    ".inst 0x4f85e959  // sdot v25.4s, v10.16b, v5.4b[2]\n"
+    ".inst 0x4f84e158  // sdot v24.4s, v10.16b, v4.4b[0]\n"
+    ".inst 0x4f84e957  // sdot v23.4s, v10.16b, v4.4b[2]\n"
+    ".inst 0x4f82e156  // sdot v22.4s, v10.16b, v2.4b[0]\n"
+    ".inst 0x4f82e955  // sdot v21.4s, v10.16b, v2.4b[2]\n"
+    ".inst 0x4fa6e17c  // sdot v28.4s, v11.16b, v6.4b[1]\n"
+    ".inst 0x4fa6e97b  // sdot v27.4s, v11.16b, v6.4b[3]\n"
+    ".inst 0x4fa5e17a  // sdot v26.4s, v11.16b, v5.4b[1]\n"
+    ".inst 0x4fa5e979  // sdot v25.4s, v11.16b, v5.4b[3]\n"
+    ".inst 0x4fa4e178  // sdot v24.4s, v11.16b, v4.4b[1]\n"
+    ".inst 0x4fa4e977  // sdot v23.4s, v11.16b, v4.4b[3]\n"
+    ".inst 0x4fa2e176  // sdot v22.4s, v11.16b, v2.4b[1]\n"
+    ".inst 0x4fa2e975  // sdot v21.4s, v11.16b, v2.4b[3]\n"
+    ".inst 0x4f85e11c  // sdot v28.4s, v8.16b, v5.4b[0]\n"
+    ".inst 0x4f85e91b  // sdot v27.4s, v8.16b, v5.4b[2]\n"
+    ".inst 0x4f84e11a  // sdot v26.4s, v8.16b, v4.4b[0]\n"
+    ".inst 0x4f84e919  // sdot v25.4s, v8.16b, v4.4b[2]\n"
+    ".inst 0x4f82e118  // sdot v24.4s, v8.16b, v2.4b[0]\n"
+    ".inst 0x4f82e917  // sdot v23.4s, v8.16b, v2.4b[2]\n"
+    ".inst 0x4f81e116  // sdot v22.4s, v8.16b, v1.4b[0]\n"
+    ".inst 0x4f81e915  // sdot v21.4s, v8.16b, v1.4b[2]\n"
+    ".inst 0x4fa5e13c  // sdot v28.4s, v9.16b, v5.4b[1]\n"
+    ".inst 0x4fa5e93b  // sdot v27.4s, v9.16b, v5.4b[3]\n"
+    ".inst 0x4fa4e13a  // sdot v26.4s, v9.16b, v4.4b[1]\n"
+    ".inst 0x4fa4e939  // sdot v25.4s, v9.16b, v4.4b[3]\n"
+    ".inst 0x4fa2e138  // sdot v24.4s, v9.16b, v2.4b[1]\n"
+    ".inst 0x4fa2e937  // sdot v23.4s, v9.16b, v2.4b[3]\n"
+    ".inst 0x4fa1e136  // sdot v22.4s, v9.16b, v1.4b[1]\n"
+    ".inst 0x4fa1e935  // sdot v21.4s, v9.16b, v1.4b[3]\n"
+    "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v20.4s\n"
+    "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+    "and v18.16b, v28.16b, v19.16b\n"
+    "and v17.16b, v27.16b, v19.16b\n"
+    "and v16.16b, v26.16b, v19.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v28.4s, v28.4s, v18.4s\n"
+    "sqadd v27.4s, v27.4s, v17.4s\n"
+    "sqadd v26.4s, v26.4s, v16.4s\n"
+    "and v16.16b, v25.16b, v19.16b\n"
+    "srshl v28.4s, v28.4s, v19.4s\n"
+    "srshl v27.4s, v27.4s, v19.4s\n"
+    "srshl v26.4s, v26.4s, v19.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v28.4s, v28.4s, v0.4s\n"
+    "add v27.4s, v27.4s, v0.4s\n"
+    "add v26.4s, v26.4s, v0.4s\n"
+    "smin v28.4s, v28.4s, v30.4s\n"
+    "smin v27.4s, v27.4s, v30.4s\n"
+    "smin v26.4s, v26.4s, v30.4s\n"
+    "smax v28.4s, v28.4s, v31.4s\n"
+    "smax v27.4s, v27.4s, v31.4s\n"
+    "smax v26.4s, v26.4s, v31.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "sqrdmulh v24.4s, v24.4s, v20.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v20.4s\n"
+    "srshl v25.4s, v25.4s, v19.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+    "and v16.16b, v24.16b, v19.16b\n"
+    "and v17.16b, v23.16b, v19.16b\n"
+    "add v25.4s, v25.4s, v0.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v25.4s, v25.4s, v30.4s\n"
+    "sqadd v24.4s, v24.4s, v16.4s\n"
+    "sqadd v23.4s, v23.4s, v17.4s\n"
+    "smax v25.4s, v25.4s, v31.4s\n"
+    "and v16.16b, v22.16b, v19.16b\n"
+    "srshl v24.4s, v24.4s, v19.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "srshl v23.4s, v23.4s, v19.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v0.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v23.4s, v23.4s, v0.4s\n"
+    "smin v24.4s, v24.4s, v30.4s\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "smin v23.4s, v23.4s, v30.4s\n"
+    "smax v24.4s, v24.4s, v31.4s\n"
+    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+    "smax v23.4s, v23.4s, v31.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "srshl v22.4s, v22.4s, v19.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "and v16.16b, v21.16b, v19.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "add v22.4s, v22.4s, v0.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v22.4s, v22.4s, v30.4s\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "smax v22.4s, v22.4s, v31.4s\n"
+    "srshl v21.4s, v21.4s, v19.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "add v21.4s, v21.4s, v0.4s\n"
+    "smin v21.4s, v21.4s, v30.4s\n"
+    "smax v21.4s, v21.4s, v31.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "blt 3f\n"
+    "str s28, [x26, #0x0]\n"
+    "str s27, [x25, #0x0]\n"
+    "str s26, [x24, #0x0]\n"
+    "str s25, [x23, #0x0]\n"
+    "str s24, [x22, #0x0]\n"
+    "str s23, [x21, #0x0]\n"
+    "str s22, [x20, #0x0]\n"
+    "str s21, [x19, #0x0]\n"
+    "b 4f\n"
+    "3:"  // Tail: Oddments
+    "st1 { v28.b }[0], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v27.b }[0], [x25], #0x1\n"
+    "st1 { v26.b }[0], [x24], #0x1\n"
+    "st1 { v25.b }[0], [x23], #0x1\n"
+    "st1 { v24.b }[0], [x22], #0x1\n"
+    "st1 { v23.b }[0], [x21], #0x1\n"
+    "st1 { v22.b }[0], [x20], #0x1\n"
+    "st1 { v21.b }[0], [x19], #0x1\n"
+    "beq 4f\n"
+    "st1 { v28.b }[1], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v27.b }[1], [x25], #0x1\n"
+    "st1 { v26.b }[1], [x24], #0x1\n"
+    "st1 { v25.b }[1], [x23], #0x1\n"
+    "st1 { v24.b }[1], [x22], #0x1\n"
+    "st1 { v23.b }[1], [x21], #0x1\n"
+    "st1 { v22.b }[1], [x20], #0x1\n"
+    "st1 { v21.b }[1], [x19], #0x1\n"
+    "beq 4f\n"
+    "st1 { v28.b }[2], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v27.b }[2], [x25], #0x1\n"
+    "st1 { v26.b }[2], [x24], #0x1\n"
+    "st1 { v25.b }[2], [x23], #0x1\n"
+    "st1 { v24.b }[2], [x22], #0x1\n"
+    "st1 { v23.b }[2], [x21], #0x1\n"
+    "st1 { v22.b }[2], [x20], #0x1\n"
+    "st1 { v21.b }[2], [x19], #0x1\n"
+    "beq 4f\n"
+    "st1 { v28.b }[3], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v27.b }[3], [x25], #0x1\n"
+    "st1 { v26.b }[3], [x24], #0x1\n"
+    "st1 { v25.b }[3], [x23], #0x1\n"
+    "st1 { v24.b }[3], [x22], #0x1\n"
+    "st1 { v23.b }[3], [x21], #0x1\n"
+    "st1 { v22.b }[3], [x20], #0x1\n"
+    "st1 { v21.b }[3], [x19], #0x1\n"
+    "4:"  // Tail: End
+    "add SP, SP, #0x80\n"
+    : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
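Every quantized kernel in this patch finishes with the same requantization tail: each int32 accumulator is scaled by a fixed-point multiplier (sqrdmulh), corrected for negative values (the and/sshr/sqadd triple), rounding-shifted (srshl), offset by the output zero point, clamped to [minval, maxval], and narrowed to bytes (uzp1). Below is a minimal scalar sketch of that sequence, assuming gemmlowp-style Q0.31 multipliers and a non-negative right shift; the helper names are illustrative, not library API.

#include <cstdint>
#include <algorithm>

// Models SQRDMULH.4S per lane: saturating rounding doubling high multiply.
static int32_t sqrdmulh(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX; // the only saturating case
    const int64_t p = 2 * (int64_t)a * (int64_t)b;
    return (int32_t)((p + (1LL << 30)) >> 31);
}

// Rounding divide by 2^shift. The AND/SSHR/SQADD fix-up in the assembly
// biases negative accumulators so the subsequent SRSHL rounds to nearest.
static int32_t rounding_divide_by_pot(int32_t x, int shift)
{
    const int32_t mask      = (1 << shift) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> shift) + (remainder > threshold ? 1 : 0);
}

// Requantize one accumulator to int8; c_offset/minval/maxval are the
// Requantize32 fields the kernels splat into vectors at entry.
static int8_t requantize(int32_t acc, int32_t mul, int shift,
                         int32_t c_offset, int32_t minval, int32_t maxval)
{
    const int32_t v = rounding_divide_by_pot(sqrdmulh(acc, mul), shift) + c_offset;
    return (int8_t)std::min(maxval, std::max(minval, v));
}
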
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000..d0ae00d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+struct a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int output_rows(void) { return 2; };
+  constexpr static unsigned int output_cols(void) { return 8; };
+
+  constexpr static unsigned int output_col_regs(void) { return 2; };
+
+  kern_type kernel = a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+  a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
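The strategy struct above carries only metadata (element types, the 2x8 output tile, the vector-length category) plus a pointer to the hand-written kernel; the depthfirst driver reads these statics to plan its loop nest. As a hypothetical scalar reference for the accumulate loop the implementation below performs per output channel — inputs and weights are widened after subtracting their zero points (the ssubl instructions) and multiply-accumulated into int32 lanes (smlal) — assuming the a_offset/b_offset zero points from Requantize32; all names here are illustrative only.

#include <cstdint>

// Reference for the generic packed kernel's accumulate loop: each kernel
// point contributes one weight per channel and two rows of eight pixels
// (loaded via paired pointers, as the LDP/LDR D pairs in the assembly).
void generic_mla_reference(const int8_t *const *inptrs, // 2 pointers per kernel point, 8 pixels each
                           const int8_t *weights,       // 1 weight per kernel point (this channel)
                           int32_t bias, unsigned kernel_points,
                           int32_t a_offset, int32_t b_offset,
                           int32_t acc[2][8])           // the 2x8 output tile
{
    for (unsigned r = 0; r < 2; r++)
        for (unsigned c = 0; c < 8; c++)
            acc[r][c] = bias;

    for (unsigned k = 0; k < kernel_points; k++)
    {
        const int32_t w = (int32_t)weights[k] - b_offset; // SSUBL on the weight
        for (unsigned r = 0; r < 2; r++)
        {
            const int8_t *in = inptrs[2 * k + r];
            for (unsigned c = 0; c < 8; c++)
                acc[r][c] += w * ((int32_t)in[c] - a_offset); // SSUBL + SMLAL
        }
    }
    // The accumulators then pass through the requantization sketched earlier,
    // optionally with per-channel left shift / multiplier / right shift.
}
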
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..c0acd88
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1484 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+  const int8_t *const *const inptrs,
+  int8_t *const *const outptrs,
+  const int8_t *weights,
+  const int32_t *bias,
+  const unsigned int kernel_points,
+  const unsigned int n_output_channels,
+  const int32_t *per_channel_left_shifts,
+  const int32_t *per_channel_muls,
+  const int32_t *per_channel_right_shifts,
+  const arm_gemm::Requantize32& qp
+)
+{
+  __asm__ __volatile__(
+    "mov x9, #0x0\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ld1r { v14.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v13.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+    "ld1r { v12.16b }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v11.16b }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v10.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+    "ld1r { v9.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+    "ld1r { v8.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+    "ld1r { v7.4s }, [x19]\n"
+    "lsr x28, %x[n_output_channels], #0x2\n"
+    "cbz x28, 9f\n"
+    "1:"  // Output channel loop
+    "movi v16.4s, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "lsl x19, x9, #0x2\n"
+    "ldr q16, [%x[bias], x19]\n"
+    "2:"  // Output channel loop: Load bias: Done
+    "mov v6.16b, v16.16b\n"
+    "mov v5.16b, v16.16b\n"
+    "mov v4.16b, v16.16b\n"
+    "mov v31.16b, v16.16b\n"
+    "mov v30.16b, v16.16b\n"
+    "mov v29.16b, v16.16b\n"
+    "mov v28.16b, v16.16b\n"
+    "mov v27.16b, v16.16b\n"
+    "mov v26.16b, v16.16b\n"
+    "mov v25.16b, v16.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "mov v23.16b, v16.16b\n"
+    "mov v22.16b, v16.16b\n"
+    "mov v21.16b, v16.16b\n"
+    "mov v20.16b, v16.16b\n"
+    "mov v19.16b, v16.16b\n"
+    "cbz %x[rq_mul_ptr], 3f\n"
+    "lsl x19, x9, #0x2\n"
+    "ldr q8, [%x[rq_mul_ptr], x19]\n"
+    "ldr q7, [%x[rq_right_shift_ptr], x19]\n"
+    "cbz %x[rq_left_shift_ptr], 3f\n"
+    "ldr q9, [%x[rq_left_shift_ptr], x19]\n"
+    "3:"  // Output channel loop: Load quantization parameters: Done
+    "ldr s17, [%x[weights]], #0x4\n"
+    "ssubl v17.8h, v17.8b, v11.8b\n"
+    "mov x19, %x[inptrs]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "lsr x20, %x[kernel_points], #0x1\n"
+    "ldr d3, [x25, #0x0]\n"
+    "ssubl v3.8h, v3.8b, v12.8b\n"
+    "ldr d2, [x27, #0x0]\n"
+    "ssubl v2.8h, v2.8b, v12.8b\n"
+    "cbz x20, 7f\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "ldr s16, [%x[weights]], #0x4\n"
+    "ssubl v16.8h, v16.8b, v11.8b\n"
+    "ldr d1, [x25, #0x0]\n"
+    "subs x20, x20, #0x1\n"
+    "ssubl v1.8h, v1.8b, v12.8b\n"
+    "ldr d0, [x27, #0x0]\n"
+    "ssubl v0.8h, v0.8b, v12.8b\n"
+    "beq 5f\n"
+    "4:"  // Output channel loop: Kernel loop
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "subs x20, x20, #0x1\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr d3, [x25, #0x0]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "ldr d2, [x27, #0x0]\n"
+    "ssubl v3.8h, v3.8b, v12.8b\n"
+    "ldr s17, [%x[weights]], #0x4\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "ssubl v2.8h, v2.8b, v12.8b\n"
+    "ssubl v17.8h, v17.8b, v11.8b\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "ldr d1, [x25, #0x0]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "ldr d0, [x27, #0x0]\n"
+    "ssubl v1.8h, v1.8b, v12.8b\n"
+    "ldr s16, [%x[weights]], #0x4\n"
+    "ssubl v0.8h, v0.8b, v12.8b\n"
+    "ssubl v16.8h, v16.8b, v11.8b\n"
+    "bgt 4b\n"
+    "5:"  // Output channel loop: Kernel loop tail
+    "tbnz %x[kernel_points], #0, 6f\n"
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "sshl v6.4s, v6.4s, v9.4s\n"
+    "sshl v5.4s, v5.4s, v9.4s\n"
+    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+    "sshl v4.4s, v4.4s, v9.4s\n"
+    "sshl v31.4s, v31.4s, v9.4s\n"
+    "and v18.16b, v6.16b, v7.16b\n"
+    "and v16.16b, v5.16b, v7.16b\n"
+    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+    "sqadd v6.4s, v6.4s, v18.4s\n"
+    "sqadd v5.4s, v5.4s, v16.4s\n"
+    "and v17.16b, v4.16b, v7.16b\n"
+    "and v16.16b, v31.16b, v7.16b\n"
+    "srshl v6.4s, v6.4s, v7.4s\n"
+    "srshl v5.4s, v5.4s, v7.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v6.4s, v6.4s, v10.4s\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "sqadd v4.4s, v4.4s, v17.4s\n"
+    "smin v6.4s, v6.4s, v13.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "smax v6.4s, v6.4s, v14.4s\n"
+    "smax v5.4s, v5.4s, v14.4s\n"
+    "srshl v4.4s, v4.4s, v7.4s\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "str s6, [x19, x9]\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "add v4.4s, v4.4s, v10.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "srshl v31.4s, v31.4s, v7.4s\n"
+    "str s5, [x20, x9]\n"
+    "sshl v30.4s, v30.4s, v9.4s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "smin v4.4s, v4.4s, v13.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+    "add v31.4s, v31.4s, v10.4s\n"
+    "smax v4.4s, v4.4s, v14.4s\n"
+    "sshl v29.4s, v29.4s, v9.4s\n"
+    "smin v31.4s, v31.4s, v13.4s\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "and v16.16b, v30.16b, v7.16b\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "str s4, [x21, x9]\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+    "sshl v28.4s, v28.4s, v9.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s31, [x22, x9]\n"
+    "and v17.16b, v29.16b, v7.16b\n"
+    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "srshl v30.4s, v30.4s, v7.4s\n"
+    "sshl v27.4s, v27.4s, v9.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v28.16b, v7.16b\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sqadd v29.4s, v29.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v13.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+    "srshl v29.4s, v29.4s, v7.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "sqadd v28.4s, v28.4s, v16.4s\n"
+    "and v16.16b, v27.16b, v7.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v29.4s, v29.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x23, x9]\n"
+    "smin v29.4s, v29.4s, v13.4s\n"
+    "srshl v28.4s, v28.4s, v7.4s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v26.4s, v26.4s, v9.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "add v28.4s, v28.4s, v10.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "smin v28.4s, v28.4s, v13.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "str s29, [x24, x9]\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "srshl v27.4s, v27.4s, v7.4s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+    "sshl v25.4s, v25.4s, v9.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "add v27.4s, v27.4s, v10.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s28, [x25, x9]\n"
+    "smin v27.4s, v27.4s, v13.4s\n"
+    "and v17.16b, v26.16b, v7.16b\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+    "sshl v24.4s, v24.4s, v9.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v25.16b, v7.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s27, [x26, x9]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "srshl v26.4s, v26.4s, v7.4s\n"
+    "sshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v17.16b, v24.16b, v7.16b\n"
+    "add v26.4s, v26.4s, v10.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+    "srshl v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v13.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v23.16b, v7.16b\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "add v25.4s, v25.4s, v10.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "smin v25.4s, v25.4s, v13.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x19, x9]\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "srshl v24.4s, v24.4s, v7.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v22.4s, v22.4s, v9.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v10.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x20, x9]\n"
+    "smin v24.4s, v24.4s, v13.4s\n"
+    "sqadd v23.4s, v23.4s, v16.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+    "sshl v21.4s, v21.4s, v9.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "srshl v23.4s, v23.4s, v7.4s\n"
+    "and v17.16b, v22.16b, v7.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x21, x9]\n"
+    "add v23.4s, v23.4s, v10.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v21.16b, v7.16b\n"
+    "sshl v20.4s, v20.4s, v9.4s\n"
+    "smin v23.4s, v23.4s, v13.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v23.4s, v23.4s, v14.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+    "srshl v22.4s, v22.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x22, x9]\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "and v16.16b, v20.16b, v7.16b\n"
+    "srshl v21.4s, v21.4s, v7.4s\n"
+    "sshl v19.4s, v19.4s, v9.4s\n"
+    "smin v22.4s, v22.4s, v13.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "smax v22.4s, v22.4s, v14.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x9]\n"
+    "smax v21.4s, v21.4s, v14.4s\n"
+    "srshl v20.4s, v20.4s, v7.4s\n"
+    "and v16.16b, v19.16b, v7.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x24, x9]\n"
+    "smin v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v14.4s\n"
+    "srshl v19.4s, v19.4s, v7.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x25, x9]\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "smin v19.4s, v19.4s, v13.4s\n"
+    "smax v19.4s, v19.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x26, x9]\n"
+    "b 8f\n"
+    "6:"  // Output channel loop: Odd tail
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr d3, [x25, #0x0]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "ldr d2, [x27, #0x0]\n"
+    "ssubl v3.8h, v3.8b, v12.8b\n"
+    "ldr s17, [%x[weights]], #0x4\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "ssubl v2.8h, v2.8b, v12.8b\n"
+    "ssubl v17.8h, v17.8b, v11.8b\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "sshl v6.4s, v6.4s, v9.4s\n"
+    "sshl v5.4s, v5.4s, v9.4s\n"
+    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+    "sshl v4.4s, v4.4s, v9.4s\n"
+    "sshl v31.4s, v31.4s, v9.4s\n"
+    "and v18.16b, v6.16b, v7.16b\n"
+    "and v16.16b, v5.16b, v7.16b\n"
+    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+    "sqadd v6.4s, v6.4s, v18.4s\n"
+    "sqadd v5.4s, v5.4s, v16.4s\n"
+    "and v17.16b, v4.16b, v7.16b\n"
+    "and v16.16b, v31.16b, v7.16b\n"
+    "srshl v6.4s, v6.4s, v7.4s\n"
+    "srshl v5.4s, v5.4s, v7.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v6.4s, v6.4s, v10.4s\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "sqadd v4.4s, v4.4s, v17.4s\n"
+    "smin v6.4s, v6.4s, v13.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "smax v6.4s, v6.4s, v14.4s\n"
+    "smax v5.4s, v5.4s, v14.4s\n"
+    "srshl v4.4s, v4.4s, v7.4s\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "str s6, [x19, x9]\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "add v4.4s, v4.4s, v10.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "srshl v31.4s, v31.4s, v7.4s\n"
+    "str s5, [x20, x9]\n"
+    "sshl v30.4s, v30.4s, v9.4s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "smin v4.4s, v4.4s, v13.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+    "add v31.4s, v31.4s, v10.4s\n"
+    "smax v4.4s, v4.4s, v14.4s\n"
+    "sshl v29.4s, v29.4s, v9.4s\n"
+    "smin v31.4s, v31.4s, v13.4s\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "and v16.16b, v30.16b, v7.16b\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "str s4, [x21, x9]\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+    "sshl v28.4s, v28.4s, v9.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s31, [x22, x9]\n"
+    "and v17.16b, v29.16b, v7.16b\n"
+    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "srshl v30.4s, v30.4s, v7.4s\n"
+    "sshl v27.4s, v27.4s, v9.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v28.16b, v7.16b\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sqadd v29.4s, v29.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v13.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+    "srshl v29.4s, v29.4s, v7.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "sqadd v28.4s, v28.4s, v16.4s\n"
+    "and v16.16b, v27.16b, v7.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v29.4s, v29.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x23, x9]\n"
+    "smin v29.4s, v29.4s, v13.4s\n"
+    "srshl v28.4s, v28.4s, v7.4s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v26.4s, v26.4s, v9.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "add v28.4s, v28.4s, v10.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "smin v28.4s, v28.4s, v13.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "str s29, [x24, x9]\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "srshl v27.4s, v27.4s, v7.4s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+    "sshl v25.4s, v25.4s, v9.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "add v27.4s, v27.4s, v10.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s28, [x25, x9]\n"
+    "smin v27.4s, v27.4s, v13.4s\n"
+    "and v17.16b, v26.16b, v7.16b\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+    "sshl v24.4s, v24.4s, v9.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v25.16b, v7.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s27, [x26, x9]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "srshl v26.4s, v26.4s, v7.4s\n"
+    "sshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v17.16b, v24.16b, v7.16b\n"
+    "add v26.4s, v26.4s, v10.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+    "srshl v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v13.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v23.16b, v7.16b\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "add v25.4s, v25.4s, v10.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "smin v25.4s, v25.4s, v13.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x19, x9]\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "srshl v24.4s, v24.4s, v7.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v22.4s, v22.4s, v9.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v10.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x20, x9]\n"
+    "smin v24.4s, v24.4s, v13.4s\n"
+    "sqadd v23.4s, v23.4s, v16.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+    "sshl v21.4s, v21.4s, v9.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "srshl v23.4s, v23.4s, v7.4s\n"
+    "and v17.16b, v22.16b, v7.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x21, x9]\n"
+    "add v23.4s, v23.4s, v10.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v21.16b, v7.16b\n"
+    "sshl v20.4s, v20.4s, v9.4s\n"
+    "smin v23.4s, v23.4s, v13.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v23.4s, v23.4s, v14.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+    "srshl v22.4s, v22.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x22, x9]\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "and v16.16b, v20.16b, v7.16b\n"
+    "srshl v21.4s, v21.4s, v7.4s\n"
+    "sshl v19.4s, v19.4s, v9.4s\n"
+    "smin v22.4s, v22.4s, v13.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "smax v22.4s, v22.4s, v14.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x9]\n"
+    "smax v21.4s, v21.4s, v14.4s\n"
+    "srshl v20.4s, v20.4s, v7.4s\n"
+    "and v16.16b, v19.16b, v7.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x24, x9]\n"
+    "smin v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v14.4s\n"
+    "srshl v19.4s, v19.4s, v7.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x25, x9]\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "smin v19.4s, v19.4s, v13.4s\n"
+    "smax v19.4s, v19.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x26, x9]\n"
+    "b 8f\n"
+    "7:"  // Output channel loop: Single kernel point
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "sshl v6.4s, v6.4s, v9.4s\n"
+    "sshl v5.4s, v5.4s, v9.4s\n"
+    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+    "sshl v4.4s, v4.4s, v9.4s\n"
+    "sshl v31.4s, v31.4s, v9.4s\n"
+    "and v18.16b, v6.16b, v7.16b\n"
+    "and v16.16b, v5.16b, v7.16b\n"
+    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+    "sqadd v6.4s, v6.4s, v18.4s\n"
+    "sqadd v5.4s, v5.4s, v16.4s\n"
+    "and v17.16b, v4.16b, v7.16b\n"
+    "and v16.16b, v31.16b, v7.16b\n"
+    "srshl v6.4s, v6.4s, v7.4s\n"
+    "srshl v5.4s, v5.4s, v7.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v6.4s, v6.4s, v10.4s\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "sqadd v4.4s, v4.4s, v17.4s\n"
+    "smin v6.4s, v6.4s, v13.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "smax v6.4s, v6.4s, v14.4s\n"
+    "smax v5.4s, v5.4s, v14.4s\n"
+    "srshl v4.4s, v4.4s, v7.4s\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "str s6, [x19, x9]\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "add v4.4s, v4.4s, v10.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "srshl v31.4s, v31.4s, v7.4s\n"
+    "str s5, [x20, x9]\n"
+    "sshl v30.4s, v30.4s, v9.4s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "smin v4.4s, v4.4s, v13.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+    "add v31.4s, v31.4s, v10.4s\n"
+    "smax v4.4s, v4.4s, v14.4s\n"
+    "sshl v29.4s, v29.4s, v9.4s\n"
+    "smin v31.4s, v31.4s, v13.4s\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "and v16.16b, v30.16b, v7.16b\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "str s4, [x21, x9]\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+    "sshl v28.4s, v28.4s, v9.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s31, [x22, x9]\n"
+    "and v17.16b, v29.16b, v7.16b\n"
+    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "srshl v30.4s, v30.4s, v7.4s\n"
+    "sshl v27.4s, v27.4s, v9.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v28.16b, v7.16b\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sqadd v29.4s, v29.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v13.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+    "srshl v29.4s, v29.4s, v7.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "sqadd v28.4s, v28.4s, v16.4s\n"
+    "and v16.16b, v27.16b, v7.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v29.4s, v29.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x23, x9]\n"
+    "smin v29.4s, v29.4s, v13.4s\n"
+    "srshl v28.4s, v28.4s, v7.4s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v26.4s, v26.4s, v9.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "add v28.4s, v28.4s, v10.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "smin v28.4s, v28.4s, v13.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "str s29, [x24, x9]\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "srshl v27.4s, v27.4s, v7.4s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+    "sshl v25.4s, v25.4s, v9.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "add v27.4s, v27.4s, v10.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s28, [x25, x9]\n"
+    "smin v27.4s, v27.4s, v13.4s\n"
+    "and v17.16b, v26.16b, v7.16b\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+    "sshl v24.4s, v24.4s, v9.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v25.16b, v7.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s27, [x26, x9]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "srshl v26.4s, v26.4s, v7.4s\n"
+    "sshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v17.16b, v24.16b, v7.16b\n"
+    "add v26.4s, v26.4s, v10.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+    "srshl v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v13.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v23.16b, v7.16b\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "add v25.4s, v25.4s, v10.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "smin v25.4s, v25.4s, v13.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x19, x9]\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "srshl v24.4s, v24.4s, v7.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v22.4s, v22.4s, v9.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v10.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x20, x9]\n"
+    "smin v24.4s, v24.4s, v13.4s\n"
+    "sqadd v23.4s, v23.4s, v16.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+    "sshl v21.4s, v21.4s, v9.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "srshl v23.4s, v23.4s, v7.4s\n"
+    "and v17.16b, v22.16b, v7.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x21, x9]\n"
+    "add v23.4s, v23.4s, v10.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v21.16b, v7.16b\n"
+    "sshl v20.4s, v20.4s, v9.4s\n"
+    "smin v23.4s, v23.4s, v13.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v23.4s, v23.4s, v14.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+    "srshl v22.4s, v22.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x22, x9]\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "and v16.16b, v20.16b, v7.16b\n"
+    "srshl v21.4s, v21.4s, v7.4s\n"
+    "sshl v19.4s, v19.4s, v9.4s\n"
+    "smin v22.4s, v22.4s, v13.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "smax v22.4s, v22.4s, v14.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x9]\n"
+    "smax v21.4s, v21.4s, v14.4s\n"
+    "srshl v20.4s, v20.4s, v7.4s\n"
+    "and v16.16b, v19.16b, v7.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x24, x9]\n"
+    "smin v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v14.4s\n"
+    "srshl v19.4s, v19.4s, v7.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x25, x9]\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "smin v19.4s, v19.4s, v13.4s\n"
+    "smax v19.4s, v19.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x26, x9]\n"
+    "8:"  // Output channel loop: Done
+    "add x9, x9, #0x4\n"
+    "cmp x9, x28, LSL #2\n"
+    "blt 1b\n"
+    "tst %x[n_output_channels], #0x3\n"
+    "beq 26f\n"
+    "9:"  // Output channel oddments
+    "movi v16.4s, #0x0\n"
+    "cbz %x[bias], 12f\n"
+    "add x19, %x[bias], x9, LSL #2\n"
+    "tbz %x[n_output_channels], #1, 10f\n"
+    "ld1 { v16.d }[0], [x19], #0x8\n"
+    "tbz %x[n_output_channels], #0, 11f\n"
+    "ld1 { v16.s }[2], [x19]\n"
+    "b 11f\n"
+    "10:"  // Output channel oddments: Load bias: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 11f\n"
+    "ld1 { v16.s }[0], [x19]\n"
+    "11:"  // Output channel oddments: Load bias: Bit 1: End
+
+    "12:"  // Output channel oddments: Load bias: Done
+    "mov v6.16b, v16.16b\n"
+    "mov v5.16b, v16.16b\n"
+    "mov v4.16b, v16.16b\n"
+    "mov v31.16b, v16.16b\n"
+    "mov v30.16b, v16.16b\n"
+    "mov v29.16b, v16.16b\n"
+    "mov v28.16b, v16.16b\n"
+    "mov v27.16b, v16.16b\n"
+    "mov v26.16b, v16.16b\n"
+    "mov v25.16b, v16.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "mov v23.16b, v16.16b\n"
+    "mov v22.16b, v16.16b\n"
+    "mov v21.16b, v16.16b\n"
+    "mov v20.16b, v16.16b\n"
+    "mov v19.16b, v16.16b\n"
+    "cbz %x[rq_mul_ptr], 18f\n"
+    "add x21, %x[rq_mul_ptr], x9, LSL #2\n"
+    "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n"
+    "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n"
+    "cbz %x[rq_left_shift_ptr], 15f\n"
+    "tbz %x[n_output_channels], #1, 13f\n"
+    "ld1 { v8.d }[0], [x21], #0x8\n"
+    "ld1 { v7.d }[0], [x20], #0x8\n"
+    "ld1 { v9.d }[0], [x19], #0x8\n"
+    "tbz %x[n_output_channels], #0, 14f\n"
+    "ld1 { v8.s }[2], [x21], #0x4\n"
+    "ld1 { v7.s }[2], [x20], #0x4\n"
+    "ld1 { v9.s }[2], [x19], #0x4\n"
+    "b 14f\n"
+    "13:"  // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 14f\n"
+    "ld1 { v8.s }[0], [x21], #0x4\n"
+    "ld1 { v7.s }[0], [x20], #0x4\n"
+    "ld1 { v9.s }[0], [x19], #0x4\n"
+    "14:"  // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+    "b 18f\n"
+    "15:"  // Output channel oddments: Load quantization parameters: No left shift
+    "tbz %x[n_output_channels], #1, 16f\n"
+    "ld1 { v8.d }[0], [x21], #0x8\n"
+    "ld1 { v7.d }[0], [x20], #0x8\n"
+    "tbz %x[n_output_channels], #0, 17f\n"
+    "ld1 { v8.s }[2], [x21], #0x4\n"
+    "ld1 { v7.s }[2], [x20], #0x4\n"
+    "b 17f\n"
+    "16:"  // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 17f\n"
+    "ld1 { v8.s }[0], [x21], #0x4\n"
+    "ld1 { v7.s }[0], [x20], #0x4\n"
+    "17:"  // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+
+    "18:"  // Output channel oddments: Load quantization parameters: Done
+    "ldr s17, [%x[weights]], #0x4\n"
+    "ssubl v17.8h, v17.8b, v11.8b\n"
+    "mov x19, %x[inptrs]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "lsr x20, %x[kernel_points], #0x1\n"
+    "ldr d3, [x25, #0x0]\n"
+    "ssubl v3.8h, v3.8b, v12.8b\n"
+    "ldr d2, [x27, #0x0]\n"
+    "ssubl v2.8h, v2.8b, v12.8b\n"
+    "cbz x20, 22f\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "ldr s16, [%x[weights]], #0x4\n"
+    "ssubl v16.8h, v16.8b, v11.8b\n"
+    "ldr d1, [x25, #0x0]\n"
+    "subs x20, x20, #0x1\n"
+    "ssubl v1.8h, v1.8b, v12.8b\n"
+    "ldr d0, [x27, #0x0]\n"
+    "ssubl v0.8h, v0.8b, v12.8b\n"
+    "beq 20f\n"
+    "19:"  // Output channel oddments: Kernel loop
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "subs x20, x20, #0x1\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr d3, [x25, #0x0]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "ldr d2, [x27, #0x0]\n"
+    "ssubl v3.8h, v3.8b, v12.8b\n"
+    "ldr s17, [%x[weights]], #0x4\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "ssubl v2.8h, v2.8b, v12.8b\n"
+    "ssubl v17.8h, v17.8b, v11.8b\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "ldr d1, [x25, #0x0]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "ldr d0, [x27, #0x0]\n"
+    "ssubl v1.8h, v1.8b, v12.8b\n"
+    "ldr s16, [%x[weights]], #0x4\n"
+    "ssubl v0.8h, v0.8b, v12.8b\n"
+    "ssubl v16.8h, v16.8b, v11.8b\n"
+    "bgt 19b\n"
+    "20:"  // Output channel oddments: Kernel loop tail
+    "tbnz %x[kernel_points], #0, 21f\n"
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "b 23f\n"
+    "21:"  // Output channel oddments: Odd tail
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr d3, [x25, #0x0]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "ldr d2, [x27, #0x0]\n"
+    "ssubl v3.8h, v3.8b, v12.8b\n"
+    "ldr s17, [%x[weights]], #0x4\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "ssubl v2.8h, v2.8b, v12.8b\n"
+    "ssubl v17.8h, v17.8b, v11.8b\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "b 23f\n"
+    "22:"  // Output channel oddments: Single kernel point
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "23:"  // Output channel oddments: Done
+    "sshl v6.4s, v6.4s, v9.4s\n"
+    "sshl v5.4s, v5.4s, v9.4s\n"
+    "sshl v4.4s, v4.4s, v9.4s\n"
+    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+    "sshl v31.4s, v31.4s, v9.4s\n"
+    "and v18.16b, v6.16b, v7.16b\n"
+    "and v16.16b, v5.16b, v7.16b\n"
+    "and v17.16b, v4.16b, v7.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v6.4s, v6.4s, v18.4s\n"
+    "sqadd v5.4s, v5.4s, v16.4s\n"
+    "sqadd v4.4s, v4.4s, v17.4s\n"
+    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+    "srshl v6.4s, v6.4s, v7.4s\n"
+    "srshl v5.4s, v5.4s, v7.4s\n"
+    "srshl v4.4s, v4.4s, v7.4s\n"
+    "and v16.16b, v31.16b, v7.16b\n"
+    "add v6.4s, v6.4s, v10.4s\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "add v4.4s, v4.4s, v10.4s\n"
+    "smin v6.4s, v6.4s, v13.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "smin v4.4s, v4.4s, v13.4s\n"
+    "smax v6.4s, v6.4s, v14.4s\n"
+    "smax v5.4s, v5.4s, v14.4s\n"
+    "smax v4.4s, v4.4s, v14.4s\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "sshl v30.4s, v30.4s, v9.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+    "sshl v29.4s, v29.4s, v9.4s\n"
+    "sshl v28.4s, v28.4s, v9.4s\n"
+    "srshl v31.4s, v31.4s, v7.4s\n"
+    "and v16.16b, v30.16b, v7.16b\n"
+    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+    "add v31.4s, v31.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "and v17.16b, v29.16b, v7.16b\n"
+    "smin v31.4s, v31.4s, v13.4s\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "and v16.16b, v28.16b, v7.16b\n"
+    "srshl v30.4s, v30.4s, v7.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "sqadd v29.4s, v29.4s, v17.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "srshl v29.4s, v29.4s, v7.4s\n"
+    "smin v30.4s, v30.4s, v13.4s\n"
+    "sqadd v28.4s, v28.4s, v16.4s\n"
+    "sshl v27.4s, v27.4s, v9.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "add v29.4s, v29.4s, v10.4s\n"
+    "srshl v28.4s, v28.4s, v7.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "smin v29.4s, v29.4s, v13.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v28.4s, v28.4s, v10.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+    "smin v28.4s, v28.4s, v13.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "sshl v26.4s, v26.4s, v9.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "and v16.16b, v27.16b, v7.16b\n"
+    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "and v17.16b, v26.16b, v7.16b\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "sshl v25.4s, v25.4s, v9.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+    "srshl v27.4s, v27.4s, v7.4s\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "sshl v24.4s, v24.4s, v9.4s\n"
+    "and v16.16b, v25.16b, v7.16b\n"
+    "add v27.4s, v27.4s, v10.4s\n"
+    "srshl v26.4s, v26.4s, v7.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v27.4s, v27.4s, v13.4s\n"
+    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+    "add v26.4s, v26.4s, v10.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "smin v26.4s, v26.4s, v13.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "and v17.16b, v24.16b, v7.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "srshl v25.4s, v25.4s, v7.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "sshl v23.4s, v23.4s, v9.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "add v25.4s, v25.4s, v10.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+    "smin v25.4s, v25.4s, v13.4s\n"
+    "sshl v22.4s, v22.4s, v9.4s\n"
+    "srshl v24.4s, v24.4s, v7.4s\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "and v16.16b, v23.16b, v7.16b\n"
+    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v10.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v24.4s, v24.4s, v13.4s\n"
+    "and v17.16b, v22.16b, v7.16b\n"
+    "sqadd v23.4s, v23.4s, v16.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshl v21.4s, v21.4s, v9.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "srshl v23.4s, v23.4s, v7.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+    "add v23.4s, v23.4s, v10.4s\n"
+    "sshl v20.4s, v20.4s, v9.4s\n"
+    "srshl v22.4s, v22.4s, v7.4s\n"
+    "smin v23.4s, v23.4s, v13.4s\n"
+    "and v16.16b, v21.16b, v7.16b\n"
+    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+    "smax v23.4s, v23.4s, v14.4s\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smin v22.4s, v22.4s, v13.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "smax v22.4s, v22.4s, v14.4s\n"
+    "and v16.16b, v20.16b, v7.16b\n"
+    "sshl v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "srshl v21.4s, v21.4s, v7.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "and v16.16b, v19.16b, v7.16b\n"
+    "srshl v20.4s, v20.4s, v7.4s\n"
+    "smax v21.4s, v21.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "smin v20.4s, v20.4s, v13.4s\n"
+    "srshl v19.4s, v19.4s, v7.4s\n"
+    "smax v20.4s, v20.4s, v14.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v19.4s, v19.4s, v13.4s\n"
+    "smax v19.4s, v19.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "tbz %x[n_output_channels], #1, 24f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x9\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x9\n"
+    "st1 { v6.h }[0], [x19]\n"
+    "add x21, x21, x9\n"
+    "st1 { v5.h }[0], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x9\n"
+    "st1 { v4.h }[0], [x21]\n"
+    "add x23, x23, x9\n"
+    "st1 { v31.h }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x9\n"
+    "st1 { v30.h }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x9\n"
+    "st1 { v29.h }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x9\n"
+    "st1 { v28.h }[0], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x9\n"
+    "st1 { v27.h }[0], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x9\n"
+    "st1 { v26.h }[0], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x9\n"
+    "st1 { v25.h }[0], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x9\n"
+    "st1 { v24.h }[0], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x9\n"
+    "st1 { v23.h }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x9\n"
+    "st1 { v22.h }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x9\n"
+    "st1 { v21.h }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x9\n"
+    "st1 { v20.h }[0], [x25]\n"
+    "add x9, x9, #0x2\n"
+    "st1 { v19.h }[0], [x26]\n"
+    "tbz %x[n_output_channels], #0, 25f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x9\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x9\n"
+    "st1 { v6.b }[2], [x19]\n"
+    "add x21, x21, x9\n"
+    "st1 { v5.b }[2], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x9\n"
+    "st1 { v4.b }[2], [x21]\n"
+    "add x23, x23, x9\n"
+    "st1 { v31.b }[2], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x9\n"
+    "st1 { v30.b }[2], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x9\n"
+    "st1 { v29.b }[2], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x9\n"
+    "st1 { v28.b }[2], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x9\n"
+    "st1 { v27.b }[2], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x9\n"
+    "st1 { v26.b }[2], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x9\n"
+    "st1 { v25.b }[2], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x9\n"
+    "st1 { v24.b }[2], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x9\n"
+    "st1 { v23.b }[2], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x9\n"
+    "st1 { v22.b }[2], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x9\n"
+    "st1 { v21.b }[2], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x9\n"
+    "st1 { v20.b }[2], [x25]\n"
+    "st1 { v19.b }[2], [x26]\n"
+    "b 25f\n"
+    "24:"  // Output channel oddments: Done: Store: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 25f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x9\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x9\n"
+    "st1 { v6.b }[0], [x19]\n"
+    "add x21, x21, x9\n"
+    "st1 { v5.b }[0], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x9\n"
+    "st1 { v4.b }[0], [x21]\n"
+    "add x23, x23, x9\n"
+    "st1 { v31.b }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x9\n"
+    "st1 { v30.b }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x9\n"
+    "st1 { v29.b }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x9\n"
+    "st1 { v28.b }[0], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x9\n"
+    "st1 { v27.b }[0], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x9\n"
+    "st1 { v26.b }[0], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x9\n"
+    "st1 { v25.b }[0], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x9\n"
+    "st1 { v24.b }[0], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x9\n"
+    "st1 { v23.b }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x9\n"
+    "st1 { v22.b }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x9\n"
+    "st1 { v21.b }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x9\n"
+    "st1 { v20.b }[0], [x25]\n"
+    "st1 { v19.b }[0], [x26]\n"
+    "25:"  // Output channel oddments: Done: Store: Bit 1: End
+
+    "26:"  // Done
+
+    : [weights] "+&r" (weights)
+    : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000..0fde00b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_dot::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_dot::get_packed_size;
+
+  kern_type kernel = a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+  a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..bdbda17
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1184 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
+  __asm__ __volatile__(
+    "ldp x15, x14, [%x[inptrs], #0x0]\n"
+    "add SP, SP, #-0x80\n"
+    "ldp x13, x12, [%x[inptrs], #0x10]\n"
+    "mov x11, #0x0\n"
+    "ldp x10, x9, [%x[inptrs], #0x20]\n"
+    "lsr x28, %x[n_channels], #0x4\n"
+    "ldp x27, x26, [%x[inptrs], #0x30]\n"
+    "add x25, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ldp x24, x23, [%x[outptrs], #0x0]\n"
+    "add x22, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "ldp x21, x20, [%x[outptrs], #0x10]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v12.4s }, [x25]\n"
+    "ld1r { v11.4s }, [x22]\n"
+    "ld1r { v10.4s }, [x19]\n"
+    "cbz x28, 2f\n"
+    "1:"  // Loop
+    "ldr q27, [x15, x11]\n"
+    "subs x28, x28, #0x1\n"
+    "ldr q1, [x14, x11]\n"
+    "ldp x15, x14, [%x[inptrs], #0x40]\n"
+    "ldr q25, [x13, x11]\n"
+    "zip1 v6.16b, v27.16b, v25.16b\n"
+    "ldr q23, [x12, x11]\n"
+    "zip2 v9.16b, v27.16b, v25.16b\n"
+    "ldp x13, x12, [%x[inptrs], #0x50]\n"
+    "ldr q31, [x10, x11]\n"
+    "zip1 v5.16b, v1.16b, v23.16b\n"
+    "ldr q28, [x9, x11]\n"
+    "zip2 v3.16b, v1.16b, v23.16b\n"
+    "ldp x10, x9, [%x[inptrs], #0x60]\n"
+    "zip1 v8.16b, v6.16b, v5.16b\n"
+    "ldr q21, [x27, x11]\n"
+    "zip2 v7.16b, v6.16b, v5.16b\n"
+    "ldr q26, [x26, x11]\n"
+    "zip1 v6.16b, v9.16b, v3.16b\n"
+    "ldp x27, x26, [%x[inptrs], #0x70]\n"
+    "zip2 v5.16b, v9.16b, v3.16b\n"
+    "ldr q24, [x15, x11]\n"
+    "ldr q22, [x14, x11]\n"
+    "zip1 v2.16b, v31.16b, v21.16b\n"
+    "zip2 v4.16b, v31.16b, v21.16b\n"
+    "ldp x15, x14, [%x[inptrs], #0x0]\n"
+    "zip1 v1.16b, v28.16b, v26.16b\n"
+    "ldr q20, [x13, x11]\n"
+    "zip2 v31.16b, v28.16b, v26.16b\n"
+    "ldr q16, [x12, x11]\n"
+    "zip1 v3.16b, v2.16b, v1.16b\n"
+    "ldp x13, x12, [%x[inptrs], #0x10]\n"
+    "zip2 v2.16b, v2.16b, v1.16b\n"
+    "ldr q19, [x10, x11]\n"
+    "zip1 v1.16b, v4.16b, v31.16b\n"
+    "ldr q0, [x9, x11]\n"
+    "zip1 v28.16b, v24.16b, v20.16b\n"
+    "ldp x10, x9, [%x[inptrs], #0x20]\n"
+    "zip2 v26.16b, v24.16b, v20.16b\n"
+    "ldr q18, [x27, x11]\n"
+    "zip1 v24.16b, v22.16b, v16.16b\n"
+    "ldr q17, [x26, x11]\n"
+    "zip2 v22.16b, v22.16b, v16.16b\n"
+    "ldp x27, x26, [%x[inptrs], #0x30]\n"
+    "zip2 v16.16b, v4.16b, v31.16b\n"
+    "str q6, [SP, #0x0]\n"
+    "zip1 v31.16b, v28.16b, v24.16b\n"
+    "str q5, [SP, #0x10]\n"
+    "zip1 v20.16b, v19.16b, v18.16b\n"
+    "str q1, [SP, #0x20]\n"
+    "zip2 v19.16b, v19.16b, v18.16b\n"
+    "str q16, [SP, #0x30]\n"
+    "zip1 v18.16b, v0.16b, v17.16b\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "zip2 v17.16b, v0.16b, v17.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "zip2 v28.16b, v28.16b, v24.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "zip1 v16.16b, v26.16b, v22.16b\n"
+    "str q16, [SP, #0x40]\n"
+    "zip2 v16.16b, v26.16b, v22.16b\n"
+    "str q16, [SP, #0x50]\n"
+    "zip1 v26.16b, v20.16b, v18.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "zip2 v24.16b, v20.16b, v18.16b\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    "zip1 v16.16b, v19.16b, v17.16b\n"
+    "str q16, [SP, #0x60]\n"
+    "zip2 v16.16b, v19.16b, v17.16b\n"
+    "str q16, [SP, #0x70]\n"
+    "mov v22.16b, v30.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "mov v20.16b, v30.16b\n"
+    "mov v19.16b, v30.16b\n"
+    ".inst 0x4e8897be  // sdot v30.4s, v29.16b, v8.16b\n"
+    ".inst 0x4e8397b4  // sdot v20.4s, v29.16b, v3.16b\n"
+    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+    ".inst 0x4e83977e  // sdot v30.4s, v27.16b, v3.16b\n"
+    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+    ".inst 0x4e9f9774  // sdot v20.4s, v27.16b, v31.16b\n"
+    ".inst 0x4e8897b6  // sdot v22.4s, v29.16b, v8.16b\n"
+    "ldr q8, [SP, #0x0]\n"
+    ".inst 0x4e9f973e  // sdot v30.4s, v25.16b, v31.16b\n"
+    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+    ".inst 0x4e9a9734  // sdot v20.4s, v25.16b, v26.16b\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    ".inst 0x4e8397b3  // sdot v19.4s, v29.16b, v3.16b\n"
+    "ldr q29, [%x[params], #0x70]\n"
+    ".inst 0x4e839776  // sdot v22.4s, v27.16b, v3.16b\n"
+    "ldr q3, [SP, #0x20]\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x4e9f9773  // sdot v19.4s, v27.16b, v31.16b\n"
+    "ldr q27, [%x[params], #0x80]\n"
+    ".inst 0x4e9f9736  // sdot v22.4s, v25.16b, v31.16b\n"
+    "ldr q31, [SP, #0x40]\n"
+    "and v16.16b, v30.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    ".inst 0x4e9a9733  // sdot v19.4s, v25.16b, v26.16b\n"
+    "ldr q25, [%x[params], #0x90]\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "ldr q26, [SP, #0x60]\n"
+    "and v18.16b, v20.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "ldr q23, [%x[params], #0xa0]\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v20.4s, v20.4s, v18.4s\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "smax v30.4s, v30.4s, v12.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "smin v30.4s, v30.4s, v11.4s\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "ldr q21, [%x[params], #0xb0]\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x24, x11]\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "ldr q30, [%x[params], #0x60]\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x11]\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x21, x11]\n"
+    "mov v22.16b, v30.16b\n"
+    "mov v20.16b, v30.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    ".inst 0x4e8297b4  // sdot v20.4s, v29.16b, v2.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x20, x11]\n"
+    "mov v19.16b, v30.16b\n"
+    "add x11, x11, #0x4\n"
+    ".inst 0x4e8797be  // sdot v30.4s, v29.16b, v7.16b\n"
+    ".inst 0x4e9c9774  // sdot v20.4s, v27.16b, v28.16b\n"
+    "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+    ".inst 0x4e82977e  // sdot v30.4s, v27.16b, v2.16b\n"
+    ".inst 0x4e989734  // sdot v20.4s, v25.16b, v24.16b\n"
+    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+    ".inst 0x4e8797b6  // sdot v22.4s, v29.16b, v7.16b\n"
+    "ldr q7, [SP, #0x10]\n"
+    ".inst 0x4e9c973e  // sdot v30.4s, v25.16b, v28.16b\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+    ".inst 0x4e8297b3  // sdot v19.4s, v29.16b, v2.16b\n"
+    "ldr q29, [%x[params], #0xd0]\n"
+    ".inst 0x4e829776  // sdot v22.4s, v27.16b, v2.16b\n"
+    "ldr q2, [SP, #0x30]\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x4e9c9773  // sdot v19.4s, v27.16b, v28.16b\n"
+    "ldr q27, [%x[params], #0xe0]\n"
+    ".inst 0x4e9c9736  // sdot v22.4s, v25.16b, v28.16b\n"
+    "ldr q28, [SP, #0x50]\n"
+    "and v16.16b, v30.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    ".inst 0x4e989733  // sdot v19.4s, v25.16b, v24.16b\n"
+    "ldr q25, [%x[params], #0xf0]\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "ldr q24, [SP, #0x70]\n"
+    "and v18.16b, v20.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "ldr q23, [%x[params], #0x100]\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v20.4s, v20.4s, v18.4s\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "smax v30.4s, v30.4s, v12.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "smin v30.4s, v30.4s, v11.4s\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "ldr q21, [%x[params], #0x110]\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x24, x11]\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "ldr q30, [%x[params], #0xc0]\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x11]\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x21, x11]\n"
+    "mov v22.16b, v30.16b\n"
+    "mov v20.16b, v30.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    ".inst 0x4e8397b4  // sdot v20.4s, v29.16b, v3.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x20, x11]\n"
+    "mov v19.16b, v30.16b\n"
+    "add x11, x11, #0x4\n"
+    ".inst 0x4e8897be  // sdot v30.4s, v29.16b, v8.16b\n"
+    ".inst 0x4e9f9774  // sdot v20.4s, v27.16b, v31.16b\n"
+    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+    ".inst 0x4e83977e  // sdot v30.4s, v27.16b, v3.16b\n"
+    ".inst 0x4e9a9734  // sdot v20.4s, v25.16b, v26.16b\n"
+    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    ".inst 0x4e8897b6  // sdot v22.4s, v29.16b, v8.16b\n"
+    ".inst 0x4e9f973e  // sdot v30.4s, v25.16b, v31.16b\n"
+    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+    ".inst 0x4e8397b3  // sdot v19.4s, v29.16b, v3.16b\n"
+    "ldr q29, [%x[params], #0x130]\n"
+    ".inst 0x4e839776  // sdot v22.4s, v27.16b, v3.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x4e9f9773  // sdot v19.4s, v27.16b, v31.16b\n"
+    "ldr q27, [%x[params], #0x140]\n"
+    ".inst 0x4e9f9736  // sdot v22.4s, v25.16b, v31.16b\n"
+    "and v16.16b, v30.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    ".inst 0x4e9a9733  // sdot v19.4s, v25.16b, v26.16b\n"
+    "ldr q25, [%x[params], #0x150]\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "and v18.16b, v20.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "ldr q23, [%x[params], #0x160]\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v20.4s, v20.4s, v18.4s\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "smax v30.4s, v30.4s, v12.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "smin v30.4s, v30.4s, v11.4s\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "ldr q21, [%x[params], #0x170]\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x24, x11]\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "ldr q30, [%x[params], #0x120]\n"
+    "add %x[params], %x[params], #0x180\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x11]\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "mov v22.16b, v30.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x21, x11]\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "mov v20.16b, v30.16b\n"
+    ".inst 0x4e8297b4  // sdot v20.4s, v29.16b, v2.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x20, x11]\n"
+    "mov v19.16b, v30.16b\n"
+    "add x11, x11, #0x4\n"
+    ".inst 0x4e8797be  // sdot v30.4s, v29.16b, v7.16b\n"
+    ".inst 0x4e9c9774  // sdot v20.4s, v27.16b, v28.16b\n"
+    "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+    ".inst 0x4e82977e  // sdot v30.4s, v27.16b, v2.16b\n"
+    ".inst 0x4e989734  // sdot v20.4s, v25.16b, v24.16b\n"
+    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+    ".inst 0x4e8797b6  // sdot v22.4s, v29.16b, v7.16b\n"
+    ".inst 0x4e9c973e  // sdot v30.4s, v25.16b, v28.16b\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+    ".inst 0x4e8297b3  // sdot v19.4s, v29.16b, v2.16b\n"
+    ".inst 0x4e829776  // sdot v22.4s, v27.16b, v2.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x4e9c9773  // sdot v19.4s, v27.16b, v28.16b\n"
+    ".inst 0x4e9c9736  // sdot v22.4s, v25.16b, v28.16b\n"
+    "and v16.16b, v30.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    ".inst 0x4e989733  // sdot v19.4s, v25.16b, v24.16b\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "and v18.16b, v20.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v18.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v30.4s, v30.4s, v12.4s\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "smin v30.4s, v30.4s, v11.4s\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x24, x11]\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x11]\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x21, x11]\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x20, x11]\n"
+    "add x11, x11, #0x4\n"
+    "bgt 1b\n"
+    "tst %x[n_channels], #0xf\n"
+    "beq 34f\n"
+    "2:"  // Oddments
+    "and x19, %x[n_channels], #0xf\n"
+    "add x15, x15, x11\n"
+    "add x14, x14, x11\n"
+    "add x13, x13, x11\n"
+    "add x12, x12, x11\n"
+    "add x10, x10, x11\n"
+    "add x9, x9, x11\n"
+    "add x27, x27, x11\n"
+    "add x26, x26, x11\n"
+    "tbz %x[n_channels], #3, 6f\n"
+    "ld1 { v27.d }[0], [x15], #0x8\n"
+    "ld1 { v1.d }[0], [x14], #0x8\n"
+    "ld1 { v25.d }[0], [x13], #0x8\n"
+    "ld1 { v23.d }[0], [x12], #0x8\n"
+    "ld1 { v31.d }[0], [x10], #0x8\n"
+    "ld1 { v28.d }[0], [x9], #0x8\n"
+    "ld1 { v21.d }[0], [x27], #0x8\n"
+    "ld1 { v26.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #2, 4f\n"
+    "ld1 { v27.s }[2], [x15], #0x4\n"
+    "ld1 { v1.s }[2], [x14], #0x4\n"
+    "ld1 { v25.s }[2], [x13], #0x4\n"
+    "ld1 { v23.s }[2], [x12], #0x4\n"
+    "ld1 { v31.s }[2], [x10], #0x4\n"
+    "ld1 { v28.s }[2], [x9], #0x4\n"
+    "ld1 { v21.s }[2], [x27], #0x4\n"
+    "ld1 { v26.s }[2], [x26], #0x4\n"
+    "tbz %x[n_channels], #1, 3f\n"
+    "ld1 { v27.h }[6], [x15], #0x2\n"
+    "ld1 { v1.h }[6], [x14], #0x2\n"
+    "ld1 { v25.h }[6], [x13], #0x2\n"
+    "ld1 { v23.h }[6], [x12], #0x2\n"
+    "ld1 { v31.h }[6], [x10], #0x2\n"
+    "ld1 { v28.h }[6], [x9], #0x2\n"
+    "ld1 { v21.h }[6], [x27], #0x2\n"
+    "ld1 { v26.h }[6], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[14], [x15], #0x1\n"
+    "ld1 { v1.b }[14], [x14], #0x1\n"
+    "ld1 { v25.b }[14], [x13], #0x1\n"
+    "ld1 { v23.b }[14], [x12], #0x1\n"
+    "ld1 { v31.b }[14], [x10], #0x1\n"
+    "ld1 { v28.b }[14], [x9], #0x1\n"
+    "ld1 { v21.b }[14], [x27], #0x1\n"
+    "ld1 { v26.b }[14], [x26], #0x1\n"
+    "b 10f\n"
+    "3:"  // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[12], [x15], #0x1\n"
+    "ld1 { v1.b }[12], [x14], #0x1\n"
+    "ld1 { v25.b }[12], [x13], #0x1\n"
+    "ld1 { v23.b }[12], [x12], #0x1\n"
+    "ld1 { v31.b }[12], [x10], #0x1\n"
+    "ld1 { v28.b }[12], [x9], #0x1\n"
+    "ld1 { v21.b }[12], [x27], #0x1\n"
+    "ld1 { v26.b }[12], [x26], #0x1\n"
+    "b 10f\n"
+    "4:"  // Oddments: Load (A): Bit 3: Bit 2: Unset
+    "tbz %x[n_channels], #1, 5f\n"
+    "ld1 { v27.h }[4], [x15], #0x2\n"
+    "ld1 { v1.h }[4], [x14], #0x2\n"
+    "ld1 { v25.h }[4], [x13], #0x2\n"
+    "ld1 { v23.h }[4], [x12], #0x2\n"
+    "ld1 { v31.h }[4], [x10], #0x2\n"
+    "ld1 { v28.h }[4], [x9], #0x2\n"
+    "ld1 { v21.h }[4], [x27], #0x2\n"
+    "ld1 { v26.h }[4], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[10], [x15], #0x1\n"
+    "ld1 { v1.b }[10], [x14], #0x1\n"
+    "ld1 { v25.b }[10], [x13], #0x1\n"
+    "ld1 { v23.b }[10], [x12], #0x1\n"
+    "ld1 { v31.b }[10], [x10], #0x1\n"
+    "ld1 { v28.b }[10], [x9], #0x1\n"
+    "ld1 { v21.b }[10], [x27], #0x1\n"
+    "ld1 { v26.b }[10], [x26], #0x1\n"
+    "b 10f\n"
+    "5:"  // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[8], [x15], #0x1\n"
+    "ld1 { v1.b }[8], [x14], #0x1\n"
+    "ld1 { v25.b }[8], [x13], #0x1\n"
+    "ld1 { v23.b }[8], [x12], #0x1\n"
+    "ld1 { v31.b }[8], [x10], #0x1\n"
+    "ld1 { v28.b }[8], [x9], #0x1\n"
+    "ld1 { v21.b }[8], [x27], #0x1\n"
+    "ld1 { v26.b }[8], [x26], #0x1\n"
+    "b 10f\n"
+    "6:"  // Oddments: Load (A): Bit 3: Unset
+    "tbz %x[n_channels], #2, 8f\n"
+    "ld1 { v27.s }[0], [x15], #0x4\n"
+    "ld1 { v1.s }[0], [x14], #0x4\n"
+    "ld1 { v25.s }[0], [x13], #0x4\n"
+    "ld1 { v23.s }[0], [x12], #0x4\n"
+    "ld1 { v31.s }[0], [x10], #0x4\n"
+    "ld1 { v28.s }[0], [x9], #0x4\n"
+    "ld1 { v21.s }[0], [x27], #0x4\n"
+    "ld1 { v26.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ld1 { v27.h }[2], [x15], #0x2\n"
+    "ld1 { v1.h }[2], [x14], #0x2\n"
+    "ld1 { v25.h }[2], [x13], #0x2\n"
+    "ld1 { v23.h }[2], [x12], #0x2\n"
+    "ld1 { v31.h }[2], [x10], #0x2\n"
+    "ld1 { v28.h }[2], [x9], #0x2\n"
+    "ld1 { v21.h }[2], [x27], #0x2\n"
+    "ld1 { v26.h }[2], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[6], [x15], #0x1\n"
+    "ld1 { v1.b }[6], [x14], #0x1\n"
+    "ld1 { v25.b }[6], [x13], #0x1\n"
+    "ld1 { v23.b }[6], [x12], #0x1\n"
+    "ld1 { v31.b }[6], [x10], #0x1\n"
+    "ld1 { v28.b }[6], [x9], #0x1\n"
+    "ld1 { v21.b }[6], [x27], #0x1\n"
+    "ld1 { v26.b }[6], [x26], #0x1\n"
+    "b 10f\n"
+    "7:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[4], [x15], #0x1\n"
+    "ld1 { v1.b }[4], [x14], #0x1\n"
+    "ld1 { v25.b }[4], [x13], #0x1\n"
+    "ld1 { v23.b }[4], [x12], #0x1\n"
+    "ld1 { v31.b }[4], [x10], #0x1\n"
+    "ld1 { v28.b }[4], [x9], #0x1\n"
+    "ld1 { v21.b }[4], [x27], #0x1\n"
+    "ld1 { v26.b }[4], [x26], #0x1\n"
+    "b 10f\n"
+    "8:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+    "tbz %x[n_channels], #1, 9f\n"
+    "ld1 { v27.h }[0], [x15], #0x2\n"
+    "ld1 { v1.h }[0], [x14], #0x2\n"
+    "ld1 { v25.h }[0], [x13], #0x2\n"
+    "ld1 { v23.h }[0], [x12], #0x2\n"
+    "ld1 { v31.h }[0], [x10], #0x2\n"
+    "ld1 { v28.h }[0], [x9], #0x2\n"
+    "ld1 { v21.h }[0], [x27], #0x2\n"
+    "ld1 { v26.h }[0], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[2], [x15], #0x1\n"
+    "ld1 { v1.b }[2], [x14], #0x1\n"
+    "ld1 { v25.b }[2], [x13], #0x1\n"
+    "ld1 { v23.b }[2], [x12], #0x1\n"
+    "ld1 { v31.b }[2], [x10], #0x1\n"
+    "ld1 { v28.b }[2], [x9], #0x1\n"
+    "ld1 { v21.b }[2], [x27], #0x1\n"
+    "ld1 { v26.b }[2], [x26], #0x1\n"
+    "b 10f\n"
+    "9:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[0], [x15], #0x1\n"
+    "ld1 { v1.b }[0], [x14], #0x1\n"
+    "ld1 { v25.b }[0], [x13], #0x1\n"
+    "ld1 { v23.b }[0], [x12], #0x1\n"
+    "ld1 { v31.b }[0], [x10], #0x1\n"
+    "ld1 { v28.b }[0], [x9], #0x1\n"
+    "ld1 { v21.b }[0], [x27], #0x1\n"
+    "ld1 { v26.b }[0], [x26], #0x1\n"
+    "10:"  // Oddments: Load (A): Bit 3: End
+    "ldp x15, x14, [%x[inptrs], #0x40]\n"
+    "add x15, x15, x11\n"
+    "ldp x13, x12, [%x[inptrs], #0x50]\n"
+    "ldp x10, x9, [%x[inptrs], #0x60]\n"
+    "add x14, x14, x11\n"
+    "ldp x27, x26, [%x[inptrs], #0x70]\n"
+    "add x13, x13, x11\n"
+    "add x12, x12, x11\n"
+    "add x10, x10, x11\n"
+    "add x9, x9, x11\n"
+    "add x27, x27, x11\n"
+    "add x26, x26, x11\n"
+    "tbz %x[n_channels], #3, 14f\n"
+    "ld1 { v24.d }[0], [x15], #0x8\n"
+    "ld1 { v22.d }[0], [x14], #0x8\n"
+    "ld1 { v20.d }[0], [x13], #0x8\n"
+    "ld1 { v16.d }[0], [x12], #0x8\n"
+    "ld1 { v19.d }[0], [x10], #0x8\n"
+    "ld1 { v0.d }[0], [x9], #0x8\n"
+    "ld1 { v18.d }[0], [x27], #0x8\n"
+    "ld1 { v17.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #2, 12f\n"
+    "ld1 { v24.s }[2], [x15], #0x4\n"
+    "ld1 { v22.s }[2], [x14], #0x4\n"
+    "ld1 { v20.s }[2], [x13], #0x4\n"
+    "ld1 { v16.s }[2], [x12], #0x4\n"
+    "ld1 { v19.s }[2], [x10], #0x4\n"
+    "ld1 { v0.s }[2], [x9], #0x4\n"
+    "ld1 { v18.s }[2], [x27], #0x4\n"
+    "ld1 { v17.s }[2], [x26], #0x4\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ld1 { v24.h }[6], [x15], #0x2\n"
+    "ld1 { v22.h }[6], [x14], #0x2\n"
+    "ld1 { v20.h }[6], [x13], #0x2\n"
+    "ld1 { v16.h }[6], [x12], #0x2\n"
+    "ld1 { v19.h }[6], [x10], #0x2\n"
+    "ld1 { v0.h }[6], [x9], #0x2\n"
+    "ld1 { v18.h }[6], [x27], #0x2\n"
+    "ld1 { v17.h }[6], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[14], [x15], #0x1\n"
+    "ld1 { v22.b }[14], [x14], #0x1\n"
+    "ld1 { v20.b }[14], [x13], #0x1\n"
+    "ld1 { v16.b }[14], [x12], #0x1\n"
+    "ld1 { v19.b }[14], [x10], #0x1\n"
+    "ld1 { v0.b }[14], [x9], #0x1\n"
+    "ld1 { v18.b }[14], [x27], #0x1\n"
+    "ld1 { v17.b }[14], [x26], #0x1\n"
+    "b 18f\n"
+    "11:"  // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[12], [x15], #0x1\n"
+    "ld1 { v22.b }[12], [x14], #0x1\n"
+    "ld1 { v20.b }[12], [x13], #0x1\n"
+    "ld1 { v16.b }[12], [x12], #0x1\n"
+    "ld1 { v19.b }[12], [x10], #0x1\n"
+    "ld1 { v0.b }[12], [x9], #0x1\n"
+    "ld1 { v18.b }[12], [x27], #0x1\n"
+    "ld1 { v17.b }[12], [x26], #0x1\n"
+    "b 18f\n"
+    "12:"  // Oddments: Load (B): Bit 3: Bit 2: Unset
+    "tbz %x[n_channels], #1, 13f\n"
+    "ld1 { v24.h }[4], [x15], #0x2\n"
+    "ld1 { v22.h }[4], [x14], #0x2\n"
+    "ld1 { v20.h }[4], [x13], #0x2\n"
+    "ld1 { v16.h }[4], [x12], #0x2\n"
+    "ld1 { v19.h }[4], [x10], #0x2\n"
+    "ld1 { v0.h }[4], [x9], #0x2\n"
+    "ld1 { v18.h }[4], [x27], #0x2\n"
+    "ld1 { v17.h }[4], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[10], [x15], #0x1\n"
+    "ld1 { v22.b }[10], [x14], #0x1\n"
+    "ld1 { v20.b }[10], [x13], #0x1\n"
+    "ld1 { v16.b }[10], [x12], #0x1\n"
+    "ld1 { v19.b }[10], [x10], #0x1\n"
+    "ld1 { v0.b }[10], [x9], #0x1\n"
+    "ld1 { v18.b }[10], [x27], #0x1\n"
+    "ld1 { v17.b }[10], [x26], #0x1\n"
+    "b 18f\n"
+    "13:"  // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[8], [x15], #0x1\n"
+    "ld1 { v22.b }[8], [x14], #0x1\n"
+    "ld1 { v20.b }[8], [x13], #0x1\n"
+    "ld1 { v16.b }[8], [x12], #0x1\n"
+    "ld1 { v19.b }[8], [x10], #0x1\n"
+    "ld1 { v0.b }[8], [x9], #0x1\n"
+    "ld1 { v18.b }[8], [x27], #0x1\n"
+    "ld1 { v17.b }[8], [x26], #0x1\n"
+    "b 18f\n"
+    "14:"  // Oddments: Load (B): Bit 3: Unset
+    "tbz %x[n_channels], #2, 16f\n"
+    "ld1 { v24.s }[0], [x15], #0x4\n"
+    "ld1 { v22.s }[0], [x14], #0x4\n"
+    "ld1 { v20.s }[0], [x13], #0x4\n"
+    "ld1 { v16.s }[0], [x12], #0x4\n"
+    "ld1 { v19.s }[0], [x10], #0x4\n"
+    "ld1 { v0.s }[0], [x9], #0x4\n"
+    "ld1 { v18.s }[0], [x27], #0x4\n"
+    "ld1 { v17.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ld1 { v24.h }[2], [x15], #0x2\n"
+    "ld1 { v22.h }[2], [x14], #0x2\n"
+    "ld1 { v20.h }[2], [x13], #0x2\n"
+    "ld1 { v16.h }[2], [x12], #0x2\n"
+    "ld1 { v19.h }[2], [x10], #0x2\n"
+    "ld1 { v0.h }[2], [x9], #0x2\n"
+    "ld1 { v18.h }[2], [x27], #0x2\n"
+    "ld1 { v17.h }[2], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[6], [x15], #0x1\n"
+    "ld1 { v22.b }[6], [x14], #0x1\n"
+    "ld1 { v20.b }[6], [x13], #0x1\n"
+    "ld1 { v16.b }[6], [x12], #0x1\n"
+    "ld1 { v19.b }[6], [x10], #0x1\n"
+    "ld1 { v0.b }[6], [x9], #0x1\n"
+    "ld1 { v18.b }[6], [x27], #0x1\n"
+    "ld1 { v17.b }[6], [x26], #0x1\n"
+    "b 18f\n"
+    "15:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[4], [x15], #0x1\n"
+    "ld1 { v22.b }[4], [x14], #0x1\n"
+    "ld1 { v20.b }[4], [x13], #0x1\n"
+    "ld1 { v16.b }[4], [x12], #0x1\n"
+    "ld1 { v19.b }[4], [x10], #0x1\n"
+    "ld1 { v0.b }[4], [x9], #0x1\n"
+    "ld1 { v18.b }[4], [x27], #0x1\n"
+    "ld1 { v17.b }[4], [x26], #0x1\n"
+    "b 18f\n"
+    "16:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+    "tbz %x[n_channels], #1, 17f\n"
+    "ld1 { v24.h }[0], [x15], #0x2\n"
+    "ld1 { v22.h }[0], [x14], #0x2\n"
+    "ld1 { v20.h }[0], [x13], #0x2\n"
+    "ld1 { v16.h }[0], [x12], #0x2\n"
+    "ld1 { v19.h }[0], [x10], #0x2\n"
+    "ld1 { v0.h }[0], [x9], #0x2\n"
+    "ld1 { v18.h }[0], [x27], #0x2\n"
+    "ld1 { v17.h }[0], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[2], [x15], #0x1\n"
+    "ld1 { v22.b }[2], [x14], #0x1\n"
+    "ld1 { v20.b }[2], [x13], #0x1\n"
+    "ld1 { v16.b }[2], [x12], #0x1\n"
+    "ld1 { v19.b }[2], [x10], #0x1\n"
+    "ld1 { v0.b }[2], [x9], #0x1\n"
+    "ld1 { v18.b }[2], [x27], #0x1\n"
+    "ld1 { v17.b }[2], [x26], #0x1\n"
+    "b 18f\n"
+    "17:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[0], [x15], #0x1\n"
+    "ld1 { v22.b }[0], [x14], #0x1\n"
+    "ld1 { v20.b }[0], [x13], #0x1\n"
+    "ld1 { v16.b }[0], [x12], #0x1\n"
+    "ld1 { v19.b }[0], [x10], #0x1\n"
+    "ld1 { v0.b }[0], [x9], #0x1\n"
+    "ld1 { v18.b }[0], [x27], #0x1\n"
+    "ld1 { v17.b }[0], [x26], #0x1\n"
+    "18:"  // Oddments: Load (B): Bit 3: End
+    "zip1 v6.16b, v27.16b, v25.16b\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "cmp x19, #0x4\n"
+    "zip2 v9.16b, v27.16b, v25.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "zip1 v5.16b, v1.16b, v23.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "zip2 v3.16b, v1.16b, v23.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "zip1 v2.16b, v31.16b, v21.16b\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    "zip2 v4.16b, v31.16b, v21.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    "zip1 v1.16b, v28.16b, v26.16b\n"
+    "zip2 v31.16b, v28.16b, v26.16b\n"
+    "zip1 v28.16b, v24.16b, v20.16b\n"
+    "zip2 v26.16b, v24.16b, v20.16b\n"
+    "zip1 v24.16b, v22.16b, v16.16b\n"
+    "zip2 v22.16b, v22.16b, v16.16b\n"
+    "zip1 v20.16b, v19.16b, v18.16b\n"
+    "zip2 v19.16b, v19.16b, v18.16b\n"
+    "zip1 v18.16b, v0.16b, v17.16b\n"
+    "zip2 v17.16b, v0.16b, v17.16b\n"
+    "zip1 v8.16b, v6.16b, v5.16b\n"
+    "zip2 v7.16b, v6.16b, v5.16b\n"
+    "zip1 v6.16b, v9.16b, v3.16b\n"
+    "str q6, [SP, #0x0]\n"
+    "zip2 v5.16b, v9.16b, v3.16b\n"
+    "str q5, [SP, #0x10]\n"
+    "zip1 v3.16b, v2.16b, v1.16b\n"
+    "zip2 v2.16b, v2.16b, v1.16b\n"
+    "zip1 v1.16b, v4.16b, v31.16b\n"
+    "str q1, [SP, #0x20]\n"
+    "zip2 v16.16b, v4.16b, v31.16b\n"
+    "str q16, [SP, #0x30]\n"
+    "zip1 v31.16b, v28.16b, v24.16b\n"
+    "zip2 v28.16b, v28.16b, v24.16b\n"
+    "zip1 v16.16b, v26.16b, v22.16b\n"
+    "str q16, [SP, #0x40]\n"
+    "zip2 v16.16b, v26.16b, v22.16b\n"
+    "str q16, [SP, #0x50]\n"
+    "zip1 v26.16b, v20.16b, v18.16b\n"
+    "zip2 v24.16b, v20.16b, v18.16b\n"
+    "zip1 v16.16b, v19.16b, v17.16b\n"
+    "str q16, [SP, #0x60]\n"
+    "zip2 v16.16b, v19.16b, v17.16b\n"
+    "str q16, [SP, #0x70]\n"
+    "mov v22.16b, v30.16b\n"
+    "mov v20.16b, v30.16b\n"
+    "mov v19.16b, v30.16b\n"
+    ".inst 0x4e8897be  // sdot v30.4s, v29.16b, v8.16b\n"
+    ".inst 0x4e8397b4  // sdot v20.4s, v29.16b, v3.16b\n"
+    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+    ".inst 0x4e83977e  // sdot v30.4s, v27.16b, v3.16b\n"
+    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+    ".inst 0x4e9f9774  // sdot v20.4s, v27.16b, v31.16b\n"
+    ".inst 0x4e8897b6  // sdot v22.4s, v29.16b, v8.16b\n"
+    ".inst 0x4e9f973e  // sdot v30.4s, v25.16b, v31.16b\n"
+    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+    ".inst 0x4e9a9734  // sdot v20.4s, v25.16b, v26.16b\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    ".inst 0x4e8397b3  // sdot v19.4s, v29.16b, v3.16b\n"
+    ".inst 0x4e839776  // sdot v22.4s, v27.16b, v3.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x4e9f9773  // sdot v19.4s, v27.16b, v31.16b\n"
+    ".inst 0x4e9f9736  // sdot v22.4s, v25.16b, v31.16b\n"
+    "and v16.16b, v30.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    ".inst 0x4e9a9733  // sdot v19.4s, v25.16b, v26.16b\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "and v18.16b, v20.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v18.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v30.4s, v30.4s, v12.4s\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "smin v30.4s, v30.4s, v11.4s\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "blt 19f\n"
+    "str s30, [x24, x11]\n"
+    "str s22, [x23, x11]\n"
+    "str s20, [x21, x11]\n"
+    "str s19, [x20, x11]\n"
+    "b 22f\n"
+    "19:"  // Oddments: Unroll 0: Oddment store
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x21, x21, x11\n"
+    "add x20, x20, x11\n"
+    "tbz x19, #1, 20f\n"
+    "st1 { v30.h }[0], [x24], #0x2\n"
+    "st1 { v22.h }[0], [x23], #0x2\n"
+    "st1 { v20.h }[0], [x21], #0x2\n"
+    "st1 { v19.h }[0], [x20], #0x2\n"
+    "tbz x19, #0, 21f\n"
+    "st1 { v30.b }[2], [x24], #0x1\n"
+    "st1 { v22.b }[2], [x23], #0x1\n"
+    "st1 { v20.b }[2], [x21], #0x1\n"
+    "st1 { v19.b }[2], [x20], #0x1\n"
+    "b 21f\n"
+    "20:"  // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+    "tbz x19, #0, 21f\n"
+    "st1 { v30.b }[0], [x24], #0x1\n"
+    "st1 { v22.b }[0], [x23], #0x1\n"
+    "st1 { v20.b }[0], [x21], #0x1\n"
+    "st1 { v19.b }[0], [x20], #0x1\n"
+    "21:"  // Oddments: Unroll 0: Oddment store: Bit 1: End
+
+    "22:"  // Oddments: Unroll 0: After oddment store
+    "add x11, x11, #0x4\n"
+    "subs x19, x19, #0x4\n"
+    "ble 34f\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "mov v22.16b, v30.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "cmp x19, #0x4\n"
+    "mov v20.16b, v30.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "mov v19.16b, v30.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    ".inst 0x4e8797be  // sdot v30.4s, v29.16b, v7.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    ".inst 0x4e8297b4  // sdot v20.4s, v29.16b, v2.16b\n"
+    "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+    ".inst 0x4e82977e  // sdot v30.4s, v27.16b, v2.16b\n"
+    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+    ".inst 0x4e9c9774  // sdot v20.4s, v27.16b, v28.16b\n"
+    ".inst 0x4e8797b6  // sdot v22.4s, v29.16b, v7.16b\n"
+    ".inst 0x4e9c973e  // sdot v30.4s, v25.16b, v28.16b\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+    ".inst 0x4e989734  // sdot v20.4s, v25.16b, v24.16b\n"
+    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+    ".inst 0x4e8297b3  // sdot v19.4s, v29.16b, v2.16b\n"
+    ".inst 0x4e829776  // sdot v22.4s, v27.16b, v2.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x4e9c9773  // sdot v19.4s, v27.16b, v28.16b\n"
+    ".inst 0x4e9c9736  // sdot v22.4s, v25.16b, v28.16b\n"
+    "and v16.16b, v30.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    ".inst 0x4e989733  // sdot v19.4s, v25.16b, v24.16b\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "and v18.16b, v20.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v18.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v30.4s, v30.4s, v12.4s\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "smin v30.4s, v30.4s, v11.4s\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "blt 23f\n"
+    "str s30, [x24, x11]\n"
+    "str s22, [x23, x11]\n"
+    "str s20, [x21, x11]\n"
+    "str s19, [x20, x11]\n"
+    "b 26f\n"
+    "23:"  // Oddments: Unroll 1: Oddment store
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x21, x21, x11\n"
+    "add x20, x20, x11\n"
+    "tbz x19, #1, 24f\n"
+    "st1 { v30.h }[0], [x24], #0x2\n"
+    "st1 { v22.h }[0], [x23], #0x2\n"
+    "st1 { v20.h }[0], [x21], #0x2\n"
+    "st1 { v19.h }[0], [x20], #0x2\n"
+    "tbz x19, #0, 25f\n"
+    "st1 { v30.b }[2], [x24], #0x1\n"
+    "st1 { v22.b }[2], [x23], #0x1\n"
+    "st1 { v20.b }[2], [x21], #0x1\n"
+    "st1 { v19.b }[2], [x20], #0x1\n"
+    "b 25f\n"
+    "24:"  // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+    "tbz x19, #0, 25f\n"
+    "st1 { v30.b }[0], [x24], #0x1\n"
+    "st1 { v22.b }[0], [x23], #0x1\n"
+    "st1 { v20.b }[0], [x21], #0x1\n"
+    "st1 { v19.b }[0], [x20], #0x1\n"
+    "25:"  // Oddments: Unroll 1: Oddment store: Bit 1: End
+
+    "26:"  // Oddments: Unroll 1: After oddment store
+    "add x11, x11, #0x4\n"
+    "subs x19, x19, #0x4\n"
+    "ble 34f\n"
+    "ldr q8, [SP, #0x0]\n"
+    "ldr q3, [SP, #0x20]\n"
+    "cmp x19, #0x4\n"
+    "ldr q31, [SP, #0x40]\n"
+    "ldr q26, [SP, #0x60]\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "mov v22.16b, v30.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "mov v20.16b, v30.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "mov v19.16b, v30.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    ".inst 0x4e8897be  // sdot v30.4s, v29.16b, v8.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    ".inst 0x4e8397b4  // sdot v20.4s, v29.16b, v3.16b\n"
+    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+    ".inst 0x4e83977e  // sdot v30.4s, v27.16b, v3.16b\n"
+    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+    ".inst 0x4e9f9774  // sdot v20.4s, v27.16b, v31.16b\n"
+    ".inst 0x4e8897b6  // sdot v22.4s, v29.16b, v8.16b\n"
+    ".inst 0x4e9f973e  // sdot v30.4s, v25.16b, v31.16b\n"
+    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+    ".inst 0x4e9a9734  // sdot v20.4s, v25.16b, v26.16b\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    ".inst 0x4e8397b3  // sdot v19.4s, v29.16b, v3.16b\n"
+    ".inst 0x4e839776  // sdot v22.4s, v27.16b, v3.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x4e9f9773  // sdot v19.4s, v27.16b, v31.16b\n"
+    ".inst 0x4e9f9736  // sdot v22.4s, v25.16b, v31.16b\n"
+    "and v16.16b, v30.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    ".inst 0x4e9a9733  // sdot v19.4s, v25.16b, v26.16b\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "and v18.16b, v20.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v18.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v30.4s, v30.4s, v12.4s\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "smin v30.4s, v30.4s, v11.4s\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "blt 27f\n"
+    "str s30, [x24, x11]\n"
+    "str s22, [x23, x11]\n"
+    "str s20, [x21, x11]\n"
+    "str s19, [x20, x11]\n"
+    "b 30f\n"
+    "27:"  // Oddments: Unroll 2: Oddment store
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x21, x21, x11\n"
+    "add x20, x20, x11\n"
+    "tbz x19, #1, 28f\n"
+    "st1 { v30.h }[0], [x24], #0x2\n"
+    "st1 { v22.h }[0], [x23], #0x2\n"
+    "st1 { v20.h }[0], [x21], #0x2\n"
+    "st1 { v19.h }[0], [x20], #0x2\n"
+    "tbz x19, #0, 29f\n"
+    "st1 { v30.b }[2], [x24], #0x1\n"
+    "st1 { v22.b }[2], [x23], #0x1\n"
+    "st1 { v20.b }[2], [x21], #0x1\n"
+    "st1 { v19.b }[2], [x20], #0x1\n"
+    "b 29f\n"
+    "28:"  // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+    "tbz x19, #0, 29f\n"
+    "st1 { v30.b }[0], [x24], #0x1\n"
+    "st1 { v22.b }[0], [x23], #0x1\n"
+    "st1 { v20.b }[0], [x21], #0x1\n"
+    "st1 { v19.b }[0], [x20], #0x1\n"
+    "29:"  // Oddments: Unroll 2: Oddment store: Bit 1: End
+
+    "30:"  // Oddments: Unroll 2: After oddment store
+    "add x11, x11, #0x4\n"
+    "subs x19, x19, #0x4\n"
+    "ble 34f\n"
+    "ldr q7, [SP, #0x10]\n"
+    "ldr q2, [SP, #0x30]\n"
+    "ldr q28, [SP, #0x50]\n"
+    "ldr q24, [SP, #0x70]\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "mov v22.16b, v30.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "mov v20.16b, v30.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "mov v19.16b, v30.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    ".inst 0x4e8797be  // sdot v30.4s, v29.16b, v7.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    ".inst 0x4e8297b4  // sdot v20.4s, v29.16b, v2.16b\n"
+    "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+    ".inst 0x4e82977e  // sdot v30.4s, v27.16b, v2.16b\n"
+    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+    ".inst 0x4e9c9774  // sdot v20.4s, v27.16b, v28.16b\n"
+    ".inst 0x4e8797b6  // sdot v22.4s, v29.16b, v7.16b\n"
+    ".inst 0x4e9c973e  // sdot v30.4s, v25.16b, v28.16b\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+    ".inst 0x4e989734  // sdot v20.4s, v25.16b, v24.16b\n"
+    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+    ".inst 0x4e8297b3  // sdot v19.4s, v29.16b, v2.16b\n"
+    ".inst 0x4e829776  // sdot v22.4s, v27.16b, v2.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x4e9c9773  // sdot v19.4s, v27.16b, v28.16b\n"
+    ".inst 0x4e9c9736  // sdot v22.4s, v25.16b, v28.16b\n"
+    "and v16.16b, v30.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    ".inst 0x4e989733  // sdot v19.4s, v25.16b, v24.16b\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "and v18.16b, v20.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v18.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v30.4s, v30.4s, v12.4s\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "smin v30.4s, v30.4s, v11.4s\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "31:"  // Oddments: Unroll 3: Oddment store
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x21, x21, x11\n"
+    "add x20, x20, x11\n"
+    "tbz x19, #1, 32f\n"
+    "st1 { v30.h }[0], [x24], #0x2\n"
+    "st1 { v22.h }[0], [x23], #0x2\n"
+    "st1 { v20.h }[0], [x21], #0x2\n"
+    "st1 { v19.h }[0], [x20], #0x2\n"
+    "tbz x19, #0, 33f\n"
+    "st1 { v30.b }[2], [x24], #0x1\n"
+    "st1 { v22.b }[2], [x23], #0x1\n"
+    "st1 { v20.b }[2], [x21], #0x1\n"
+    "st1 { v19.b }[2], [x20], #0x1\n"
+    "b 33f\n"
+    "32:"  // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+    "tbz x19, #0, 33f\n"
+    "st1 { v30.b }[0], [x24], #0x1\n"
+    "st1 { v22.b }[0], [x23], #0x1\n"
+    "st1 { v20.b }[0], [x21], #0x1\n"
+    "st1 { v19.b }[0], [x20], #0x1\n"
+    "33:"  // Oddments: Unroll 3: Oddment store: Bit 1: End
+
+    "34:"  // End
+    "add SP, SP, #0x80\n"
+    : [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
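Note on the requantization sequence that dominates both this kernel and the
u8q kernel below: each int32 accumulator is scaled by a per-channel
multiplier with SQRDMULH, corrected by the AND/SSHR/SQADD sign fixup,
shifted right with rounding by SRSHL (the per-channel shift is stored
negated), then offset, clamped and narrowed to bytes by the UZP1 pairs. A
scalar C++ sketch of one lane, assuming the usual arm_gemm::Requantize32
field meanings (the helper name is illustrative, not part of the patch):

    #include <cstdint>

    // Scalar model of the per-lane NEON output stage: SQRDMULH, sign
    // fixup, rounding right shift, output offset, clamp, narrow.
    static inline uint8_t requantize_lane(int32_t acc, int32_t multiplier,
                                          int32_t shift,  // stored as <= 0
                                          int32_t c_offset,
                                          int32_t minval, int32_t maxval)
    {
        // SQRDMULH: rounding doubling multiply, keeping the high 32 bits.
        const int64_t prod = static_cast<int64_t>(acc) * multiplier;
        int32_t hi = static_cast<int32_t>((prod + (INT64_C(1) << 30)) >> 31);

        // AND/SSHR/SQADD fixup: nudge negative values down by one before
        // the rounding shift so ties round as the vector sequence does.
        if (shift != 0 && hi < 0)
        {
            --hi;
        }

        // SRSHL with a negative shift operand is a rounding arithmetic
        // shift right by -shift.
        const int n = -shift;
        const int32_t shifted = (n == 0) ? hi : static_cast<int32_t>(
            (static_cast<int64_t>(hi) + (INT64_C(1) << (n - 1))) >> n);

        // Add the output zero point and clamp to the activation bounds.
        int32_t out = shifted + c_offset;
        out = out < minval ? minval : (out > maxval ? maxval : out);
        return static_cast<uint8_t>(out);
    }

The int8 kernel above differs only in using SDOT for the accumulation and a
signed narrowing type; the u8q kernel below additionally subtracts the
input sum times the filter zero point (MLS against b_offset) before this
stage.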
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000..05eddd1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_dot::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_dot::get_packed_size;
+
+  kern_type kernel = a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+  a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
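The struct above is a pure compile-time strategy descriptor: the framework
reads the tile geometry and the pack/size/kernel function pointers from it
and keeps no per-instance state. The geometry is internally consistent,
since a depthfirst tile needs input_rows = (output_rows - 1) * stride_rows
+ kernel_rows (here 4 = (2 - 1) * 1 + 3, and likewise for columns). A
hypothetical compile-time check one could keep next to such a struct (not
part of the patch):

    using strat = arm_conv::depthwise::a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst;
    static_assert(strat::input_rows ==
                  (strat::output_rows - 1) * strat::stride_rows + strat::kernel_rows,
                  "depthwise tile row geometry mismatch");
    static_assert(strat::input_cols ==
                  (strat::output_cols - 1) * strat::stride_cols + strat::kernel_cols,
                  "depthwise tile column geometry mismatch");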
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..22c584f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1318 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *const inptrs, uint8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
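+  // Each 0x60-byte step of [params] supplies four output channels: the
+  // bias vector (loaded into q30), three packed weight vectors
+  // (q29/q27/q25) and the per-channel requantize multiplier (q23) and
+  // shift (q21).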
+  __asm__ __volatile__(
+    "ldp x13, x12, [%x[inptrs], #0x0]\n"
+    "add SP, SP, #-0x80\n"
+    "ldp x11, x10, [%x[inptrs], #0x10]\n"
+    "mov x19, #0x1\n"
+    "ldp x9, x28, [%x[inptrs], #0x20]\n"
+    "orr x19, x19, #0x100\n"
+    "ldp x27, x26, [%x[inptrs], #0x30]\n"
+    "orr x19, x19, #0x10000\n"
+    "dup v11.4s, w19\n"
+    "ldp x25, x24, [%x[outptrs], #0x0]\n"
+    "mov x23, #0x0\n"
+    "ldp x22, x21, [%x[outptrs], #0x10]\n"
+    "lsr x20, %x[n_channels], #0x4\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ld1r { v9.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v12.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v14.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v13.4s }, [x19]\n"
+    "cbz x20, 2f\n"
+    "1:"  // Loop
+    "movi v15.4s, #0x0\n"
+    "ldr q27, [x13, x23]\n"
+    "subs x20, x20, #0x1\n"
+    "movi v10.4s, #0x0\n"
+    "ldr q1, [x12, x23]\n"
+    "ldp x13, x12, [%x[inptrs], #0x40]\n"
+    "ldr q25, [x11, x23]\n"
+    "zip1 v7.16b, v27.16b, v25.16b\n"
+    "ldr q23, [x10, x23]\n"
+    "zip2 v5.16b, v27.16b, v25.16b\n"
+    "ldp x11, x10, [%x[inptrs], #0x50]\n"
+    "ldr q31, [x9, x23]\n"
+    "zip1 v8.16b, v1.16b, v23.16b\n"
+    "ldr q28, [x28, x23]\n"
+    "zip2 v3.16b, v1.16b, v23.16b\n"
+    "ldp x9, x28, [%x[inptrs], #0x60]\n"
+    "zip1 v6.16b, v7.16b, v8.16b\n"
+    "ldr q21, [x27, x23]\n"
+    "zip2 v8.16b, v7.16b, v8.16b\n"
+    "ldr q26, [x26, x23]\n"
+    "zip1 v7.16b, v5.16b, v3.16b\n"
+    "ldp x27, x26, [%x[inptrs], #0x70]\n"
+    "zip2 v5.16b, v5.16b, v3.16b\n"
+    "ldr q24, [x13, x23]\n"
+    "ldr q22, [x12, x23]\n"
+    "zip1 v2.16b, v31.16b, v21.16b\n"
+    "zip2 v4.16b, v31.16b, v21.16b\n"
+    "ldp x13, x12, [%x[inptrs], #0x0]\n"
+    "zip1 v1.16b, v28.16b, v26.16b\n"
+    "ldr q20, [x11, x23]\n"
+    "zip2 v31.16b, v28.16b, v26.16b\n"
+    "ldr q16, [x10, x23]\n"
+    "zip1 v3.16b, v2.16b, v1.16b\n"
+    "ldp x11, x10, [%x[inptrs], #0x10]\n"
+    "zip2 v2.16b, v2.16b, v1.16b\n"
+    "ldr q19, [x9, x23]\n"
+    "zip1 v1.16b, v4.16b, v31.16b\n"
+    "ldr q0, [x28, x23]\n"
+    "zip1 v28.16b, v24.16b, v20.16b\n"
+    "ldp x9, x28, [%x[inptrs], #0x20]\n"
+    "zip2 v26.16b, v24.16b, v20.16b\n"
+    "ldr q18, [x27, x23]\n"
+    "zip1 v24.16b, v22.16b, v16.16b\n"
+    "ldr q17, [x26, x23]\n"
+    "zip2 v22.16b, v22.16b, v16.16b\n"
+    "ldp x27, x26, [%x[inptrs], #0x30]\n"
+    "zip2 v16.16b, v4.16b, v31.16b\n"
+    "str q7, [SP, #0x0]\n"
+    "zip1 v31.16b, v28.16b, v24.16b\n"
+    "str q5, [SP, #0x10]\n"
+    "zip1 v20.16b, v19.16b, v18.16b\n"
+    "str q1, [SP, #0x20]\n"
+    "zip2 v19.16b, v19.16b, v18.16b\n"
+    "str q16, [SP, #0x30]\n"
+    "zip1 v18.16b, v0.16b, v17.16b\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "zip2 v17.16b, v0.16b, v17.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "zip2 v28.16b, v28.16b, v24.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "zip1 v16.16b, v26.16b, v22.16b\n"
+    "str q16, [SP, #0x40]\n"
+    "zip2 v16.16b, v26.16b, v22.16b\n"
+    "str q16, [SP, #0x50]\n"
+    "zip1 v26.16b, v20.16b, v18.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "zip2 v24.16b, v20.16b, v18.16b\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    "zip1 v16.16b, v19.16b, v17.16b\n"
+    "str q16, [SP, #0x60]\n"
+    "zip2 v16.16b, v19.16b, v17.16b\n"
+    "str q16, [SP, #0x70]\n"
+    "mov v22.16b, v30.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "mov v20.16b, v30.16b\n"
+    "mov v19.16b, v30.16b\n"
+    ".inst 0x6e8697be  // udot v30.4s, v29.16b, v6.16b\n"
+    ".inst 0x6e8397b4  // udot v20.4s, v29.16b, v3.16b\n"
+    ".inst 0x6e83956f  // udot v15.4s, v11.16b, v3.16b\n"
+    ".inst 0x6e83977e  // udot v30.4s, v27.16b, v3.16b\n"
+    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+    ".inst 0x6e9f9774  // udot v20.4s, v27.16b, v31.16b\n"
+    ".inst 0x6e9f956f  // udot v15.4s, v11.16b, v31.16b\n"
+    ".inst 0x6e9f973e  // udot v30.4s, v25.16b, v31.16b\n"
+    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+    ".inst 0x6e9a9734  // udot v20.4s, v25.16b, v26.16b\n"
+    "mov v17.16b, v15.16b\n"
+    ".inst 0x6e86956f  // udot v15.4s, v11.16b, v6.16b\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    ".inst 0x6e9a9571  // udot v17.4s, v11.16b, v26.16b\n"
+    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    ".inst 0x6e8697b6  // udot v22.4s, v29.16b, v6.16b\n"
+    ".inst 0x6e8397b3  // udot v19.4s, v29.16b, v3.16b\n"
+    "ldr q29, [%x[params], #0x70]\n"
+    ".inst 0x6e83956a  // udot v10.4s, v11.16b, v3.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x6e839776  // udot v22.4s, v27.16b, v3.16b\n"
+    "ldr q3, [SP, #0x20]\n"
+    ".inst 0x6e9f9773  // udot v19.4s, v27.16b, v31.16b\n"
+    "ldr q27, [%x[params], #0x80]\n"
+    ".inst 0x6e9f956a  // udot v10.4s, v11.16b, v31.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    ".inst 0x6e9f9736  // udot v22.4s, v25.16b, v31.16b\n"
+    "ldr q31, [SP, #0x40]\n"
+    ".inst 0x6e9a9733  // udot v19.4s, v25.16b, v26.16b\n"
+    "ldr q25, [%x[params], #0x90]\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x6e86956a  // udot v10.4s, v11.16b, v6.16b\n"
+    "ldr q6, [SP, #0x0]\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x6e9a9571  // udot v17.4s, v11.16b, v26.16b\n"
+    "ldr q26, [SP, #0x60]\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "movi v15.4s, #0x0\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    ".inst 0x6e82956f  // udot v15.4s, v11.16b, v2.16b\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "ldr q23, [%x[params], #0xa0]\n"
+    ".inst 0x6e9c956f  // udot v15.4s, v11.16b, v28.16b\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "mov v17.16b, v15.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x25, x23]\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "ldr q30, [%x[params], #0x60]\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    ".inst 0x6e88956f  // udot v15.4s, v11.16b, v8.16b\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "ldr q21, [%x[params], #0xb0]\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    ".inst 0x6e989571  // udot v17.4s, v11.16b, v24.16b\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x22, x23]\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x24, x23]\n"
+    "mov v22.16b, v30.16b\n"
+    "mov v20.16b, v30.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    ".inst 0x6e8297b4  // udot v20.4s, v29.16b, v2.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x21, x23]\n"
+    "mov v19.16b, v30.16b\n"
+    "add x23, x23, #0x4\n"
+    ".inst 0x6e8897be  // udot v30.4s, v29.16b, v8.16b\n"
+    ".inst 0x6e9c9774  // udot v20.4s, v27.16b, v28.16b\n"
+    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+    "movi v10.4s, #0x0\n"
+    ".inst 0x6e82977e  // udot v30.4s, v27.16b, v2.16b\n"
+    ".inst 0x6e989734  // udot v20.4s, v25.16b, v24.16b\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    ".inst 0x6e9c973e  // udot v30.4s, v25.16b, v28.16b\n"
+    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+    ".inst 0x6e8897b6  // udot v22.4s, v29.16b, v8.16b\n"
+    ".inst 0x6e8297b3  // udot v19.4s, v29.16b, v2.16b\n"
+    "ldr q29, [%x[params], #0xd0]\n"
+    ".inst 0x6e82956a  // udot v10.4s, v11.16b, v2.16b\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x6e829776  // udot v22.4s, v27.16b, v2.16b\n"
+    "ldr q2, [SP, #0x30]\n"
+    ".inst 0x6e9c9773  // udot v19.4s, v27.16b, v28.16b\n"
+    "ldr q27, [%x[params], #0xe0]\n"
+    ".inst 0x6e9c956a  // udot v10.4s, v11.16b, v28.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x6e9c9736  // udot v22.4s, v25.16b, v28.16b\n"
+    "ldr q28, [SP, #0x50]\n"
+    ".inst 0x6e989733  // udot v19.4s, v25.16b, v24.16b\n"
+    "ldr q25, [%x[params], #0xf0]\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x6e88956a  // udot v10.4s, v11.16b, v8.16b\n"
+    "ldr q8, [SP, #0x10]\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x6e989571  // udot v17.4s, v11.16b, v24.16b\n"
+    "ldr q24, [SP, #0x70]\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "movi v15.4s, #0x0\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    ".inst 0x6e83956f  // udot v15.4s, v11.16b, v3.16b\n"
+    "movi v10.4s, #0x0\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "ldr q23, [%x[params], #0x100]\n"
+    ".inst 0x6e9f956f  // udot v15.4s, v11.16b, v31.16b\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "mov v17.16b, v15.16b\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "ldr q21, [%x[params], #0x110]\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x25, x23]\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "ldr q30, [%x[params], #0xc0]\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "str s20, [x22, x23]\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x24, x23]\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    ".inst 0x6e86956f  // udot v15.4s, v11.16b, v6.16b\n"
+    "mov v22.16b, v30.16b\n"
+    "mov v20.16b, v30.16b\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    ".inst 0x6e8397b4  // udot v20.4s, v29.16b, v3.16b\n"
+    ".inst 0x6e9a9571  // udot v17.4s, v11.16b, v26.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x21, x23]\n"
+    "mov v19.16b, v30.16b\n"
+    "add x23, x23, #0x4\n"
+    ".inst 0x6e8697be  // udot v30.4s, v29.16b, v6.16b\n"
+    ".inst 0x6e9f9774  // udot v20.4s, v27.16b, v31.16b\n"
+    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+    ".inst 0x6e83977e  // udot v30.4s, v27.16b, v3.16b\n"
+    ".inst 0x6e9a9734  // udot v20.4s, v25.16b, v26.16b\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    ".inst 0x6e9f973e  // udot v30.4s, v25.16b, v31.16b\n"
+    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    ".inst 0x6e8697b6  // udot v22.4s, v29.16b, v6.16b\n"
+    ".inst 0x6e8397b3  // udot v19.4s, v29.16b, v3.16b\n"
+    "ldr q29, [%x[params], #0x130]\n"
+    ".inst 0x6e83956a  // udot v10.4s, v11.16b, v3.16b\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x6e839776  // udot v22.4s, v27.16b, v3.16b\n"
+    ".inst 0x6e9f9773  // udot v19.4s, v27.16b, v31.16b\n"
+    "ldr q27, [%x[params], #0x140]\n"
+    ".inst 0x6e9f956a  // udot v10.4s, v11.16b, v31.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x6e9f9736  // udot v22.4s, v25.16b, v31.16b\n"
+    ".inst 0x6e9a9733  // udot v19.4s, v25.16b, v26.16b\n"
+    "ldr q25, [%x[params], #0x150]\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x6e86956a  // udot v10.4s, v11.16b, v6.16b\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x6e9a9571  // udot v17.4s, v11.16b, v26.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "movi v15.4s, #0x0\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    ".inst 0x6e82956f  // udot v15.4s, v11.16b, v2.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "movi v10.4s, #0x0\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    ".inst 0x6e9c956f  // udot v15.4s, v11.16b, v28.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "ldr q23, [%x[params], #0x160]\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "mov v17.16b, v15.16b\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "ldr q21, [%x[params], #0x170]\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x25, x23]\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "ldr q30, [%x[params], #0x120]\n"
+    "add %x[params], %x[params], #0x180\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x22, x23]\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    ".inst 0x6e88956f  // udot v15.4s, v11.16b, v8.16b\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x24, x23]\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "mov v22.16b, v30.16b\n"
+    "mov v20.16b, v30.16b\n"
+    ".inst 0x6e8297b4  // udot v20.4s, v29.16b, v2.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    ".inst 0x6e989571  // udot v17.4s, v11.16b, v24.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x21, x23]\n"
+    "mov v19.16b, v30.16b\n"
+    "add x23, x23, #0x4\n"
+    ".inst 0x6e8897be  // udot v30.4s, v29.16b, v8.16b\n"
+    ".inst 0x6e9c9774  // udot v20.4s, v27.16b, v28.16b\n"
+    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+    ".inst 0x6e82977e  // udot v30.4s, v27.16b, v2.16b\n"
+    ".inst 0x6e989734  // udot v20.4s, v25.16b, v24.16b\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    ".inst 0x6e9c973e  // udot v30.4s, v25.16b, v28.16b\n"
+    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+    ".inst 0x6e8897b6  // udot v22.4s, v29.16b, v8.16b\n"
+    ".inst 0x6e8297b3  // udot v19.4s, v29.16b, v2.16b\n"
+    ".inst 0x6e82956a  // udot v10.4s, v11.16b, v2.16b\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    ".inst 0x6e829776  // udot v22.4s, v27.16b, v2.16b\n"
+    ".inst 0x6e9c9773  // udot v19.4s, v27.16b, v28.16b\n"
+    ".inst 0x6e9c956a  // udot v10.4s, v11.16b, v28.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x6e9c9736  // udot v22.4s, v25.16b, v28.16b\n"
+    ".inst 0x6e989733  // udot v19.4s, v25.16b, v24.16b\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x6e88956a  // udot v10.4s, v11.16b, v8.16b\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x6e989571  // udot v17.4s, v11.16b, v24.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x25, x23]\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x24, x23]\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x22, x23]\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x21, x23]\n"
+    "add x23, x23, #0x4\n"
+    "bgt 1b\n"
+    "tst %x[n_channels], #0xf\n"
+    "beq 34f\n"
+    "2:"  // Oddments
+    "and x19, %x[n_channels], #0xf\n"
+    "add x13, x13, x23\n"
+    "add x12, x12, x23\n"
+    "add x11, x11, x23\n"
+    "add x10, x10, x23\n"
+    "add x9, x9, x23\n"
+    "add x28, x28, x23\n"
+    "add x27, x27, x23\n"
+    "add x26, x26, x23\n"
+    "tbz %x[n_channels], #3, 6f\n"
+    "ld1 { v27.d }[0], [x13], #0x8\n"
+    "ld1 { v1.d }[0], [x12], #0x8\n"
+    "ld1 { v25.d }[0], [x11], #0x8\n"
+    "ld1 { v23.d }[0], [x10], #0x8\n"
+    "ld1 { v31.d }[0], [x9], #0x8\n"
+    "ld1 { v28.d }[0], [x28], #0x8\n"
+    "ld1 { v21.d }[0], [x27], #0x8\n"
+    "ld1 { v26.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #2, 4f\n"
+    "ld1 { v27.s }[2], [x13], #0x4\n"
+    "ld1 { v1.s }[2], [x12], #0x4\n"
+    "ld1 { v25.s }[2], [x11], #0x4\n"
+    "ld1 { v23.s }[2], [x10], #0x4\n"
+    "ld1 { v31.s }[2], [x9], #0x4\n"
+    "ld1 { v28.s }[2], [x28], #0x4\n"
+    "ld1 { v21.s }[2], [x27], #0x4\n"
+    "ld1 { v26.s }[2], [x26], #0x4\n"
+    "tbz %x[n_channels], #1, 3f\n"
+    "ld1 { v27.h }[6], [x13], #0x2\n"
+    "ld1 { v1.h }[6], [x12], #0x2\n"
+    "ld1 { v25.h }[6], [x11], #0x2\n"
+    "ld1 { v23.h }[6], [x10], #0x2\n"
+    "ld1 { v31.h }[6], [x9], #0x2\n"
+    "ld1 { v28.h }[6], [x28], #0x2\n"
+    "ld1 { v21.h }[6], [x27], #0x2\n"
+    "ld1 { v26.h }[6], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[14], [x13], #0x1\n"
+    "ld1 { v1.b }[14], [x12], #0x1\n"
+    "ld1 { v25.b }[14], [x11], #0x1\n"
+    "ld1 { v23.b }[14], [x10], #0x1\n"
+    "ld1 { v31.b }[14], [x9], #0x1\n"
+    "ld1 { v28.b }[14], [x28], #0x1\n"
+    "ld1 { v21.b }[14], [x27], #0x1\n"
+    "ld1 { v26.b }[14], [x26], #0x1\n"
+    "b 10f\n"
+    "3:"  // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[12], [x13], #0x1\n"
+    "ld1 { v1.b }[12], [x12], #0x1\n"
+    "ld1 { v25.b }[12], [x11], #0x1\n"
+    "ld1 { v23.b }[12], [x10], #0x1\n"
+    "ld1 { v31.b }[12], [x9], #0x1\n"
+    "ld1 { v28.b }[12], [x28], #0x1\n"
+    "ld1 { v21.b }[12], [x27], #0x1\n"
+    "ld1 { v26.b }[12], [x26], #0x1\n"
+    "b 10f\n"
+    "4:"  // Oddments: Load (A): Bit 3: Bit 2: Unset
+    "tbz %x[n_channels], #1, 5f\n"
+    "ld1 { v27.h }[4], [x13], #0x2\n"
+    "ld1 { v1.h }[4], [x12], #0x2\n"
+    "ld1 { v25.h }[4], [x11], #0x2\n"
+    "ld1 { v23.h }[4], [x10], #0x2\n"
+    "ld1 { v31.h }[4], [x9], #0x2\n"
+    "ld1 { v28.h }[4], [x28], #0x2\n"
+    "ld1 { v21.h }[4], [x27], #0x2\n"
+    "ld1 { v26.h }[4], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[10], [x13], #0x1\n"
+    "ld1 { v1.b }[10], [x12], #0x1\n"
+    "ld1 { v25.b }[10], [x11], #0x1\n"
+    "ld1 { v23.b }[10], [x10], #0x1\n"
+    "ld1 { v31.b }[10], [x9], #0x1\n"
+    "ld1 { v28.b }[10], [x28], #0x1\n"
+    "ld1 { v21.b }[10], [x27], #0x1\n"
+    "ld1 { v26.b }[10], [x26], #0x1\n"
+    "b 10f\n"
+    "5:"  // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[8], [x13], #0x1\n"
+    "ld1 { v1.b }[8], [x12], #0x1\n"
+    "ld1 { v25.b }[8], [x11], #0x1\n"
+    "ld1 { v23.b }[8], [x10], #0x1\n"
+    "ld1 { v31.b }[8], [x9], #0x1\n"
+    "ld1 { v28.b }[8], [x28], #0x1\n"
+    "ld1 { v21.b }[8], [x27], #0x1\n"
+    "ld1 { v26.b }[8], [x26], #0x1\n"
+    "b 10f\n"
+    "6:"  // Oddments: Load (A): Bit 3: Unset
+    "tbz %x[n_channels], #2, 8f\n"
+    "ld1 { v27.s }[0], [x13], #0x4\n"
+    "ld1 { v1.s }[0], [x12], #0x4\n"
+    "ld1 { v25.s }[0], [x11], #0x4\n"
+    "ld1 { v23.s }[0], [x10], #0x4\n"
+    "ld1 { v31.s }[0], [x9], #0x4\n"
+    "ld1 { v28.s }[0], [x28], #0x4\n"
+    "ld1 { v21.s }[0], [x27], #0x4\n"
+    "ld1 { v26.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ld1 { v27.h }[2], [x13], #0x2\n"
+    "ld1 { v1.h }[2], [x12], #0x2\n"
+    "ld1 { v25.h }[2], [x11], #0x2\n"
+    "ld1 { v23.h }[2], [x10], #0x2\n"
+    "ld1 { v31.h }[2], [x9], #0x2\n"
+    "ld1 { v28.h }[2], [x28], #0x2\n"
+    "ld1 { v21.h }[2], [x27], #0x2\n"
+    "ld1 { v26.h }[2], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[6], [x13], #0x1\n"
+    "ld1 { v1.b }[6], [x12], #0x1\n"
+    "ld1 { v25.b }[6], [x11], #0x1\n"
+    "ld1 { v23.b }[6], [x10], #0x1\n"
+    "ld1 { v31.b }[6], [x9], #0x1\n"
+    "ld1 { v28.b }[6], [x28], #0x1\n"
+    "ld1 { v21.b }[6], [x27], #0x1\n"
+    "ld1 { v26.b }[6], [x26], #0x1\n"
+    "b 10f\n"
+    "7:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[4], [x13], #0x1\n"
+    "ld1 { v1.b }[4], [x12], #0x1\n"
+    "ld1 { v25.b }[4], [x11], #0x1\n"
+    "ld1 { v23.b }[4], [x10], #0x1\n"
+    "ld1 { v31.b }[4], [x9], #0x1\n"
+    "ld1 { v28.b }[4], [x28], #0x1\n"
+    "ld1 { v21.b }[4], [x27], #0x1\n"
+    "ld1 { v26.b }[4], [x26], #0x1\n"
+    "b 10f\n"
+    "8:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+    "tbz %x[n_channels], #1, 9f\n"
+    "ld1 { v27.h }[0], [x13], #0x2\n"
+    "ld1 { v1.h }[0], [x12], #0x2\n"
+    "ld1 { v25.h }[0], [x11], #0x2\n"
+    "ld1 { v23.h }[0], [x10], #0x2\n"
+    "ld1 { v31.h }[0], [x9], #0x2\n"
+    "ld1 { v28.h }[0], [x28], #0x2\n"
+    "ld1 { v21.h }[0], [x27], #0x2\n"
+    "ld1 { v26.h }[0], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[2], [x13], #0x1\n"
+    "ld1 { v1.b }[2], [x12], #0x1\n"
+    "ld1 { v25.b }[2], [x11], #0x1\n"
+    "ld1 { v23.b }[2], [x10], #0x1\n"
+    "ld1 { v31.b }[2], [x9], #0x1\n"
+    "ld1 { v28.b }[2], [x28], #0x1\n"
+    "ld1 { v21.b }[2], [x27], #0x1\n"
+    "ld1 { v26.b }[2], [x26], #0x1\n"
+    "b 10f\n"
+    "9:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v27.b }[0], [x13], #0x1\n"
+    "ld1 { v1.b }[0], [x12], #0x1\n"
+    "ld1 { v25.b }[0], [x11], #0x1\n"
+    "ld1 { v23.b }[0], [x10], #0x1\n"
+    "ld1 { v31.b }[0], [x9], #0x1\n"
+    "ld1 { v28.b }[0], [x28], #0x1\n"
+    "ld1 { v21.b }[0], [x27], #0x1\n"
+    "ld1 { v26.b }[0], [x26], #0x1\n"
+    "10:"  // Oddments: Load (A): Bit 3: End
+    "ldp x13, x12, [%x[inptrs], #0x40]\n"
+    "add x13, x13, x23\n"
+    "ldp x11, x10, [%x[inptrs], #0x50]\n"
+    "ldp x9, x28, [%x[inptrs], #0x60]\n"
+    "add x12, x12, x23\n"
+    "ldp x27, x26, [%x[inptrs], #0x70]\n"
+    "add x11, x11, x23\n"
+    "add x10, x10, x23\n"
+    "add x9, x9, x23\n"
+    "add x28, x28, x23\n"
+    "add x27, x27, x23\n"
+    "add x26, x26, x23\n"
+    "tbz %x[n_channels], #3, 14f\n"
+    "ld1 { v24.d }[0], [x13], #0x8\n"
+    "ld1 { v22.d }[0], [x12], #0x8\n"
+    "ld1 { v20.d }[0], [x11], #0x8\n"
+    "ld1 { v16.d }[0], [x10], #0x8\n"
+    "ld1 { v19.d }[0], [x9], #0x8\n"
+    "ld1 { v0.d }[0], [x28], #0x8\n"
+    "ld1 { v18.d }[0], [x27], #0x8\n"
+    "ld1 { v17.d }[0], [x26], #0x8\n"
+    "tbz %x[n_channels], #2, 12f\n"
+    "ld1 { v24.s }[2], [x13], #0x4\n"
+    "ld1 { v22.s }[2], [x12], #0x4\n"
+    "ld1 { v20.s }[2], [x11], #0x4\n"
+    "ld1 { v16.s }[2], [x10], #0x4\n"
+    "ld1 { v19.s }[2], [x9], #0x4\n"
+    "ld1 { v0.s }[2], [x28], #0x4\n"
+    "ld1 { v18.s }[2], [x27], #0x4\n"
+    "ld1 { v17.s }[2], [x26], #0x4\n"
+    "tbz %x[n_channels], #1, 11f\n"
+    "ld1 { v24.h }[6], [x13], #0x2\n"
+    "ld1 { v22.h }[6], [x12], #0x2\n"
+    "ld1 { v20.h }[6], [x11], #0x2\n"
+    "ld1 { v16.h }[6], [x10], #0x2\n"
+    "ld1 { v19.h }[6], [x9], #0x2\n"
+    "ld1 { v0.h }[6], [x28], #0x2\n"
+    "ld1 { v18.h }[6], [x27], #0x2\n"
+    "ld1 { v17.h }[6], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[14], [x13], #0x1\n"
+    "ld1 { v22.b }[14], [x12], #0x1\n"
+    "ld1 { v20.b }[14], [x11], #0x1\n"
+    "ld1 { v16.b }[14], [x10], #0x1\n"
+    "ld1 { v19.b }[14], [x9], #0x1\n"
+    "ld1 { v0.b }[14], [x28], #0x1\n"
+    "ld1 { v18.b }[14], [x27], #0x1\n"
+    "ld1 { v17.b }[14], [x26], #0x1\n"
+    "b 18f\n"
+    "11:"  // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[12], [x13], #0x1\n"
+    "ld1 { v22.b }[12], [x12], #0x1\n"
+    "ld1 { v20.b }[12], [x11], #0x1\n"
+    "ld1 { v16.b }[12], [x10], #0x1\n"
+    "ld1 { v19.b }[12], [x9], #0x1\n"
+    "ld1 { v0.b }[12], [x28], #0x1\n"
+    "ld1 { v18.b }[12], [x27], #0x1\n"
+    "ld1 { v17.b }[12], [x26], #0x1\n"
+    "b 18f\n"
+    "12:"  // Oddments: Load (B): Bit 3: Bit 2: Unset
+    "tbz %x[n_channels], #1, 13f\n"
+    "ld1 { v24.h }[4], [x13], #0x2\n"
+    "ld1 { v22.h }[4], [x12], #0x2\n"
+    "ld1 { v20.h }[4], [x11], #0x2\n"
+    "ld1 { v16.h }[4], [x10], #0x2\n"
+    "ld1 { v19.h }[4], [x9], #0x2\n"
+    "ld1 { v0.h }[4], [x28], #0x2\n"
+    "ld1 { v18.h }[4], [x27], #0x2\n"
+    "ld1 { v17.h }[4], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[10], [x13], #0x1\n"
+    "ld1 { v22.b }[10], [x12], #0x1\n"
+    "ld1 { v20.b }[10], [x11], #0x1\n"
+    "ld1 { v16.b }[10], [x10], #0x1\n"
+    "ld1 { v19.b }[10], [x9], #0x1\n"
+    "ld1 { v0.b }[10], [x28], #0x1\n"
+    "ld1 { v18.b }[10], [x27], #0x1\n"
+    "ld1 { v17.b }[10], [x26], #0x1\n"
+    "b 18f\n"
+    "13:"  // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[8], [x13], #0x1\n"
+    "ld1 { v22.b }[8], [x12], #0x1\n"
+    "ld1 { v20.b }[8], [x11], #0x1\n"
+    "ld1 { v16.b }[8], [x10], #0x1\n"
+    "ld1 { v19.b }[8], [x9], #0x1\n"
+    "ld1 { v0.b }[8], [x28], #0x1\n"
+    "ld1 { v18.b }[8], [x27], #0x1\n"
+    "ld1 { v17.b }[8], [x26], #0x1\n"
+    "b 18f\n"
+    "14:"  // Oddments: Load (B): Bit 3: Unset
+    "tbz %x[n_channels], #2, 16f\n"
+    "ld1 { v24.s }[0], [x13], #0x4\n"
+    "ld1 { v22.s }[0], [x12], #0x4\n"
+    "ld1 { v20.s }[0], [x11], #0x4\n"
+    "ld1 { v16.s }[0], [x10], #0x4\n"
+    "ld1 { v19.s }[0], [x9], #0x4\n"
+    "ld1 { v0.s }[0], [x28], #0x4\n"
+    "ld1 { v18.s }[0], [x27], #0x4\n"
+    "ld1 { v17.s }[0], [x26], #0x4\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "ld1 { v24.h }[2], [x13], #0x2\n"
+    "ld1 { v22.h }[2], [x12], #0x2\n"
+    "ld1 { v20.h }[2], [x11], #0x2\n"
+    "ld1 { v16.h }[2], [x10], #0x2\n"
+    "ld1 { v19.h }[2], [x9], #0x2\n"
+    "ld1 { v0.h }[2], [x28], #0x2\n"
+    "ld1 { v18.h }[2], [x27], #0x2\n"
+    "ld1 { v17.h }[2], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[6], [x13], #0x1\n"
+    "ld1 { v22.b }[6], [x12], #0x1\n"
+    "ld1 { v20.b }[6], [x11], #0x1\n"
+    "ld1 { v16.b }[6], [x10], #0x1\n"
+    "ld1 { v19.b }[6], [x9], #0x1\n"
+    "ld1 { v0.b }[6], [x28], #0x1\n"
+    "ld1 { v18.b }[6], [x27], #0x1\n"
+    "ld1 { v17.b }[6], [x26], #0x1\n"
+    "b 18f\n"
+    "15:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[4], [x13], #0x1\n"
+    "ld1 { v22.b }[4], [x12], #0x1\n"
+    "ld1 { v20.b }[4], [x11], #0x1\n"
+    "ld1 { v16.b }[4], [x10], #0x1\n"
+    "ld1 { v19.b }[4], [x9], #0x1\n"
+    "ld1 { v0.b }[4], [x28], #0x1\n"
+    "ld1 { v18.b }[4], [x27], #0x1\n"
+    "ld1 { v17.b }[4], [x26], #0x1\n"
+    "b 18f\n"
+    "16:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+    "tbz %x[n_channels], #1, 17f\n"
+    "ld1 { v24.h }[0], [x13], #0x2\n"
+    "ld1 { v22.h }[0], [x12], #0x2\n"
+    "ld1 { v20.h }[0], [x11], #0x2\n"
+    "ld1 { v16.h }[0], [x10], #0x2\n"
+    "ld1 { v19.h }[0], [x9], #0x2\n"
+    "ld1 { v0.h }[0], [x28], #0x2\n"
+    "ld1 { v18.h }[0], [x27], #0x2\n"
+    "ld1 { v17.h }[0], [x26], #0x2\n"
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[2], [x13], #0x1\n"
+    "ld1 { v22.b }[2], [x12], #0x1\n"
+    "ld1 { v20.b }[2], [x11], #0x1\n"
+    "ld1 { v16.b }[2], [x10], #0x1\n"
+    "ld1 { v19.b }[2], [x9], #0x1\n"
+    "ld1 { v0.b }[2], [x28], #0x1\n"
+    "ld1 { v18.b }[2], [x27], #0x1\n"
+    "ld1 { v17.b }[2], [x26], #0x1\n"
+    "b 18f\n"
+    "17:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+    "tbz %x[n_channels], #0, 18f\n"
+    "ld1 { v24.b }[0], [x13], #0x1\n"
+    "ld1 { v22.b }[0], [x12], #0x1\n"
+    "ld1 { v20.b }[0], [x11], #0x1\n"
+    "ld1 { v16.b }[0], [x10], #0x1\n"
+    "ld1 { v19.b }[0], [x9], #0x1\n"
+    "ld1 { v0.b }[0], [x28], #0x1\n"
+    "ld1 { v18.b }[0], [x27], #0x1\n"
+    "ld1 { v17.b }[0], [x26], #0x1\n"
+    "18:"  // Oddments: Load (B): Bit 3: End
+    "zip1 v7.16b, v27.16b, v25.16b\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "cmp x19, #0x4\n"
+    "zip2 v5.16b, v27.16b, v25.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "zip1 v8.16b, v1.16b, v23.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "zip2 v3.16b, v1.16b, v23.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "zip1 v2.16b, v31.16b, v21.16b\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    "zip2 v4.16b, v31.16b, v21.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    "zip1 v1.16b, v28.16b, v26.16b\n"
+    "zip2 v31.16b, v28.16b, v26.16b\n"
+    "zip1 v28.16b, v24.16b, v20.16b\n"
+    "zip2 v26.16b, v24.16b, v20.16b\n"
+    "zip1 v24.16b, v22.16b, v16.16b\n"
+    "zip2 v22.16b, v22.16b, v16.16b\n"
+    "zip1 v20.16b, v19.16b, v18.16b\n"
+    "zip2 v19.16b, v19.16b, v18.16b\n"
+    "zip1 v18.16b, v0.16b, v17.16b\n"
+    "zip2 v17.16b, v0.16b, v17.16b\n"
+    "zip1 v6.16b, v7.16b, v8.16b\n"
+    "zip2 v8.16b, v7.16b, v8.16b\n"
+    "zip1 v7.16b, v5.16b, v3.16b\n"
+    "str q7, [SP, #0x0]\n"
+    "zip2 v5.16b, v5.16b, v3.16b\n"
+    "str q5, [SP, #0x10]\n"
+    "zip1 v3.16b, v2.16b, v1.16b\n"
+    "zip2 v2.16b, v2.16b, v1.16b\n"
+    "zip1 v1.16b, v4.16b, v31.16b\n"
+    "str q1, [SP, #0x20]\n"
+    "zip2 v16.16b, v4.16b, v31.16b\n"
+    "str q16, [SP, #0x30]\n"
+    "zip1 v31.16b, v28.16b, v24.16b\n"
+    "zip2 v28.16b, v28.16b, v24.16b\n"
+    "zip1 v16.16b, v26.16b, v22.16b\n"
+    "str q16, [SP, #0x40]\n"
+    "zip2 v16.16b, v26.16b, v22.16b\n"
+    "str q16, [SP, #0x50]\n"
+    "zip1 v26.16b, v20.16b, v18.16b\n"
+    "zip2 v24.16b, v20.16b, v18.16b\n"
+    "zip1 v16.16b, v19.16b, v17.16b\n"
+    "str q16, [SP, #0x60]\n"
+    "zip2 v16.16b, v19.16b, v17.16b\n"
+    "str q16, [SP, #0x70]\n"
+    "mov v22.16b, v30.16b\n"
+    "mov v20.16b, v30.16b\n"
+    "mov v19.16b, v30.16b\n"
+    ".inst 0x6e8697be  // udot v30.4s, v29.16b, v6.16b\n"
+    ".inst 0x6e8397b4  // udot v20.4s, v29.16b, v3.16b\n"
+    "movi v15.4s, #0x0\n"
+    ".inst 0x6e83956f  // udot v15.4s, v11.16b, v3.16b\n"
+    ".inst 0x6e83977e  // udot v30.4s, v27.16b, v3.16b\n"
+    ".inst 0x6e9f9774  // udot v20.4s, v27.16b, v31.16b\n"
+    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+    ".inst 0x6e9f956f  // udot v15.4s, v11.16b, v31.16b\n"
+    ".inst 0x6e9f973e  // udot v30.4s, v25.16b, v31.16b\n"
+    ".inst 0x6e9a9734  // udot v20.4s, v25.16b, v26.16b\n"
+    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+    "mov v17.16b, v15.16b\n"
+    ".inst 0x6e86956f  // udot v15.4s, v11.16b, v6.16b\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    ".inst 0x6e9a9571  // udot v17.4s, v11.16b, v26.16b\n"
+    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    ".inst 0x6e8697b6  // udot v22.4s, v29.16b, v6.16b\n"
+    ".inst 0x6e8397b3  // udot v19.4s, v29.16b, v3.16b\n"
+    "movi v10.4s, #0x0\n"
+    ".inst 0x6e83956a  // udot v10.4s, v11.16b, v3.16b\n"
+    ".inst 0x6e839776  // udot v22.4s, v27.16b, v3.16b\n"
+    ".inst 0x6e9f9773  // udot v19.4s, v27.16b, v31.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x6e9f956a  // udot v10.4s, v11.16b, v31.16b\n"
+    ".inst 0x6e9f9736  // udot v22.4s, v25.16b, v31.16b\n"
+    ".inst 0x6e9a9733  // udot v19.4s, v25.16b, v26.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x6e86956a  // udot v10.4s, v11.16b, v6.16b\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x6e9a9571  // udot v17.4s, v11.16b, v26.16b\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "blt 19f\n"
+    "str s30, [x25, x23]\n"
+    "str s22, [x24, x23]\n"
+    "str s20, [x22, x23]\n"
+    "str s19, [x21, x23]\n"
+    "b 22f\n"
+    "19:"  // Oddments: Unroll 0: Oddment store
+    "add x25, x25, x23\n"
+    "add x24, x24, x23\n"
+    "add x22, x22, x23\n"
+    "add x21, x21, x23\n"
+    "tbz x19, #1, 20f\n"
+    "st1 { v30.h }[0], [x25], #0x2\n"
+    "st1 { v22.h }[0], [x24], #0x2\n"
+    "st1 { v20.h }[0], [x22], #0x2\n"
+    "st1 { v19.h }[0], [x21], #0x2\n"
+    "tbz x19, #0, 21f\n"
+    "st1 { v30.b }[2], [x25], #0x1\n"
+    "st1 { v22.b }[2], [x24], #0x1\n"
+    "st1 { v20.b }[2], [x22], #0x1\n"
+    "st1 { v19.b }[2], [x21], #0x1\n"
+    "b 21f\n"
+    "20:"  // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+    "tbz x19, #0, 21f\n"
+    "st1 { v30.b }[0], [x25], #0x1\n"
+    "st1 { v22.b }[0], [x24], #0x1\n"
+    "st1 { v20.b }[0], [x22], #0x1\n"
+    "st1 { v19.b }[0], [x21], #0x1\n"
+    "21:"  // Oddments: Unroll 0: Oddment store: Bit 1: End
+
+    "22:"  // Oddments: Unroll 0: After oddment store
+    "add x23, x23, #0x4\n"
+    "subs x19, x19, #0x4\n"
+    "ble 34f\n"
+    "movi v15.4s, #0x0\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    ".inst 0x6e82956f  // udot v15.4s, v11.16b, v2.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "cmp x19, #0x4\n"
+    "movi v10.4s, #0x0\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "mov v22.16b, v30.16b\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    "mov v20.16b, v30.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    "mov v19.16b, v30.16b\n"
+    ".inst 0x6e8897be  // udot v30.4s, v29.16b, v8.16b\n"
+    ".inst 0x6e8297b4  // udot v20.4s, v29.16b, v2.16b\n"
+    ".inst 0x6e9c956f  // udot v15.4s, v11.16b, v28.16b\n"
+    ".inst 0x6e82977e  // udot v30.4s, v27.16b, v2.16b\n"
+    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+    ".inst 0x6e9c9774  // udot v20.4s, v27.16b, v28.16b\n"
+    "mov v17.16b, v15.16b\n"
+    ".inst 0x6e88956f  // udot v15.4s, v11.16b, v8.16b\n"
+    ".inst 0x6e9c973e  // udot v30.4s, v25.16b, v28.16b\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    ".inst 0x6e989734  // udot v20.4s, v25.16b, v24.16b\n"
+    ".inst 0x6e989571  // udot v17.4s, v11.16b, v24.16b\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+    ".inst 0x6e8297b3  // udot v19.4s, v29.16b, v2.16b\n"
+    ".inst 0x6e82956a  // udot v10.4s, v11.16b, v2.16b\n"
+    ".inst 0x6e8897b6  // udot v22.4s, v29.16b, v8.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x6e9c9773  // udot v19.4s, v27.16b, v28.16b\n"
+    ".inst 0x6e9c956a  // udot v10.4s, v11.16b, v28.16b\n"
+    ".inst 0x6e829776  // udot v22.4s, v27.16b, v2.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    ".inst 0x6e9c9736  // udot v22.4s, v25.16b, v28.16b\n"
+    ".inst 0x6e989733  // udot v19.4s, v25.16b, v24.16b\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x6e88956a  // udot v10.4s, v11.16b, v8.16b\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x6e989571  // udot v17.4s, v11.16b, v24.16b\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "blt 23f\n"
+    "str s30, [x25, x23]\n"
+    "str s22, [x24, x23]\n"
+    "str s20, [x22, x23]\n"
+    "str s19, [x21, x23]\n"
+    "b 26f\n"
+    "23:"  // Oddments: Unroll 1: Oddment store
+    "add x25, x25, x23\n"
+    "add x24, x24, x23\n"
+    "add x22, x22, x23\n"
+    "add x21, x21, x23\n"
+    "tbz x19, #1, 24f\n"
+    "st1 { v30.h }[0], [x25], #0x2\n"
+    "st1 { v22.h }[0], [x24], #0x2\n"
+    "st1 { v20.h }[0], [x22], #0x2\n"
+    "st1 { v19.h }[0], [x21], #0x2\n"
+    "tbz x19, #0, 25f\n"
+    "st1 { v30.b }[2], [x25], #0x1\n"
+    "st1 { v22.b }[2], [x24], #0x1\n"
+    "st1 { v20.b }[2], [x22], #0x1\n"
+    "st1 { v19.b }[2], [x21], #0x1\n"
+    "b 25f\n"
+    "24:"  // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+    "tbz x19, #0, 25f\n"
+    "st1 { v30.b }[0], [x25], #0x1\n"
+    "st1 { v22.b }[0], [x24], #0x1\n"
+    "st1 { v20.b }[0], [x22], #0x1\n"
+    "st1 { v19.b }[0], [x21], #0x1\n"
+    "25:"  // Oddments: Unroll 1: Oddment store: Bit 1: End
+
+    "26:"  // Oddments: Unroll 1: After oddment store
+    "add x23, x23, #0x4\n"
+    "subs x19, x19, #0x4\n"
+    "ble 34f\n"
+    "movi v15.4s, #0x0\n"
+    "ldr q6, [SP, #0x0]\n"
+    "movi v10.4s, #0x0\n"
+    "ldr q3, [SP, #0x20]\n"
+    "cmp x19, #0x4\n"
+    ".inst 0x6e83956f  // udot v15.4s, v11.16b, v3.16b\n"
+    "ldr q31, [SP, #0x40]\n"
+    "ldr q26, [SP, #0x60]\n"
+    ".inst 0x6e9f956f  // udot v15.4s, v11.16b, v31.16b\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "mov v22.16b, v30.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "mov v20.16b, v30.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    "mov v19.16b, v30.16b\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    ".inst 0x6e8697be  // udot v30.4s, v29.16b, v6.16b\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    ".inst 0x6e8397b4  // udot v20.4s, v29.16b, v3.16b\n"
+    "mov v17.16b, v15.16b\n"
+    ".inst 0x6e86956f  // udot v15.4s, v11.16b, v6.16b\n"
+    ".inst 0x6e83977e  // udot v30.4s, v27.16b, v3.16b\n"
+    ".inst 0x6e9a9571  // udot v17.4s, v11.16b, v26.16b\n"
+    ".inst 0x6e9f9774  // udot v20.4s, v27.16b, v31.16b\n"
+    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+    ".inst 0x6e9f973e  // udot v30.4s, v25.16b, v31.16b\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    ".inst 0x6e9a9734  // udot v20.4s, v25.16b, v26.16b\n"
+    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    ".inst 0x6e8697b6  // udot v22.4s, v29.16b, v6.16b\n"
+    ".inst 0x6e8397b3  // udot v19.4s, v29.16b, v3.16b\n"
+    ".inst 0x6e83956a  // udot v10.4s, v11.16b, v3.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x6e839776  // udot v22.4s, v27.16b, v3.16b\n"
+    ".inst 0x6e9f9773  // udot v19.4s, v27.16b, v31.16b\n"
+    ".inst 0x6e9f956a  // udot v10.4s, v11.16b, v31.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    ".inst 0x6e9f9736  // udot v22.4s, v25.16b, v31.16b\n"
+    ".inst 0x6e9a9733  // udot v19.4s, v25.16b, v26.16b\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x6e86956a  // udot v10.4s, v11.16b, v6.16b\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x6e9a9571  // udot v17.4s, v11.16b, v26.16b\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "blt 27f\n"
+    "str s30, [x25, x23]\n"
+    "str s22, [x24, x23]\n"
+    "str s20, [x22, x23]\n"
+    "str s19, [x21, x23]\n"
+    "b 30f\n"
+    "27:"  // Oddments: Unroll 2: Oddment store
+    "add x25, x25, x23\n"
+    "add x24, x24, x23\n"
+    "add x22, x22, x23\n"
+    "add x21, x21, x23\n"
+    "tbz x19, #1, 28f\n"
+    "st1 { v30.h }[0], [x25], #0x2\n"
+    "st1 { v22.h }[0], [x24], #0x2\n"
+    "st1 { v20.h }[0], [x22], #0x2\n"
+    "st1 { v19.h }[0], [x21], #0x2\n"
+    "tbz x19, #0, 29f\n"
+    "st1 { v30.b }[2], [x25], #0x1\n"
+    "st1 { v22.b }[2], [x24], #0x1\n"
+    "st1 { v20.b }[2], [x22], #0x1\n"
+    "st1 { v19.b }[2], [x21], #0x1\n"
+    "b 29f\n"
+    "28:"  // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+    "tbz x19, #0, 29f\n"
+    "st1 { v30.b }[0], [x25], #0x1\n"
+    "st1 { v22.b }[0], [x24], #0x1\n"
+    "st1 { v20.b }[0], [x22], #0x1\n"
+    "st1 { v19.b }[0], [x21], #0x1\n"
+    "29:"  // Oddments: Unroll 2: Oddment store: Bit 1: End
+
+    "30:"  // Oddments: Unroll 2: After oddment store
+    "add x23, x23, #0x4\n"
+    "subs x19, x19, #0x4\n"
+    "ble 34f\n"
+    "movi v15.4s, #0x0\n"
+    "ldr q8, [SP, #0x10]\n"
+    "movi v10.4s, #0x0\n"
+    "ldr q2, [SP, #0x30]\n"
+    "ldr q28, [SP, #0x50]\n"
+    ".inst 0x6e82956f  // udot v15.4s, v11.16b, v2.16b\n"
+    "ldr q24, [SP, #0x70]\n"
+    "ldr q30, [%x[params], #0x0]\n"
+    "mov v22.16b, v30.16b\n"
+    "ldr q29, [%x[params], #0x10]\n"
+    "mov v20.16b, v30.16b\n"
+    "ldr q27, [%x[params], #0x20]\n"
+    "mov v19.16b, v30.16b\n"
+    "ldr q25, [%x[params], #0x30]\n"
+    ".inst 0x6e9c956f  // udot v15.4s, v11.16b, v28.16b\n"
+    "ldr q23, [%x[params], #0x40]\n"
+    "ldr q21, [%x[params], #0x50]\n"
+    ".inst 0x6e8897be  // udot v30.4s, v29.16b, v8.16b\n"
+    "add %x[params], %x[params], #0x60\n"
+    ".inst 0x6e8297b4  // udot v20.4s, v29.16b, v2.16b\n"
+    "mov v17.16b, v15.16b\n"
+    ".inst 0x6e88956f  // udot v15.4s, v11.16b, v8.16b\n"
+    ".inst 0x6e989571  // udot v17.4s, v11.16b, v24.16b\n"
+    ".inst 0x6e82977e  // udot v30.4s, v27.16b, v2.16b\n"
+    ".inst 0x6e9c9774  // udot v20.4s, v27.16b, v28.16b\n"
+    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+    ".inst 0x6e9c973e  // udot v30.4s, v25.16b, v28.16b\n"
+    "mls v30.4s, v15.4s, v14.4s\n"
+    ".inst 0x6e989734  // udot v20.4s, v25.16b, v24.16b\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+    "mls v20.4s, v17.4s, v14.4s\n"
+    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+    ".inst 0x6e8897b6  // udot v22.4s, v29.16b, v8.16b\n"
+    ".inst 0x6e8297b3  // udot v19.4s, v29.16b, v2.16b\n"
+    ".inst 0x6e82956a  // udot v10.4s, v11.16b, v2.16b\n"
+    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+    ".inst 0x6e829776  // udot v22.4s, v27.16b, v2.16b\n"
+    ".inst 0x6e9c9773  // udot v19.4s, v27.16b, v28.16b\n"
+    ".inst 0x6e9c956a  // udot v10.4s, v11.16b, v28.16b\n"
+    "and v18.16b, v30.16b, v21.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    ".inst 0x6e9c9736  // udot v22.4s, v25.16b, v28.16b\n"
+    ".inst 0x6e989733  // udot v19.4s, v25.16b, v24.16b\n"
+    "mov v17.16b, v10.16b\n"
+    ".inst 0x6e88956a  // udot v10.4s, v11.16b, v8.16b\n"
+    "mls v22.4s, v10.4s, v14.4s\n"
+    ".inst 0x6e989571  // udot v17.4s, v11.16b, v24.16b\n"
+    "sqadd v30.4s, v30.4s, v18.4s\n"
+    "mls v19.4s, v17.4s, v14.4s\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "and v16.16b, v20.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v30.4s, v30.4s, v9.4s\n"
+    "and v17.16b, v22.16b, v21.16b\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v12.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "and v16.16b, v19.16b, v21.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "srshl v20.4s, v20.4s, v21.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "srshl v22.4s, v22.4s, v21.4s\n"
+    "add v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v9.4s\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "srshl v19.4s, v19.4s, v21.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v9.4s\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v22.4s, v22.4s, v12.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smax v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "31:"  // Oddments: Unroll 3: Oddment store
+    "add x25, x25, x23\n"
+    "add x24, x24, x23\n"
+    "add x22, x22, x23\n"
+    "add x21, x21, x23\n"
+    "tbz x19, #1, 32f\n"
+    "st1 { v30.h }[0], [x25], #0x2\n"
+    "st1 { v22.h }[0], [x24], #0x2\n"
+    "st1 { v20.h }[0], [x22], #0x2\n"
+    "st1 { v19.h }[0], [x21], #0x2\n"
+    "tbz x19, #0, 33f\n"
+    "st1 { v30.b }[2], [x25], #0x1\n"
+    "st1 { v22.b }[2], [x24], #0x1\n"
+    "st1 { v20.b }[2], [x22], #0x1\n"
+    "st1 { v19.b }[2], [x21], #0x1\n"
+    "b 33f\n"
+    "32:"  // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+    "tbz x19, #0, 33f\n"
+    "st1 { v30.b }[0], [x25], #0x1\n"
+    "st1 { v22.b }[0], [x24], #0x1\n"
+    "st1 { v20.b }[0], [x22], #0x1\n"
+    "st1 { v19.b }[0], [x21], #0x1\n"
+    "33:"  // Oddments: Unroll 3: Oddment store: Bit 1: End
+
+    "34:"  // End
+    "add SP, SP, #0x80\n"
+    : [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..09ba75f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
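+  // For a 3x3 kernel at stride 1, input_rows = (output_rows - 1) * stride_rows
+  // + kernel_rows = 4; likewise for the columns, hence the 4x4 input tile.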
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+  a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..14e113b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1192 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const uint8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const uint8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[16];
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const uint8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
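+      // inptrs_raw is in row-major order over the 4x4 input patch; the
+      // permutation below appears to front-load the points used by the
+      // most accumulators (raw[5] is the (1,1) element), so the kernel
+      // can begin multiplying before all loads complete.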
+      inptrs[0] = inptrs_raw[5];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[3];
+      inptrs[3] = inptrs_raw[6];
+      inptrs[4] = inptrs_raw[9];
+      inptrs[5] = inptrs_raw[12];
+      inptrs[6] = inptrs_raw[15];
+      inptrs[7] = inptrs_raw[1];
+      inptrs[8] = inptrs_raw[2];
+      inptrs[9] = inptrs_raw[10];
+      inptrs[10] = inptrs_raw[4];
+      inptrs[11] = inptrs_raw[7];
+      inptrs[12] = inptrs_raw[8];
+      inptrs[13] = inptrs_raw[11];
+      inptrs[14] = inptrs_raw[13];
+      inptrs[15] = inptrs_raw[14];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
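+  // Reference only - a minimal scalar sketch of the per-channel
+  // requantization the assembly below performs, assuming gemmlowp-style
+  // fixed-point rounding (names here are illustrative, not library API):
+  //
+  //   int32_t requantize(int32_t acc, int32_t mul, int32_t shift,
+  //                      const arm_gemm::Requantize32 &qp)
+  //   {
+  //     // sqrdmulh: saturating rounding doubling high-half multiply
+  //     int32_t t = (int32_t) (((int64_t) acc * mul + (1ll << 30)) >> 31);
+  //     // srshl by a non-positive per-channel shift = rounding right shift
+  //     int32_t rs = -shift;
+  //     int32_t r = rs > 0 ? (int32_t) (((int64_t) t + (1ll << (rs - 1))) >> rs) : t;
+  //     int32_t y = r + qp.c_offset;
+  //     return std::max<int32_t>(qp.minval, std::min<int32_t>(qp.maxval, y));
+  //   }
+  //
+  // The and/sshr/sqadd sequences in the assembly additionally bias negative
+  // values before the shift so that ties round consistently; the sketch
+  // above omits that fixup.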
+  __asm__ __volatile__(
+    "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "mov x17, #0x0\n"
+    "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x15, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "add x14, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "lsr x12, x8, #0x3\n"
+    "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v14.16b }, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v9.16b }, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1r { v15.4s }, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v24.4s }, [x20]\n"
+    "ld1r { v12.4s }, [x19]\n"
+    "ldp x10, x9, [x21, #0x0]\n"
+    "ldp x28, x27, [x21, #0x10]\n"
+    "cbz x12, 3f\n"
+    "subs x12, x12, #0x1\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q13, [x19, #0x0]\n"
+    "mov v17.16b, v13.16b\n"
+    "ldr q19, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "mov v16.16b, v13.16b\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v23.16b, v13.16b\n"
+    "ldr d0, [x16, #0x0]\n"
+    "usubl v0.8h, v0.8b, v9.8b\n"
+    "mov v25.16b, v19.16b\n"
+    "ldr d1, [x16, #0x8]\n"
+    "mov v21.16b, v19.16b\n"
+    "ldr d2, [x16, #0x10]\n"
+    "usubl v1.8h, v1.8b, v9.8b\n"
+    "mov v20.16b, v19.16b\n"
+    "ldr d3, [x16, #0x18]\n"
+    "ldr d4, [x16, #0x20]\n"
+    "usubl v2.8h, v2.8b, v9.8b\n"
+    "ldr d5, [x16, #0x28]\n"
+    "usubl v3.8h, v3.8b, v9.8b\n"
+    "ldr d6, [x16, #0x30]\n"
+    "ldr d7, [x16, #0x38]\n"
+    "usubl v4.8h, v4.8b, v9.8b\n"
+    "ldr d8, [x16, #0x40]\n"
+    "usubl v5.8h, v5.8b, v9.8b\n"
+    "ldp x23, x22, [x14, #0x0]\n"
+    "usubl v6.8h, v6.8b, v9.8b\n"
+    "ldp x21, x20, [x14, #0x10]\n"
+    "usubl v7.8h, v7.8b, v9.8b\n"
+    "usubl v8.8h, v8.8b, v9.8b\n"
+    "ldr x19, [x14, #0x20]\n"
+    "ldr d31, [x23, x17]\n"
+    "usubl v31.8h, v31.8b, v14.8b\n"
+    "ldr d30, [x22, x17]\n"
+    "ldr d29, [x21, x17]\n"
+    "usubl v30.8h, v30.8b, v14.8b\n"
+    "ldr d28, [x20, x17]\n"
+    "ldr d27, [x19, x17]\n"
+    "usubl v29.8h, v29.8b, v14.8b\n"
+    "usubl v28.8h, v28.8b, v14.8b\n"
+    "usubl v27.8h, v27.8b, v14.8b\n"
+    "beq 2f\n"
+    "1:"  // Loop
+    "smlal v13.4s, v31.4h, v4.4h\n"
+    "ldr x21, [x14, #0x28]\n"
+    "add x16, x16, #0x48\n"
+    "smlal2 v19.4s, v31.8h, v4.8h\n"
+    "ldr x20, [x14, #0x30]\n"
+    "subs x12, x12, #0x1\n"
+    "smlal v17.4s, v31.4h, v3.4h\n"
+    "ldr x26, [x14, #0x38]\n"
+    "smlal2 v25.4s, v31.8h, v3.8h\n"
+    "ldr x25, [x14, #0x40]\n"
+    "smlal v16.4s, v31.4h, v1.4h\n"
+    "ldr x19, [x14, #0x48]\n"
+    "smlal2 v21.4s, v31.8h, v1.8h\n"
+    "ldr x24, [x14, #0x50]\n"
+    "smlal v23.4s, v31.4h, v0.4h\n"
+    "ldr x23, [x14, #0x58]\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ldr d31, [x21, x17]\n"
+    "usubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v30.4h, v0.4h\n"
+    "ldr x22, [x14, #0x60]\n"
+    "smlal2 v19.4s, v30.8h, v0.8h\n"
+    "ldr d30, [x19, x17]\n"
+    "usubl v30.8h, v30.8b, v14.8b\n"
+    "smlal v17.4s, v29.4h, v2.4h\n"
+    "ldr x21, [x14, #0x68]\n"
+    "smlal2 v25.4s, v29.8h, v2.8h\n"
+    "ldr d29, [x20, x17]\n"
+    "usubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v13.4s, v28.4h, v5.4h\n"
+    "ldr x20, [x14, #0x70]\n"
+    "smlal2 v19.4s, v28.8h, v5.8h\n"
+    "ldr x19, [x14, #0x78]\n"
+    "smlal v17.4s, v28.4h, v4.4h\n"
+    "ldr q26, [x13, #0x0]\n"
+    "smlal2 v25.4s, v28.8h, v4.8h\n"
+    "ldr q10, [x11, #0x0]\n"
+    "smlal v16.4s, v28.4h, v2.4h\n"
+    "ldr q11, [x13, #0x10]\n"
+    "add x13, x13, #0x20\n"
+    "smlal2 v21.4s, v28.8h, v2.8h\n"
+    "ldr q18, [x11, #0x10]\n"
+    "add x11, x11, #0x20\n"
+    "smlal v23.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x26, x17]\n"
+    "usubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v16.4s, v31.4h, v6.4h\n"
+    "smlal2 v21.4s, v31.8h, v6.8h\n"
+    "ldr d31, [x25, x17]\n"
+    "usubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v27.4h, v7.4h\n"
+    "smlal2 v19.4s, v27.8h, v7.8h\n"
+    "smlal v17.4s, v27.4h, v6.4h\n"
+    "smlal2 v25.4s, v27.8h, v6.8h\n"
+    "smlal v16.4s, v27.4h, v4.4h\n"
+    "smlal2 v21.4s, v27.8h, v4.8h\n"
+    "smlal v23.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "smlal v13.4s, v28.4h, v1.4h\n"
+    "smlal2 v19.4s, v28.8h, v1.8h\n"
+    "smlal v23.4s, v29.4h, v8.4h\n"
+    "smlal2 v20.4s, v29.8h, v8.8h\n"
+    "ldr d29, [x24, x17]\n"
+    "usubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v17.4s, v28.4h, v0.4h\n"
+    "smlal2 v25.4s, v28.8h, v0.8h\n"
+    "ldr d28, [x23, x17]\n"
+    "usubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v13.4s, v31.4h, v2.4h\n"
+    "smlal2 v19.4s, v31.8h, v2.8h\n"
+    "smlal v17.4s, v31.4h, v1.4h\n"
+    "smlal2 v25.4s, v31.8h, v1.8h\n"
+    "ldr d31, [x22, x17]\n"
+    "usubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v30.4h, v8.4h\n"
+    "smlal2 v19.4s, v30.8h, v8.8h\n"
+    "smlal v17.4s, v30.4h, v7.4h\n"
+    "smlal2 v25.4s, v30.8h, v7.8h\n"
+    "smlal v16.4s, v30.4h, v5.4h\n"
+    "smlal2 v21.4s, v30.8h, v5.8h\n"
+    "smlal v23.4s, v30.4h, v4.4h\n"
+    "smlal2 v20.4s, v30.8h, v4.8h\n"
+    "ldr d30, [x21, x17]\n"
+    "usubl v30.8h, v30.8b, v14.8b\n"
+    "smlal v13.4s, v29.4h, v3.4h\n"
+    "smlal2 v19.4s, v29.8h, v3.8h\n"
+    "smlal v16.4s, v29.4h, v0.4h\n"
+    "smlal2 v21.4s, v29.8h, v0.8h\n"
+    "ldr d29, [x20, x17]\n"
+    "usubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v17.4s, v28.4h, v5.4h\n"
+    "smlal2 v25.4s, v28.8h, v5.8h\n"
+    "smlal v23.4s, v28.4h, v2.4h\n"
+    "smlal2 v20.4s, v28.8h, v2.8h\n"
+    "ldr d28, [x19, x17]\n"
+    "add x17, x17, #0x8\n"
+    "smlal v13.4s, v31.4h, v6.4h\n"
+    "usubl v28.8h, v28.8b, v14.8b\n"
+    "smlal2 v19.4s, v31.8h, v6.8h\n"
+    "smlal v16.4s, v31.4h, v3.4h\n"
+    "smlal2 v21.4s, v31.8h, v3.8h\n"
+    "smlal v17.4s, v30.4h, v8.4h\n"
+    "smlal2 v25.4s, v30.8h, v8.8h\n"
+    "smlal v23.4s, v30.4h, v5.4h\n"
+    "smlal2 v20.4s, v30.8h, v5.8h\n"
+    "smlal v16.4s, v29.4h, v7.4h\n"
+    "smlal2 v21.4s, v29.8h, v7.8h\n"
+    "smlal v23.4s, v29.4h, v6.4h\n"
+    "smlal2 v20.4s, v29.8h, v6.8h\n"
+    "smlal v16.4s, v28.4h, v8.4h\n"
+    "smlal2 v21.4s, v28.8h, v8.8h\n"
+    "smlal v23.4s, v28.4h, v7.4h\n"
+    "smlal2 v20.4s, v28.8h, v7.8h\n"
+    "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+    "sqrdmulh v17.4s, v17.4s, v26.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v11.4s\n"
+    "and v22.16b, v13.16b, v10.16b\n"
+    "sshr v22.4s, v22.4s, #0x1f\n"
+    "and v28.16b, v19.16b, v18.16b\n"
+    "and v3.16b, v17.16b, v10.16b\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "and v6.16b, v25.16b, v18.16b\n"
+    "sqrdmulh v16.4s, v16.4s, v26.4s\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "sqrdmulh v21.4s, v21.4s, v11.4s\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "sqadd v13.4s, v13.4s, v22.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+    "and v0.16b, v16.16b, v10.16b\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "srshl v13.4s, v13.4s, v10.4s\n"
+    "sqadd v19.4s, v19.4s, v28.4s\n"
+    "sqadd v17.4s, v17.4s, v3.4s\n"
+    "sqadd v25.4s, v25.4s, v6.4s\n"
+    "and v29.16b, v21.16b, v18.16b\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "add v13.4s, v13.4s, v15.4s\n"
+    "srshl v19.4s, v19.4s, v18.4s\n"
+    "srshl v17.4s, v17.4s, v10.4s\n"
+    "srshl v25.4s, v25.4s, v18.4s\n"
+    "smin v13.4s, v13.4s, v12.4s\n"
+    "add v19.4s, v19.4s, v15.4s\n"
+    "add v17.4s, v17.4s, v15.4s\n"
+    "smax v13.4s, v13.4s, v24.4s\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "smin v17.4s, v17.4s, v12.4s\n"
+    "add v25.4s, v25.4s, v15.4s\n"
+    "smax v19.4s, v19.4s, v24.4s\n"
+    "smax v17.4s, v17.4s, v24.4s\n"
+    "smin v25.4s, v25.4s, v12.4s\n"
+    "uzp1 v13.16b, v13.16b, v19.16b\n"
+    "sqadd v16.4s, v16.4s, v0.4s\n"
+    "uzp1 v13.16b, v13.16b, v13.16b\n"
+    "str d13, [x10, x15]\n"
+    "smax v25.4s, v25.4s, v24.4s\n"
+    "sqadd v21.4s, v21.4s, v29.4s\n"
+    "srshl v16.4s, v16.4s, v10.4s\n"
+    "and v3.16b, v23.16b, v10.16b\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "uzp1 v17.16b, v17.16b, v25.16b\n"
+    "add v16.4s, v16.4s, v15.4s\n"
+    "srshl v21.4s, v21.4s, v18.4s\n"
+    "uzp1 v17.16b, v17.16b, v17.16b\n"
+    "str d17, [x9, x15]\n"
+    "smin v16.4s, v16.4s, v12.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v11.4s\n"
+    "add v21.4s, v21.4s, v15.4s\n"
+    "sqadd v23.4s, v23.4s, v3.4s\n"
+    "smax v16.4s, v16.4s, v24.4s\n"
+    "smin v21.4s, v21.4s, v12.4s\n"
+    "and v25.16b, v20.16b, v18.16b\n"
+    "sshr v25.4s, v25.4s, #0x1f\n"
+    "smax v21.4s, v21.4s, v24.4s\n"
+    "srshl v23.4s, v23.4s, v10.4s\n"
+    "uzp1 v16.16b, v16.16b, v21.16b\n"
+    "add v23.4s, v23.4s, v15.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "str d16, [x28, x15]\n"
+    "smin v23.4s, v23.4s, v12.4s\n"
+    "sqadd v20.4s, v20.4s, v25.4s\n"
+    "smax v23.4s, v23.4s, v24.4s\n"
+    "srshl v20.4s, v20.4s, v18.4s\n"
+    "add v20.4s, v20.4s, v15.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v24.4s\n"
+    "uzp1 v23.16b, v23.16b, v20.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str d23, [x27, x15]\n"
+    "add x15, x15, #0x8\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q13, [x19, #0x0]\n"
+    "mov v17.16b, v13.16b\n"
+    "ldr q19, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "mov v16.16b, v13.16b\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v23.16b, v13.16b\n"
+    "ldr d0, [x16, #0x0]\n"
+    "usubl v0.8h, v0.8b, v9.8b\n"
+    "mov v25.16b, v19.16b\n"
+    "ldr d1, [x16, #0x8]\n"
+    "mov v21.16b, v19.16b\n"
+    "ldr d2, [x16, #0x10]\n"
+    "usubl v1.8h, v1.8b, v9.8b\n"
+    "mov v20.16b, v19.16b\n"
+    "ldr d3, [x16, #0x18]\n"
+    "ldr d4, [x16, #0x20]\n"
+    "usubl v2.8h, v2.8b, v9.8b\n"
+    "ldr d5, [x16, #0x28]\n"
+    "usubl v3.8h, v3.8b, v9.8b\n"
+    "ldr d6, [x16, #0x30]\n"
+    "ldr d7, [x16, #0x38]\n"
+    "usubl v4.8h, v4.8b, v9.8b\n"
+    "ldr d8, [x16, #0x40]\n"
+    "usubl v5.8h, v5.8b, v9.8b\n"
+    "ldp x23, x22, [x14, #0x0]\n"
+    "usubl v6.8h, v6.8b, v9.8b\n"
+    "ldp x21, x20, [x14, #0x10]\n"
+    "usubl v7.8h, v7.8b, v9.8b\n"
+    "usubl v8.8h, v8.8b, v9.8b\n"
+    "ldr x19, [x14, #0x20]\n"
+    "ldr d31, [x23, x17]\n"
+    "usubl v31.8h, v31.8b, v14.8b\n"
+    "ldr d30, [x22, x17]\n"
+    "ldr d29, [x21, x17]\n"
+    "usubl v30.8h, v30.8b, v14.8b\n"
+    "ldr d28, [x20, x17]\n"
+    "ldr d27, [x19, x17]\n"
+    "usubl v29.8h, v29.8b, v14.8b\n"
+    "usubl v28.8h, v28.8b, v14.8b\n"
+    "usubl v27.8h, v27.8b, v14.8b\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    "smlal v13.4s, v31.4h, v4.4h\n"
+    "ldr x21, [x14, #0x28]\n"
+    "tst x8, #0x7\n"
+    "smlal2 v19.4s, v31.8h, v4.8h\n"
+    "ldr x20, [x14, #0x30]\n"
+    "smlal v17.4s, v31.4h, v3.4h\n"
+    "ldr x26, [x14, #0x38]\n"
+    "smlal2 v25.4s, v31.8h, v3.8h\n"
+    "ldr x25, [x14, #0x40]\n"
+    "smlal v16.4s, v31.4h, v1.4h\n"
+    "ldr x19, [x14, #0x48]\n"
+    "smlal2 v21.4s, v31.8h, v1.8h\n"
+    "ldr x24, [x14, #0x50]\n"
+    "smlal v23.4s, v31.4h, v0.4h\n"
+    "ldr x23, [x14, #0x58]\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ldr d31, [x21, x17]\n"
+    "usubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v30.4h, v0.4h\n"
+    "ldr x22, [x14, #0x60]\n"
+    "smlal2 v19.4s, v30.8h, v0.8h\n"
+    "ldr d30, [x19, x17]\n"
+    "usubl v30.8h, v30.8b, v14.8b\n"
+    "smlal v17.4s, v29.4h, v2.4h\n"
+    "ldr x21, [x14, #0x68]\n"
+    "smlal2 v25.4s, v29.8h, v2.8h\n"
+    "ldr d29, [x20, x17]\n"
+    "usubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v13.4s, v28.4h, v5.4h\n"
+    "ldr x20, [x14, #0x70]\n"
+    "smlal2 v19.4s, v28.8h, v5.8h\n"
+    "ldr x19, [x14, #0x78]\n"
+    "smlal v17.4s, v28.4h, v4.4h\n"
+    "ldr q26, [x13, #0x0]\n"
+    "smlal2 v25.4s, v28.8h, v4.8h\n"
+    "ldr q10, [x11, #0x0]\n"
+    "smlal v16.4s, v28.4h, v2.4h\n"
+    "ldr q11, [x13, #0x10]\n"
+    "add x13, x13, #0x20\n"
+    "smlal2 v21.4s, v28.8h, v2.8h\n"
+    "ldr q18, [x11, #0x10]\n"
+    "add x11, x11, #0x20\n"
+    "smlal v23.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x26, x17]\n"
+    "usubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v16.4s, v31.4h, v6.4h\n"
+    "smlal2 v21.4s, v31.8h, v6.8h\n"
+    "ldr d31, [x25, x17]\n"
+    "usubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v27.4h, v7.4h\n"
+    "smlal2 v19.4s, v27.8h, v7.8h\n"
+    "smlal v17.4s, v27.4h, v6.4h\n"
+    "smlal2 v25.4s, v27.8h, v6.8h\n"
+    "smlal v16.4s, v27.4h, v4.4h\n"
+    "smlal2 v21.4s, v27.8h, v4.8h\n"
+    "smlal v23.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "smlal v13.4s, v28.4h, v1.4h\n"
+    "smlal2 v19.4s, v28.8h, v1.8h\n"
+    "smlal v23.4s, v29.4h, v8.4h\n"
+    "smlal2 v20.4s, v29.8h, v8.8h\n"
+    "ldr d29, [x24, x17]\n"
+    "usubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v17.4s, v28.4h, v0.4h\n"
+    "smlal2 v25.4s, v28.8h, v0.8h\n"
+    "ldr d28, [x23, x17]\n"
+    "usubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v13.4s, v31.4h, v2.4h\n"
+    "smlal2 v19.4s, v31.8h, v2.8h\n"
+    "smlal v17.4s, v31.4h, v1.4h\n"
+    "smlal2 v25.4s, v31.8h, v1.8h\n"
+    "ldr d31, [x22, x17]\n"
+    "usubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v30.4h, v8.4h\n"
+    "smlal2 v19.4s, v30.8h, v8.8h\n"
+    "smlal v17.4s, v30.4h, v7.4h\n"
+    "smlal2 v25.4s, v30.8h, v7.8h\n"
+    "smlal v16.4s, v30.4h, v5.4h\n"
+    "smlal2 v21.4s, v30.8h, v5.8h\n"
+    "smlal v23.4s, v30.4h, v4.4h\n"
+    "smlal2 v20.4s, v30.8h, v4.8h\n"
+    "ldr d30, [x21, x17]\n"
+    "usubl v30.8h, v30.8b, v14.8b\n"
+    "smlal v13.4s, v29.4h, v3.4h\n"
+    "smlal2 v19.4s, v29.8h, v3.8h\n"
+    "smlal v16.4s, v29.4h, v0.4h\n"
+    "smlal2 v21.4s, v29.8h, v0.8h\n"
+    "ldr d29, [x20, x17]\n"
+    "usubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v17.4s, v28.4h, v5.4h\n"
+    "smlal2 v25.4s, v28.8h, v5.8h\n"
+    "smlal v23.4s, v28.4h, v2.4h\n"
+    "smlal2 v20.4s, v28.8h, v2.8h\n"
+    "ldr d28, [x19, x17]\n"
+    "add x17, x17, #0x8\n"
+    "smlal v13.4s, v31.4h, v6.4h\n"
+    "usubl v28.8h, v28.8b, v14.8b\n"
+    "smlal2 v19.4s, v31.8h, v6.8h\n"
+    "smlal v16.4s, v31.4h, v3.4h\n"
+    "smlal2 v21.4s, v31.8h, v3.8h\n"
+    "smlal v17.4s, v30.4h, v8.4h\n"
+    "smlal2 v25.4s, v30.8h, v8.8h\n"
+    "smlal v23.4s, v30.4h, v5.4h\n"
+    "smlal2 v20.4s, v30.8h, v5.8h\n"
+    "smlal v16.4s, v29.4h, v7.4h\n"
+    "smlal2 v21.4s, v29.8h, v7.8h\n"
+    "smlal v23.4s, v29.4h, v6.4h\n"
+    "smlal2 v20.4s, v29.8h, v6.8h\n"
+    "smlal v16.4s, v28.4h, v8.4h\n"
+    "smlal2 v21.4s, v28.8h, v8.8h\n"
+    "smlal v23.4s, v28.4h, v7.4h\n"
+    "smlal2 v20.4s, v28.8h, v7.8h\n"
+    "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+    "sqrdmulh v17.4s, v17.4s, v26.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v11.4s\n"
+    "and v22.16b, v13.16b, v10.16b\n"
+    "sshr v22.4s, v22.4s, #0x1f\n"
+    "and v28.16b, v19.16b, v18.16b\n"
+    "and v3.16b, v17.16b, v10.16b\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "and v6.16b, v25.16b, v18.16b\n"
+    "sqrdmulh v16.4s, v16.4s, v26.4s\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "sqrdmulh v21.4s, v21.4s, v11.4s\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "sqadd v13.4s, v13.4s, v22.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+    "and v0.16b, v16.16b, v10.16b\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "srshl v13.4s, v13.4s, v10.4s\n"
+    "sqadd v19.4s, v19.4s, v28.4s\n"
+    "sqadd v17.4s, v17.4s, v3.4s\n"
+    "sqadd v25.4s, v25.4s, v6.4s\n"
+    "and v29.16b, v21.16b, v18.16b\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "add v13.4s, v13.4s, v15.4s\n"
+    "srshl v19.4s, v19.4s, v18.4s\n"
+    "srshl v17.4s, v17.4s, v10.4s\n"
+    "srshl v25.4s, v25.4s, v18.4s\n"
+    "smin v13.4s, v13.4s, v12.4s\n"
+    "add v19.4s, v19.4s, v15.4s\n"
+    "add v17.4s, v17.4s, v15.4s\n"
+    "smax v13.4s, v13.4s, v24.4s\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "smin v17.4s, v17.4s, v12.4s\n"
+    "add v25.4s, v25.4s, v15.4s\n"
+    "smax v19.4s, v19.4s, v24.4s\n"
+    "smax v17.4s, v17.4s, v24.4s\n"
+    "smin v25.4s, v25.4s, v12.4s\n"
+    "uzp1 v13.16b, v13.16b, v19.16b\n"
+    "sqadd v16.4s, v16.4s, v0.4s\n"
+    "uzp1 v13.16b, v13.16b, v13.16b\n"
+    "str d13, [x10, x15]\n"
+    "smax v25.4s, v25.4s, v24.4s\n"
+    "sqadd v21.4s, v21.4s, v29.4s\n"
+    "srshl v16.4s, v16.4s, v10.4s\n"
+    "and v3.16b, v23.16b, v10.16b\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "uzp1 v17.16b, v17.16b, v25.16b\n"
+    "add v16.4s, v16.4s, v15.4s\n"
+    "srshl v21.4s, v21.4s, v18.4s\n"
+    "uzp1 v17.16b, v17.16b, v17.16b\n"
+    "str d17, [x9, x15]\n"
+    "smin v16.4s, v16.4s, v12.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v11.4s\n"
+    "add v21.4s, v21.4s, v15.4s\n"
+    "sqadd v23.4s, v23.4s, v3.4s\n"
+    "smax v16.4s, v16.4s, v24.4s\n"
+    "smin v21.4s, v21.4s, v12.4s\n"
+    "and v25.16b, v20.16b, v18.16b\n"
+    "sshr v25.4s, v25.4s, #0x1f\n"
+    "smax v21.4s, v21.4s, v24.4s\n"
+    "srshl v23.4s, v23.4s, v10.4s\n"
+    "uzp1 v16.16b, v16.16b, v21.16b\n"
+    "add v23.4s, v23.4s, v15.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "str d16, [x28, x15]\n"
+    "smin v23.4s, v23.4s, v12.4s\n"
+    "sqadd v20.4s, v20.4s, v25.4s\n"
+    "smax v23.4s, v23.4s, v24.4s\n"
+    "srshl v20.4s, v20.4s, v18.4s\n"
+    "add v20.4s, v20.4s, v15.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v24.4s\n"
+    "uzp1 v23.16b, v23.16b, v20.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str d23, [x27, x15]\n"
+    "add x15, x15, #0x8\n"
+    "beq 64f\n"
+    "add x16, x16, #0x48\n"
+    "3:"  // Oddments
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "tbz x8, #2, 5f\n"
+    "ld1 { v13.4s }, [x19], #0x10\n"
+    "tbz x8, #1, 4f\n"
+    "ld1 { v19.d }[0], [x19], #0x8\n"
+    "tbz x8, #0, 7f\n"
+    "ld1 { v19.s }[2], [x19]\n"
+    "b 7f\n"
+    "4:"  // Oddments: Load bias: Bit 2: Bit 1: Unset
+    "tbz x8, #0, 7f\n"
+    "ld1 { v19.s }[0], [x19]\n"
+    "b 7f\n"
+    "5:"  // Oddments: Load bias: Bit 2: Unset
+    "tbz x8, #1, 6f\n"
+    "ld1 { v13.d }[0], [x19], #0x8\n"
+    "tbz x8, #0, 7f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 7f\n"
+    "ld1 { v13.s }[0], [x19]\n"
+    "7:"  // Oddments: Load bias: Bit 2: End
+    "mov v17.16b, v13.16b\n"
+    "ldr d0, [x16, #0x0]\n"
+    "mov v25.16b, v19.16b\n"
+    "ldr d1, [x16, #0x8]\n"
+    "mov v16.16b, v13.16b\n"
+    "ldr d2, [x16, #0x10]\n"
+    "mov v21.16b, v19.16b\n"
+    "ldr d3, [x16, #0x18]\n"
+    "mov v23.16b, v13.16b\n"
+    "ldr d4, [x16, #0x20]\n"
+    "usubl v0.8h, v0.8b, v9.8b\n"
+    "mov v20.16b, v19.16b\n"
+    "ldr d5, [x16, #0x28]\n"
+    "usubl v1.8h, v1.8b, v9.8b\n"
+    "ldr d6, [x16, #0x30]\n"
+    "usubl v2.8h, v2.8b, v9.8b\n"
+    "ldr d7, [x16, #0x38]\n"
+    "usubl v3.8h, v3.8b, v9.8b\n"
+    "ldr d8, [x16, #0x40]\n"
+    "usubl v4.8h, v4.8b, v9.8b\n"
+    "ldp x23, x22, [x14, #0x0]\n"
+    "usubl v5.8h, v5.8b, v9.8b\n"
+    "ldp x21, x20, [x14, #0x10]\n"
+    "usubl v6.8h, v6.8b, v9.8b\n"
+    "usubl v7.8h, v7.8b, v9.8b\n"
+    "ldr x19, [x14, #0x20]\n"
+    "usubl v8.8h, v8.8b, v9.8b\n"
+    "add x23, x23, x17\n"
+    "add x22, x22, x17\n"
+    "add x21, x21, x17\n"
+    "add x20, x20, x17\n"
+    "add x19, x19, x17\n"
+    "tbz x8, #2, 9f\n"
+    "ld1 { v31.s }[0], [x23], #0x4\n"
+    "ld1 { v30.s }[0], [x22], #0x4\n"
+    "ld1 { v29.s }[0], [x21], #0x4\n"
+    "ld1 { v28.s }[0], [x20], #0x4\n"
+    "ld1 { v27.s }[0], [x19], #0x4\n"
+    "tbz x8, #1, 8f\n"
+    "ld1 { v31.h }[2], [x23], #0x2\n"
+    "ld1 { v30.h }[2], [x22], #0x2\n"
+    "ld1 { v29.h }[2], [x21], #0x2\n"
+    "ld1 { v28.h }[2], [x20], #0x2\n"
+    "ld1 { v27.h }[2], [x19], #0x2\n"
+    "tbz x8, #0, 11f\n"
+    "ld1 { v31.b }[6], [x23]\n"
+    "ld1 { v30.b }[6], [x22]\n"
+    "ld1 { v29.b }[6], [x21]\n"
+    "ld1 { v28.b }[6], [x20]\n"
+    "ld1 { v27.b }[6], [x19]\n"
+    "b 11f\n"
+    "8:"  // Oddments: Initial loads: Bit 2: Bit 1: Unset
+    "tbz x8, #0, 11f\n"
+    "ld1 { v31.b }[4], [x23]\n"
+    "ld1 { v30.b }[4], [x22]\n"
+    "ld1 { v29.b }[4], [x21]\n"
+    "ld1 { v28.b }[4], [x20]\n"
+    "ld1 { v27.b }[4], [x19]\n"
+    "b 11f\n"
+    "9:"  // Oddments: Initial loads: Bit 2: Unset
+    "tbz x8, #1, 10f\n"
+    "ld1 { v31.h }[0], [x23], #0x2\n"
+    "ld1 { v30.h }[0], [x22], #0x2\n"
+    "ld1 { v29.h }[0], [x21], #0x2\n"
+    "ld1 { v28.h }[0], [x20], #0x2\n"
+    "ld1 { v27.h }[0], [x19], #0x2\n"
+    "tbz x8, #0, 11f\n"
+    "ld1 { v31.b }[2], [x23]\n"
+    "ld1 { v30.b }[2], [x22]\n"
+    "ld1 { v29.b }[2], [x21]\n"
+    "ld1 { v28.b }[2], [x20]\n"
+    "ld1 { v27.b }[2], [x19]\n"
+    "b 11f\n"
+    "10:"  // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 11f\n"
+    "ld1 { v31.b }[0], [x23]\n"
+    "ld1 { v30.b }[0], [x22]\n"
+    "ld1 { v29.b }[0], [x21]\n"
+    "ld1 { v28.b }[0], [x20]\n"
+    "ld1 { v27.b }[0], [x19]\n"
+    "11:"  // Oddments: Initial loads: Bit 2: End
+    "ldr x21, [x14, #0x28]\n"
+    "usubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v31.4h, v4.4h\n"
+    "usubl v30.8h, v30.8b, v14.8b\n"
+    "smlal2 v19.4s, v31.8h, v4.8h\n"
+    "usubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v17.4s, v31.4h, v3.4h\n"
+    "usubl v28.8h, v28.8b, v14.8b\n"
+    "smlal2 v25.4s, v31.8h, v3.8h\n"
+    "usubl v27.8h, v27.8b, v14.8b\n"
+    "smlal v16.4s, v31.4h, v1.4h\n"
+    "add x21, x21, x17\n"
+    "smlal2 v21.4s, v31.8h, v1.8h\n"
+    "smlal v23.4s, v31.4h, v0.4h\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "smlal v13.4s, v30.4h, v0.4h\n"
+    "smlal2 v19.4s, v30.8h, v0.8h\n"
+    "smlal v17.4s, v29.4h, v2.4h\n"
+    "smlal2 v25.4s, v29.8h, v2.8h\n"
+    "smlal v13.4s, v28.4h, v5.4h\n"
+    "smlal2 v19.4s, v28.8h, v5.8h\n"
+    "smlal v17.4s, v28.4h, v4.4h\n"
+    "smlal2 v25.4s, v28.8h, v4.8h\n"
+    "smlal v16.4s, v28.4h, v2.4h\n"
+    "smlal2 v21.4s, v28.8h, v2.8h\n"
+    "smlal v23.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "tbz x8, #2, 13f\n"
+    "ld1 { v31.s }[0], [x21], #0x4\n"
+    "tbz x8, #1, 12f\n"
+    "ld1 { v31.h }[2], [x21], #0x2\n"
+    "tbz x8, #0, 15f\n"
+    "ld1 { v31.b }[6], [x21]\n"
+    "b 15f\n"
+    "12:"  // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 15f\n"
+    "ld1 { v31.b }[4], [x21]\n"
+    "b 15f\n"
+    "13:"  // Oddments: Load (3, 0): Bit 2: Unset
+    "tbz x8, #1, 14f\n"
+    "ld1 { v31.h }[0], [x21], #0x2\n"
+    "tbz x8, #0, 15f\n"
+    "ld1 { v31.b }[2], [x21]\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 15f\n"
+    "ld1 { v31.b }[0], [x21]\n"
+    "15:"  // Oddments: Load (3, 0): Bit 2: End
+    "smlal v13.4s, v27.4h, v7.4h\n"
+    "ldr x20, [x14, #0x30]\n"
+    "usubl v31.8h, v31.8b, v14.8b\n"
+    "smlal2 v19.4s, v27.8h, v7.8h\n"
+    "smlal v17.4s, v27.4h, v6.4h\n"
+    "add x20, x20, x17\n"
+    "smlal2 v25.4s, v27.8h, v6.8h\n"
+    "smlal v23.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "smlal v16.4s, v31.4h, v6.4h\n"
+    "smlal2 v21.4s, v31.8h, v6.8h\n"
+    "smlal v16.4s, v27.4h, v4.4h\n"
+    "smlal2 v21.4s, v27.8h, v4.8h\n"
+    "tbz x8, #2, 17f\n"
+    "ld1 { v29.s }[0], [x20], #0x4\n"
+    "tbz x8, #1, 16f\n"
+    "ld1 { v29.h }[2], [x20], #0x2\n"
+    "tbz x8, #0, 19f\n"
+    "ld1 { v29.b }[6], [x20]\n"
+    "b 19f\n"
+    "16:"  // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 19f\n"
+    "ld1 { v29.b }[4], [x20]\n"
+    "b 19f\n"
+    "17:"  // Oddments: Load (3, 3): Bit 2: Unset
+    "tbz x8, #1, 18f\n"
+    "ld1 { v29.h }[0], [x20], #0x2\n"
+    "tbz x8, #0, 19f\n"
+    "ld1 { v29.b }[2], [x20]\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 19f\n"
+    "ld1 { v29.b }[0], [x20]\n"
+    "19:"  // Oddments: Load (3, 3): Bit 2: End
+    "ldr x26, [x14, #0x38]\n"
+    "usubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v23.4s, v29.4h, v8.4h\n"
+    "smlal2 v20.4s, v29.8h, v8.8h\n"
+    "add x26, x26, x17\n"
+    "tbz x8, #2, 21f\n"
+    "ld1 { v28.s }[0], [x26], #0x4\n"
+    "tbz x8, #1, 20f\n"
+    "ld1 { v28.h }[2], [x26], #0x2\n"
+    "tbz x8, #0, 23f\n"
+    "ld1 { v28.b }[6], [x26]\n"
+    "b 23f\n"
+    "20:"  // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 23f\n"
+    "ld1 { v28.b }[4], [x26]\n"
+    "b 23f\n"
+    "21:"  // Oddments: Load (0, 1): Bit 2: Unset
+    "tbz x8, #1, 22f\n"
+    "ld1 { v28.h }[0], [x26], #0x2\n"
+    "tbz x8, #0, 23f\n"
+    "ld1 { v28.b }[2], [x26]\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 23f\n"
+    "ld1 { v28.b }[0], [x26]\n"
+    "23:"  // Oddments: Load (0, 1): Bit 2: End
+    "ldr x25, [x14, #0x40]\n"
+    "usubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v13.4s, v28.4h, v1.4h\n"
+    "smlal2 v19.4s, v28.8h, v1.8h\n"
+    "add x25, x25, x17\n"
+    "smlal v17.4s, v28.4h, v0.4h\n"
+    "smlal2 v25.4s, v28.8h, v0.8h\n"
+    "tbz x8, #2, 25f\n"
+    "ld1 { v31.s }[0], [x25], #0x4\n"
+    "tbz x8, #1, 24f\n"
+    "ld1 { v31.h }[2], [x25], #0x2\n"
+    "tbz x8, #0, 27f\n"
+    "ld1 { v31.b }[6], [x25]\n"
+    "b 27f\n"
+    "24:"  // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 27f\n"
+    "ld1 { v31.b }[4], [x25]\n"
+    "b 27f\n"
+    "25:"  // Oddments: Load (0, 2): Bit 2: Unset
+    "tbz x8, #1, 26f\n"
+    "ld1 { v31.h }[0], [x25], #0x2\n"
+    "tbz x8, #0, 27f\n"
+    "ld1 { v31.b }[2], [x25]\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 27f\n"
+    "ld1 { v31.b }[0], [x25]\n"
+    "27:"  // Oddments: Load (0, 2): Bit 2: End
+    "ldr x19, [x14, #0x48]\n"
+    "usubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v31.4h, v2.4h\n"
+    "smlal2 v19.4s, v31.8h, v2.8h\n"
+    "add x19, x19, x17\n"
+    "smlal v17.4s, v31.4h, v1.4h\n"
+    "smlal2 v25.4s, v31.8h, v1.8h\n"
+    "tbz x8, #2, 29f\n"
+    "ld1 { v30.s }[0], [x19], #0x4\n"
+    "tbz x8, #1, 28f\n"
+    "ld1 { v30.h }[2], [x19], #0x2\n"
+    "tbz x8, #0, 31f\n"
+    "ld1 { v30.b }[6], [x19]\n"
+    "b 31f\n"
+    "28:"  // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 31f\n"
+    "ld1 { v30.b }[4], [x19]\n"
+    "b 31f\n"
+    "29:"  // Oddments: Load (2, 2): Bit 2: Unset
+    "tbz x8, #1, 30f\n"
+    "ld1 { v30.h }[0], [x19], #0x2\n"
+    "tbz x8, #0, 31f\n"
+    "ld1 { v30.b }[2], [x19]\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 31f\n"
+    "ld1 { v30.b }[0], [x19]\n"
+    "31:"  // Oddments: Load (2, 2): Bit 2: End
+    "ldr x24, [x14, #0x50]\n"
+    "usubl v30.8h, v30.8b, v14.8b\n"
+    "smlal v13.4s, v30.4h, v8.4h\n"
+    "smlal2 v19.4s, v30.8h, v8.8h\n"
+    "add x24, x24, x17\n"
+    "smlal v17.4s, v30.4h, v7.4h\n"
+    "smlal2 v25.4s, v30.8h, v7.8h\n"
+    "smlal v16.4s, v30.4h, v5.4h\n"
+    "smlal2 v21.4s, v30.8h, v5.8h\n"
+    "smlal v23.4s, v30.4h, v4.4h\n"
+    "smlal2 v20.4s, v30.8h, v4.8h\n"
+    "tbz x8, #2, 33f\n"
+    "ld1 { v29.s }[0], [x24], #0x4\n"
+    "tbz x8, #1, 32f\n"
+    "ld1 { v29.h }[2], [x24], #0x2\n"
+    "tbz x8, #0, 35f\n"
+    "ld1 { v29.b }[6], [x24]\n"
+    "b 35f\n"
+    "32:"  // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 35f\n"
+    "ld1 { v29.b }[4], [x24]\n"
+    "b 35f\n"
+    "33:"  // Oddments: Load (1, 0): Bit 2: Unset
+    "tbz x8, #1, 34f\n"
+    "ld1 { v29.h }[0], [x24], #0x2\n"
+    "tbz x8, #0, 35f\n"
+    "ld1 { v29.b }[2], [x24]\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 35f\n"
+    "ld1 { v29.b }[0], [x24]\n"
+    "35:"  // Oddments: Load (1, 0): Bit 2: End
+    "ldr x23, [x14, #0x58]\n"
+    "usubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v13.4s, v29.4h, v3.4h\n"
+    "smlal2 v19.4s, v29.8h, v3.8h\n"
+    "add x23, x23, x17\n"
+    "smlal v16.4s, v29.4h, v0.4h\n"
+    "smlal2 v21.4s, v29.8h, v0.8h\n"
+    "tbz x8, #2, 37f\n"
+    "ld1 { v28.s }[0], [x23], #0x4\n"
+    "tbz x8, #1, 36f\n"
+    "ld1 { v28.h }[2], [x23], #0x2\n"
+    "tbz x8, #0, 39f\n"
+    "ld1 { v28.b }[6], [x23]\n"
+    "b 39f\n"
+    "36:"  // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 39f\n"
+    "ld1 { v28.b }[4], [x23]\n"
+    "b 39f\n"
+    "37:"  // Oddments: Load (1, 3): Bit 2: Unset
+    "tbz x8, #1, 38f\n"
+    "ld1 { v28.h }[0], [x23], #0x2\n"
+    "tbz x8, #0, 39f\n"
+    "ld1 { v28.b }[2], [x23]\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 39f\n"
+    "ld1 { v28.b }[0], [x23]\n"
+    "39:"  // Oddments: Load (1, 3): Bit 2: End
+    "ldr x22, [x14, #0x60]\n"
+    "usubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v17.4s, v28.4h, v5.4h\n"
+    "smlal2 v25.4s, v28.8h, v5.8h\n"
+    "add x22, x22, x17\n"
+    "smlal v23.4s, v28.4h, v2.4h\n"
+    "smlal2 v20.4s, v28.8h, v2.8h\n"
+    "tbz x8, #2, 41f\n"
+    "ld1 { v31.s }[0], [x22], #0x4\n"
+    "tbz x8, #1, 40f\n"
+    "ld1 { v31.h }[2], [x22], #0x2\n"
+    "tbz x8, #0, 43f\n"
+    "ld1 { v31.b }[6], [x22]\n"
+    "b 43f\n"
+    "40:"  // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 43f\n"
+    "ld1 { v31.b }[4], [x22]\n"
+    "b 43f\n"
+    "41:"  // Oddments: Load (2, 0): Bit 2: Unset
+    "tbz x8, #1, 42f\n"
+    "ld1 { v31.h }[0], [x22], #0x2\n"
+    "tbz x8, #0, 43f\n"
+    "ld1 { v31.b }[2], [x22]\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 43f\n"
+    "ld1 { v31.b }[0], [x22]\n"
+    "43:"  // Oddments: Load (2, 0): Bit 2: End
+    "ldr x21, [x14, #0x68]\n"
+    "usubl v31.8h, v31.8b, v14.8b\n"
+    "smlal v13.4s, v31.4h, v6.4h\n"
+    "smlal2 v19.4s, v31.8h, v6.8h\n"
+    "add x21, x21, x17\n"
+    "smlal v16.4s, v31.4h, v3.4h\n"
+    "smlal2 v21.4s, v31.8h, v3.8h\n"
+    "tbz x8, #2, 45f\n"
+    "ld1 { v30.s }[0], [x21], #0x4\n"
+    "tbz x8, #1, 44f\n"
+    "ld1 { v30.h }[2], [x21], #0x2\n"
+    "tbz x8, #0, 47f\n"
+    "ld1 { v30.b }[6], [x21]\n"
+    "b 47f\n"
+    "44:"  // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 47f\n"
+    "ld1 { v30.b }[4], [x21]\n"
+    "b 47f\n"
+    "45:"  // Oddments: Load (2, 3): Bit 2: Unset
+    "tbz x8, #1, 46f\n"
+    "ld1 { v30.h }[0], [x21], #0x2\n"
+    "tbz x8, #0, 47f\n"
+    "ld1 { v30.b }[2], [x21]\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 47f\n"
+    "ld1 { v30.b }[0], [x21]\n"
+    "47:"  // Oddments: Load (2, 3): Bit 2: End
+    "ldr x20, [x14, #0x70]\n"
+    "usubl v30.8h, v30.8b, v14.8b\n"
+    "smlal v17.4s, v30.4h, v8.4h\n"
+    "smlal2 v25.4s, v30.8h, v8.8h\n"
+    "add x20, x20, x17\n"
+    "smlal v23.4s, v30.4h, v5.4h\n"
+    "smlal2 v20.4s, v30.8h, v5.8h\n"
+    "tbz x8, #2, 49f\n"
+    "ld1 { v29.s }[0], [x20], #0x4\n"
+    "tbz x8, #1, 48f\n"
+    "ld1 { v29.h }[2], [x20], #0x2\n"
+    "tbz x8, #0, 51f\n"
+    "ld1 { v29.b }[6], [x20]\n"
+    "b 51f\n"
+    "48:"  // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 51f\n"
+    "ld1 { v29.b }[4], [x20]\n"
+    "b 51f\n"
+    "49:"  // Oddments: Load (3, 1): Bit 2: Unset
+    "tbz x8, #1, 50f\n"
+    "ld1 { v29.h }[0], [x20], #0x2\n"
+    "tbz x8, #0, 51f\n"
+    "ld1 { v29.b }[2], [x20]\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 51f\n"
+    "ld1 { v29.b }[0], [x20]\n"
+    "51:"  // Oddments: Load (3, 1): Bit 2: End
+    "ldr x19, [x14, #0x78]\n"
+    "usubl v29.8h, v29.8b, v14.8b\n"
+    "smlal v16.4s, v29.4h, v7.4h\n"
+    "smlal2 v21.4s, v29.8h, v7.8h\n"
+    "add x19, x19, x17\n"
+    "smlal v23.4s, v29.4h, v6.4h\n"
+    "smlal2 v20.4s, v29.8h, v6.8h\n"
+    "tbz x8, #2, 53f\n"
+    "ld1 { v28.s }[0], [x19], #0x4\n"
+    "tbz x8, #1, 52f\n"
+    "ld1 { v28.h }[2], [x19], #0x2\n"
+    "tbz x8, #0, 55f\n"
+    "ld1 { v28.b }[6], [x19]\n"
+    "b 55f\n"
+    "52:"  // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 55f\n"
+    "ld1 { v28.b }[4], [x19]\n"
+    "b 55f\n"
+    "53:"  // Oddments: Load (3, 2): Bit 2: Unset
+    "tbz x8, #1, 54f\n"
+    "ld1 { v28.h }[0], [x19], #0x2\n"
+    "tbz x8, #0, 55f\n"
+    "ld1 { v28.b }[2], [x19]\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 55f\n"
+    "ld1 { v28.b }[0], [x19]\n"
+    "55:"  // Oddments: Load (3, 2): Bit 2: End
+    "usubl v28.8h, v28.8b, v14.8b\n"
+    "smlal v16.4s, v28.4h, v8.4h\n"
+    "smlal2 v21.4s, v28.8h, v8.8h\n"
+    "smlal v23.4s, v28.4h, v7.4h\n"
+    "smlal2 v20.4s, v28.8h, v7.8h\n"
+    "tbz x8, #2, 57f\n"
+    "ld1 { v26.4s }, [x13], #0x10\n"
+    "ld1 { v10.4s }, [x11], #0x10\n"
+    "tbz x8, #1, 56f\n"
+    "ld1 { v11.d }[0], [x13], #0x8\n"
+    "ld1 { v18.d }[0], [x11], #0x8\n"
+    "tbz x8, #0, 59f\n"
+    "ld1 { v11.s }[2], [x13]\n"
+    "ld1 { v18.s }[2], [x11]\n"
+    "b 59f\n"
+    "56:"  // Oddments: Load requant params: Bit 2: Bit 1: Unset
+    "tbz x8, #0, 59f\n"
+    "ld1 { v11.s }[0], [x13]\n"
+    "ld1 { v18.s }[0], [x11]\n"
+    "b 59f\n"
+    "57:"  // Oddments: Load requant params: Bit 2: Unset
+    "tbz x8, #1, 58f\n"
+    "ld1 { v26.d }[0], [x13], #0x8\n"
+    "ld1 { v10.d }[0], [x11], #0x8\n"
+    "tbz x8, #0, 59f\n"
+    "ld1 { v26.s }[2], [x13]\n"
+    "ld1 { v10.s }[2], [x11]\n"
+    "b 59f\n"
+    "58:"  // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 59f\n"
+    "ld1 { v26.s }[0], [x13]\n"
+    "ld1 { v10.s }[0], [x11]\n"
+    "59:"  // Oddments: Load requant params: Bit 2: End
+    "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+    "add x10, x10, x15\n"
+    "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+    "add x9, x9, x15\n"
+    "sqrdmulh v17.4s, v17.4s, v26.4s\n"
+    "add x28, x28, x15\n"
+    "sqrdmulh v25.4s, v25.4s, v11.4s\n"
+    "add x27, x27, x15\n"
+    "sqrdmulh v16.4s, v16.4s, v26.4s\n"
+    "and v22.16b, v13.16b, v10.16b\n"
+    "sshr v22.4s, v22.4s, #0x1f\n"
+    "and v28.16b, v19.16b, v18.16b\n"
+    "and v3.16b, v17.16b, v10.16b\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "and v6.16b, v25.16b, v18.16b\n"
+    "and v0.16b, v16.16b, v10.16b\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "sqrdmulh v21.4s, v21.4s, v11.4s\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "sqadd v13.4s, v13.4s, v22.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "sqrdmulh v20.4s, v20.4s, v11.4s\n"
+    "sqadd v19.4s, v19.4s, v28.4s\n"
+    "sqadd v17.4s, v17.4s, v3.4s\n"
+    "srshl v13.4s, v13.4s, v10.4s\n"
+    "sqadd v25.4s, v25.4s, v6.4s\n"
+    "srshl v19.4s, v19.4s, v18.4s\n"
+    "srshl v17.4s, v17.4s, v10.4s\n"
+    "add v13.4s, v13.4s, v15.4s\n"
+    "srshl v25.4s, v25.4s, v18.4s\n"
+    "add v19.4s, v19.4s, v15.4s\n"
+    "smin v13.4s, v13.4s, v12.4s\n"
+    "add v17.4s, v17.4s, v15.4s\n"
+    "smin v19.4s, v19.4s, v12.4s\n"
+    "smax v13.4s, v13.4s, v24.4s\n"
+    "smin v17.4s, v17.4s, v12.4s\n"
+    "smax v19.4s, v19.4s, v24.4s\n"
+    "add v25.4s, v25.4s, v15.4s\n"
+    "smax v17.4s, v17.4s, v24.4s\n"
+    "uzp1 v13.16b, v13.16b, v19.16b\n"
+    "smin v25.4s, v25.4s, v12.4s\n"
+    "uzp1 v13.16b, v13.16b, v13.16b\n"
+    "sqadd v16.4s, v16.4s, v0.4s\n"
+    "smax v25.4s, v25.4s, v24.4s\n"
+    "and v29.16b, v21.16b, v18.16b\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "uzp1 v17.16b, v17.16b, v25.16b\n"
+    "srshl v16.4s, v16.4s, v10.4s\n"
+    "and v3.16b, v23.16b, v10.16b\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "uzp1 v17.16b, v17.16b, v17.16b\n"
+    "add v16.4s, v16.4s, v15.4s\n"
+    "sqadd v21.4s, v21.4s, v29.4s\n"
+    "and v25.16b, v20.16b, v18.16b\n"
+    "sshr v25.4s, v25.4s, #0x1f\n"
+    "smin v16.4s, v16.4s, v12.4s\n"
+    "srshl v21.4s, v21.4s, v18.4s\n"
+    "sqadd v23.4s, v23.4s, v3.4s\n"
+    "smax v16.4s, v16.4s, v24.4s\n"
+    "add v21.4s, v21.4s, v15.4s\n"
+    "srshl v23.4s, v23.4s, v10.4s\n"
+    "sqadd v20.4s, v20.4s, v25.4s\n"
+    "smin v21.4s, v21.4s, v12.4s\n"
+    "add v23.4s, v23.4s, v15.4s\n"
+    "srshl v20.4s, v20.4s, v18.4s\n"
+    "smax v21.4s, v21.4s, v24.4s\n"
+    "smin v23.4s, v23.4s, v12.4s\n"
+    "uzp1 v16.16b, v16.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v15.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "smax v23.4s, v23.4s, v24.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v24.4s\n"
+    "uzp1 v23.16b, v23.16b, v20.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "tbz x8, #2, 61f\n"
+    "st1 { v13.s }[0], [x10], #0x4\n"
+    "st1 { v17.s }[0], [x9], #0x4\n"
+    "st1 { v16.s }[0], [x28], #0x4\n"
+    "st1 { v23.s }[0], [x27], #0x4\n"
+    "tbz x8, #1, 60f\n"
+    "st1 { v13.h }[2], [x10], #0x2\n"
+    "st1 { v17.h }[2], [x9], #0x2\n"
+    "st1 { v16.h }[2], [x28], #0x2\n"
+    "st1 { v23.h }[2], [x27], #0x2\n"
+    "tbz x8, #0, 63f\n"
+    "st1 { v13.b }[6], [x10], #0x1\n"
+    "st1 { v17.b }[6], [x9], #0x1\n"
+    "st1 { v16.b }[6], [x28], #0x1\n"
+    "st1 { v23.b }[6], [x27], #0x1\n"
+    "b 63f\n"
+    "60:"  // Oddments: Bit 2: Bit 1: Unset
+    "tbz x8, #0, 63f\n"
+    "st1 { v13.b }[4], [x10], #0x1\n"
+    "st1 { v17.b }[4], [x9], #0x1\n"
+    "st1 { v16.b }[4], [x28], #0x1\n"
+    "st1 { v23.b }[4], [x27], #0x1\n"
+    "b 63f\n"
+    "61:"  // Oddments: Bit 2: Unset
+    "tbz x8, #1, 62f\n"
+    "st1 { v13.h }[0], [x10], #0x2\n"
+    "st1 { v17.h }[0], [x9], #0x2\n"
+    "st1 { v16.h }[0], [x28], #0x2\n"
+    "st1 { v23.h }[0], [x27], #0x2\n"
+    "tbz x8, #0, 63f\n"
+    "st1 { v13.b }[2], [x10], #0x1\n"
+    "st1 { v17.b }[2], [x9], #0x1\n"
+    "st1 { v16.b }[2], [x28], #0x1\n"
+    "st1 { v23.b }[2], [x27], #0x1\n"
+    "b 63f\n"
+    "62:"  // Oddments: Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 63f\n"
+    "st1 { v13.b }[0], [x10], #0x1\n"
+    "st1 { v17.b }[0], [x9], #0x1\n"
+    "st1 { v16.b }[0], [x28], #0x1\n"
+    "st1 { v23.b }[0], [x27], #0x1\n"
+    "63:"  // Oddments: Bit 2: End
+
+    "64:"  // End
+
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
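
The output stage repeated throughout these kernels is the usual gemmlowp-style
requantization of each s32 accumulator lane: a saturating rounding doubling
high multiply (sqrdmulh) by the per-channel multiplier, a rounding right shift
(the and/sshr/sqadd sign fixup followed by srshl with a negated shift), the
c_offset addition, a clamp to [minval, maxval] and a narrowing uzp1. A minimal
scalar sketch of one lane, assuming a positive right-shift count (the helper
names are illustrative, not library API):

    #include <algorithm>
    #include <cstdint>
    #include <limits>

    // Scalar model of sqrdmulh: saturating rounding doubling high multiply.
    static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if (a == b && a == std::numeric_limits<int32_t>::min())
        {
            return std::numeric_limits<int32_t>::max();  // the only saturating case
        }
        const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
        return static_cast<int32_t>((ab + (int64_t{1} << 30)) >> 31);
    }

    // Scalar model of the and/sshr/sqadd fixup followed by srshl with a
    // negated shift: a rounding right shift whose halves round away from zero.
    static int32_t rounding_right_shift(int32_t x, int shift)
    {
        if (shift == 0)
        {
            return x;
        }
        x += (x < 0) ? -1 : 0;                     // and/sshr/sqadd fixup
        return (x + (1 << (shift - 1))) >> shift;  // srshl rounding behaviour
    }

    // Requantize one s32 accumulator lane to u8 (illustrative helper).
    static uint8_t requantize_lane(int32_t acc, int32_t mul, int shift,
                                   int32_t c_offset, int32_t minval, int32_t maxval)
    {
        int32_t v = rounding_right_shift(rounding_doubling_high_mul(acc, mul), shift);
        v += c_offset;                              // add c_offset
        v = std::max(std::min(v, maxval), minval);  // smin/smax clamp
        return static_cast<uint8_t>(v);             // uzp1 narrowing
    }

The sign fixup before srshl converts its round-half-up behaviour into
round-half-away-from-zero, matching the reference quantization scheme.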
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..44817db
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+  a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
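
For reference, the tile geometry in these strategy descriptors is
self-consistent: a depthfirst kernel reads (output - 1) * stride + kernel
input elements per dimension, which is why a 2x2 output at stride 2 under a
3x3 kernel needs the 5x5 input tile declared above. A compile-time sketch of
that relation (the helper is illustrative, not part of the library):

    // Illustrative only: input extent consumed per output tile, one dimension.
    constexpr unsigned int input_extent(unsigned int output, unsigned int stride, unsigned int kernel)
    {
        return (output - 1) * stride + kernel;
    }

    static_assert(input_extent(2, 2, 3) == 5,
                  "2x2 output at stride 2 with a 3x3 kernel reads a 5x5 tile");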
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..ccdde41
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1423 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const uint8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const uint8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
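+    // Filled in by the constructor: the 25 row-major pointers of the 5x5
+    // input window, permuted into the order the assembly consumes them.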
+    const uint8_t *inptrs[25];
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const uint8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      inptrs[0] = inptrs_raw[12];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[1];
+      inptrs[3] = inptrs_raw[3];
+      inptrs[4] = inptrs_raw[4];
+      inptrs[5] = inptrs_raw[5];
+      inptrs[6] = inptrs_raw[6];
+      inptrs[7] = inptrs_raw[2];
+      inptrs[8] = inptrs_raw[8];
+      inptrs[9] = inptrs_raw[9];
+      inptrs[10] = inptrs_raw[7];
+      inptrs[11] = inptrs_raw[15];
+      inptrs[12] = inptrs_raw[10];
+      inptrs[13] = inptrs_raw[16];
+      inptrs[14] = inptrs_raw[11];
+      inptrs[15] = inptrs_raw[18];
+      inptrs[16] = inptrs_raw[13];
+      inptrs[17] = inptrs_raw[19];
+      inptrs[18] = inptrs_raw[20];
+      inptrs[19] = inptrs_raw[14];
+      inptrs[20] = inptrs_raw[21];
+      inptrs[21] = inptrs_raw[17];
+      inptrs[22] = inptrs_raw[23];
+      inptrs[23] = inptrs_raw[22];
+      inptrs[24] = inptrs_raw[24];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "mov x5, #0x0\n"
+    "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x7, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "add x8, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "lsr x16, x4, #0x3\n"
+    "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v12.16b }, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v13.16b }, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1r { v11.4s }, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v19.4s }, [x20]\n"
+    "ld1r { v14.4s }, [x19]\n"
+    "ldp x14, x13, [x21, #0x0]\n"
+    "ldp x12, x11, [x21, #0x10]\n"
+    "cbz x16, 3f\n"
+    "subs x16, x16, #0x1\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q15, [x19, #0x0]\n"
+    "mov v20.16b, v15.16b\n"
+    "ldr q10, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "mov v16.16b, v15.16b\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v17.16b, v15.16b\n"
+    "ldr d0, [x6, #0x0]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "mov v23.16b, v10.16b\n"
+    "ldr d1, [x6, #0x8]\n"
+    "mov v22.16b, v10.16b\n"
+    "ldr d2, [x6, #0x10]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "mov v18.16b, v10.16b\n"
+    "ldr d3, [x6, #0x18]\n"
+    "ldr d4, [x6, #0x20]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "ldr d5, [x6, #0x28]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "ldr d6, [x6, #0x30]\n"
+    "ldr d7, [x6, #0x38]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "ldr d8, [x6, #0x40]\n"
+    "usubl v5.8h, v5.8b, v13.8b\n"
+    "ldp x26, x25, [x8, #0x0]\n"
+    "usubl v6.8h, v6.8b, v13.8b\n"
+    "ldp x24, x23, [x8, #0x10]\n"
+    "usubl v7.8h, v7.8b, v13.8b\n"
+    "usubl v8.8h, v8.8b, v13.8b\n"
+    "ldp x22, x21, [x8, #0x20]\n"
+    "ldp x20, x19, [x8, #0x30]\n"
+    "ldr d31, [x26, x5]\n"
+    "usubl v31.8h, v31.8b, v12.8b\n"
+    "ldr d30, [x25, x5]\n"
+    "ldr d29, [x24, x5]\n"
+    "usubl v30.8h, v30.8b, v12.8b\n"
+    "ldr d28, [x23, x5]\n"
+    "ldr d27, [x22, x5]\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "ldr d26, [x21, x5]\n"
+    "usubl v28.8h, v28.8b, v12.8b\n"
+    "ldr d25, [x20, x5]\n"
+    "ldr d24, [x19, x5]\n"
+    "usubl v27.8h, v27.8b, v12.8b\n"
+    "usubl v26.8h, v26.8b, v12.8b\n"
+    "usubl v25.8h, v25.8b, v12.8b\n"
+    "usubl v24.8h, v24.8b, v12.8b\n"
+    "beq 2f\n"
+    "1:"  // Loop
+    "smlal v15.4s, v31.4h, v8.4h\n"
+    "ldr x23, [x8, #0x40]\n"
+    "add x6, x6, #0x48\n"
+    "smlal2 v10.4s, v31.8h, v8.8h\n"
+    "ldr x22, [x8, #0x48]\n"
+    "subs x16, x16, #0x1\n"
+    "smlal v20.4s, v31.4h, v6.4h\n"
+    "ldr x21, [x8, #0x50]\n"
+    "smlal2 v23.4s, v31.8h, v6.8h\n"
+    "ldr x20, [x8, #0x58]\n"
+    "smlal v16.4s, v31.4h, v2.4h\n"
+    "ldr x19, [x8, #0x60]\n"
+    "smlal2 v22.4s, v31.8h, v2.8h\n"
+    "ldr x10, [x8, #0x68]\n"
+    "smlal v17.4s, v31.4h, v0.4h\n"
+    "ldr x9, [x8, #0x70]\n"
+    "smlal2 v18.4s, v31.8h, v0.8h\n"
+    "ldr x28, [x8, #0x78]\n"
+    "smlal v15.4s, v30.4h, v0.4h\n"
+    "ldr x27, [x8, #0x80]\n"
+    "smlal2 v10.4s, v30.8h, v0.8h\n"
+    "ldr x26, [x8, #0x88]\n"
+    "smlal v20.4s, v28.4h, v1.4h\n"
+    "ldr x25, [x8, #0x90]\n"
+    "smlal2 v23.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x22, x5]\n"
+    "usubl v28.8h, v28.8b, v12.8b\n"
+    "smlal v15.4s, v29.4h, v1.4h\n"
+    "ldr x24, [x8, #0x98]\n"
+    "smlal2 v10.4s, v29.8h, v1.8h\n"
+    "ldr d29, [x23, x5]\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v27.4h, v2.4h\n"
+    "ldr x23, [x8, #0xa0]\n"
+    "smlal2 v23.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x21, x5]\n"
+    "usubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v15.4s, v26.4h, v3.4h\n"
+    "ldr x22, [x8, #0xa8]\n"
+    "smlal2 v10.4s, v26.8h, v3.8h\n"
+    "ldr d26, [x20, x5]\n"
+    "usubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v15.4s, v25.4h, v4.4h\n"
+    "ldr x21, [x8, #0xb0]\n"
+    "smlal2 v10.4s, v25.8h, v4.8h\n"
+    "ldr d25, [x19, x5]\n"
+    "usubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "ldr x20, [x8, #0xb8]\n"
+    "smlal2 v10.4s, v24.8h, v2.8h\n"
+    "ldr x19, [x8, #0xc0]\n"
+    "smlal v20.4s, v24.4h, v0.4h\n"
+    "ldr q21, [x17, #0x0]\n"
+    "smlal2 v23.4s, v24.8h, v0.8h\n"
+    "ldr d24, [x9, x5]\n"
+    "usubl v24.8h, v24.8b, v12.8b\n"
+    "smlal v20.4s, v29.4h, v4.4h\n"
+    "ldr q30, [x15, #0x0]\n"
+    "smlal2 v23.4s, v29.8h, v4.8h\n"
+    "ldr d29, [x10, x5]\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v28.4h, v5.4h\n"
+    "ldr q31, [x17, #0x10]\n"
+    "smlal2 v23.4s, v28.8h, v5.8h\n"
+    "ldr d28, [x27, x5]\n"
+    "add x17, x17, #0x20\n"
+    "smlal v15.4s, v27.4h, v5.4h\n"
+    "ldr q9, [x15, #0x10]\n"
+    "add x15, x15, #0x20\n"
+    "smlal2 v10.4s, v27.8h, v5.8h\n"
+    "usubl v28.8h, v28.8b, v12.8b\n"
+    "smlal v20.4s, v27.4h, v3.4h\n"
+    "smlal2 v23.4s, v27.8h, v3.8h\n"
+    "ldr d27, [x28, x5]\n"
+    "usubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v16.4s, v26.4h, v3.4h\n"
+    "smlal2 v22.4s, v26.8h, v3.8h\n"
+    "ldr d26, [x26, x5]\n"
+    "usubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v15.4s, v25.4h, v6.4h\n"
+    "smlal2 v10.4s, v25.8h, v6.8h\n"
+    "smlal v16.4s, v25.4h, v0.4h\n"
+    "smlal2 v22.4s, v25.8h, v0.8h\n"
+    "ldr d25, [x25, x5]\n"
+    "usubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v16.4s, v29.4h, v4.4h\n"
+    "smlal2 v22.4s, v29.8h, v4.8h\n"
+    "ldr d29, [x24, x5]\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v15.4s, v24.4h, v7.4h\n"
+    "smlal2 v10.4s, v24.8h, v7.8h\n"
+    "smlal v16.4s, v24.4h, v1.4h\n"
+    "smlal2 v22.4s, v24.8h, v1.8h\n"
+    "ldr d24, [x22, x5]\n"
+    "usubl v24.8h, v24.8b, v12.8b\n"
+    "smlal v17.4s, v27.4h, v4.4h\n"
+    "smlal2 v18.4s, v27.8h, v4.8h\n"
+    "ldr d27, [x23, x5]\n"
+    "usubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v20.4s, v28.4h, v7.4h\n"
+    "smlal2 v23.4s, v28.8h, v7.8h\n"
+    "smlal v17.4s, v28.4h, v1.4h\n"
+    "smlal2 v18.4s, v28.8h, v1.8h\n"
+    "smlal v16.4s, v25.4h, v6.4h\n"
+    "smlal2 v22.4s, v25.8h, v6.8h\n"
+    "ldr d25, [x20, x5]\n"
+    "usubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v17.4s, v26.4h, v5.4h\n"
+    "smlal2 v18.4s, v26.8h, v5.8h\n"
+    "ldr d26, [x21, x5]\n"
+    "usubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v20.4s, v29.4h, v8.4h\n"
+    "smlal2 v23.4s, v29.8h, v8.8h\n"
+    "smlal v17.4s, v29.4h, v2.4h\n"
+    "smlal2 v18.4s, v29.8h, v2.8h\n"
+    "ldr d29, [x19, x5]\n"
+    "add x5, x5, #0x8\n"
+    "smlal v16.4s, v27.4h, v7.4h\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal2 v22.4s, v27.8h, v7.8h\n"
+    "smlal v17.4s, v24.4h, v3.4h\n"
+    "smlal v16.4s, v24.4h, v5.4h\n"
+    "smlal2 v18.4s, v24.8h, v3.8h\n"
+    "sqrdmulh v15.4s, v15.4s, v21.4s\n"
+    "smlal2 v22.4s, v24.8h, v5.8h\n"
+    "smlal v17.4s, v26.4h, v7.4h\n"
+    "smlal2 v18.4s, v26.8h, v7.8h\n"
+    "smlal v16.4s, v25.4h, v8.4h\n"
+    "smlal2 v22.4s, v25.8h, v8.8h\n"
+    "smlal v17.4s, v25.4h, v6.4h\n"
+    "smlal2 v18.4s, v25.8h, v6.8h\n"
+    "and v26.16b, v15.16b, v30.16b\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "smlal v17.4s, v29.4h, v8.4h\n"
+    "smlal2 v18.4s, v29.8h, v8.8h\n"
+    "sqrdmulh v10.4s, v10.4s, v31.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v21.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v31.4s\n"
+    "sqrdmulh v16.4s, v16.4s, v21.4s\n"
+    "sqadd v15.4s, v15.4s, v26.4s\n"
+    "and v8.16b, v10.16b, v9.16b\n"
+    "sshr v8.4s, v8.4s, #0x1f\n"
+    "srshl v15.4s, v15.4s, v30.4s\n"
+    "and v4.16b, v20.16b, v30.16b\n"
+    "sshr v4.4s, v4.4s, #0x1f\n"
+    "and v2.16b, v23.16b, v9.16b\n"
+    "and v1.16b, v16.16b, v30.16b\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "add v15.4s, v15.4s, v11.4s\n"
+    "sqadd v10.4s, v10.4s, v8.4s\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+    "sqadd v20.4s, v20.4s, v4.4s\n"
+    "smin v15.4s, v15.4s, v14.4s\n"
+    "srshl v10.4s, v10.4s, v9.4s\n"
+    "sqadd v23.4s, v23.4s, v2.4s\n"
+    "smax v15.4s, v15.4s, v19.4s\n"
+    "srshl v20.4s, v20.4s, v30.4s\n"
+    "add v10.4s, v10.4s, v11.4s\n"
+    "srshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v16.4s, v16.4s, v1.4s\n"
+    "smin v10.4s, v10.4s, v14.4s\n"
+    "add v20.4s, v20.4s, v11.4s\n"
+    "add v23.4s, v23.4s, v11.4s\n"
+    "smax v10.4s, v10.4s, v19.4s\n"
+    "smin v20.4s, v20.4s, v14.4s\n"
+    "smin v23.4s, v23.4s, v14.4s\n"
+    "uzp1 v15.16b, v15.16b, v10.16b\n"
+    "smax v20.4s, v20.4s, v19.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "str d15, [x14, x7]\n"
+    "smax v23.4s, v23.4s, v19.4s\n"
+    "srshl v16.4s, v16.4s, v30.4s\n"
+    "and v24.16b, v22.16b, v9.16b\n"
+    "sshr v24.4s, v24.4s, #0x1f\n"
+    "uzp1 v20.16b, v20.16b, v23.16b\n"
+    "add v16.4s, v16.4s, v11.4s\n"
+    "sqrdmulh v17.4s, v17.4s, v21.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str d20, [x13, x7]\n"
+    "smin v16.4s, v16.4s, v14.4s\n"
+    "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+    "sqadd v22.4s, v22.4s, v24.4s\n"
+    "and v2.16b, v17.16b, v30.16b\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "smax v16.4s, v16.4s, v19.4s\n"
+    "srshl v22.4s, v22.4s, v9.4s\n"
+    "and v31.16b, v18.16b, v9.16b\n"
+    "sshr v31.4s, v31.4s, #0x1f\n"
+    "add v22.4s, v22.4s, v11.4s\n"
+    "sqadd v17.4s, v17.4s, v2.4s\n"
+    "smin v22.4s, v22.4s, v14.4s\n"
+    "srshl v17.4s, v17.4s, v30.4s\n"
+    "sqadd v18.4s, v18.4s, v31.4s\n"
+    "smax v22.4s, v22.4s, v19.4s\n"
+    "uzp1 v16.16b, v16.16b, v22.16b\n"
+    "add v17.4s, v17.4s, v11.4s\n"
+    "srshl v18.4s, v18.4s, v9.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "str d16, [x12, x7]\n"
+    "smin v17.4s, v17.4s, v14.4s\n"
+    "add v18.4s, v18.4s, v11.4s\n"
+    "smax v17.4s, v17.4s, v19.4s\n"
+    "smin v18.4s, v18.4s, v14.4s\n"
+    "smax v18.4s, v18.4s, v19.4s\n"
+    "uzp1 v17.16b, v17.16b, v18.16b\n"
+    "uzp1 v17.16b, v17.16b, v17.16b\n"
+    "str d17, [x11, x7]\n"
+    "add x7, x7, #0x8\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q15, [x19, #0x0]\n"
+    "mov v20.16b, v15.16b\n"
+    "ldr q10, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "mov v16.16b, v15.16b\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v17.16b, v15.16b\n"
+    "ldr d0, [x6, #0x0]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "mov v23.16b, v10.16b\n"
+    "ldr d1, [x6, #0x8]\n"
+    "mov v22.16b, v10.16b\n"
+    "ldr d2, [x6, #0x10]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "mov v18.16b, v10.16b\n"
+    "ldr d3, [x6, #0x18]\n"
+    "ldr d4, [x6, #0x20]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "ldr d5, [x6, #0x28]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "ldr d6, [x6, #0x30]\n"
+    "ldr d7, [x6, #0x38]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "ldr d8, [x6, #0x40]\n"
+    "usubl v5.8h, v5.8b, v13.8b\n"
+    "ldp x26, x25, [x8, #0x0]\n"
+    "usubl v6.8h, v6.8b, v13.8b\n"
+    "ldp x24, x23, [x8, #0x10]\n"
+    "usubl v7.8h, v7.8b, v13.8b\n"
+    "usubl v8.8h, v8.8b, v13.8b\n"
+    "ldp x22, x21, [x8, #0x20]\n"
+    "ldp x20, x19, [x8, #0x30]\n"
+    "ldr d31, [x26, x5]\n"
+    "usubl v31.8h, v31.8b, v12.8b\n"
+    "ldr d30, [x25, x5]\n"
+    "ldr d29, [x24, x5]\n"
+    "usubl v30.8h, v30.8b, v12.8b\n"
+    "ldr d28, [x23, x5]\n"
+    "ldr d27, [x22, x5]\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "ldr d26, [x21, x5]\n"
+    "usubl v28.8h, v28.8b, v12.8b\n"
+    "ldr d25, [x20, x5]\n"
+    "ldr d24, [x19, x5]\n"
+    "usubl v27.8h, v27.8b, v12.8b\n"
+    "usubl v26.8h, v26.8b, v12.8b\n"
+    "usubl v25.8h, v25.8b, v12.8b\n"
+    "usubl v24.8h, v24.8b, v12.8b\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    "smlal v15.4s, v31.4h, v8.4h\n"
+    "ldr x23, [x8, #0x40]\n"
+    "tst x4, #0x7\n"
+    "smlal2 v10.4s, v31.8h, v8.8h\n"
+    "ldr x22, [x8, #0x48]\n"
+    "smlal v20.4s, v31.4h, v6.4h\n"
+    "ldr x21, [x8, #0x50]\n"
+    "smlal2 v23.4s, v31.8h, v6.8h\n"
+    "ldr x20, [x8, #0x58]\n"
+    "smlal v16.4s, v31.4h, v2.4h\n"
+    "ldr x19, [x8, #0x60]\n"
+    "smlal2 v22.4s, v31.8h, v2.8h\n"
+    "ldr x10, [x8, #0x68]\n"
+    "smlal v17.4s, v31.4h, v0.4h\n"
+    "ldr x9, [x8, #0x70]\n"
+    "smlal2 v18.4s, v31.8h, v0.8h\n"
+    "ldr x28, [x8, #0x78]\n"
+    "smlal v15.4s, v30.4h, v0.4h\n"
+    "ldr x27, [x8, #0x80]\n"
+    "smlal2 v10.4s, v30.8h, v0.8h\n"
+    "ldr x26, [x8, #0x88]\n"
+    "smlal v20.4s, v28.4h, v1.4h\n"
+    "ldr x25, [x8, #0x90]\n"
+    "smlal2 v23.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x22, x5]\n"
+    "usubl v28.8h, v28.8b, v12.8b\n"
+    "smlal v15.4s, v29.4h, v1.4h\n"
+    "ldr x24, [x8, #0x98]\n"
+    "smlal2 v10.4s, v29.8h, v1.8h\n"
+    "ldr d29, [x23, x5]\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v27.4h, v2.4h\n"
+    "ldr x23, [x8, #0xa0]\n"
+    "smlal2 v23.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x21, x5]\n"
+    "usubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v15.4s, v26.4h, v3.4h\n"
+    "ldr x22, [x8, #0xa8]\n"
+    "smlal2 v10.4s, v26.8h, v3.8h\n"
+    "ldr d26, [x20, x5]\n"
+    "usubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v15.4s, v25.4h, v4.4h\n"
+    "ldr x21, [x8, #0xb0]\n"
+    "smlal2 v10.4s, v25.8h, v4.8h\n"
+    "ldr d25, [x19, x5]\n"
+    "usubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "ldr x20, [x8, #0xb8]\n"
+    "smlal2 v10.4s, v24.8h, v2.8h\n"
+    "ldr x19, [x8, #0xc0]\n"
+    "smlal v20.4s, v24.4h, v0.4h\n"
+    "ldr q21, [x17, #0x0]\n"
+    "smlal2 v23.4s, v24.8h, v0.8h\n"
+    "ldr d24, [x9, x5]\n"
+    "usubl v24.8h, v24.8b, v12.8b\n"
+    "smlal v20.4s, v29.4h, v4.4h\n"
+    "ldr q30, [x15, #0x0]\n"
+    "smlal2 v23.4s, v29.8h, v4.8h\n"
+    "ldr d29, [x10, x5]\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v28.4h, v5.4h\n"
+    "ldr q31, [x17, #0x10]\n"
+    "smlal2 v23.4s, v28.8h, v5.8h\n"
+    "ldr d28, [x27, x5]\n"
+    "add x17, x17, #0x20\n"
+    "smlal v15.4s, v27.4h, v5.4h\n"
+    "ldr q9, [x15, #0x10]\n"
+    "add x15, x15, #0x20\n"
+    "smlal2 v10.4s, v27.8h, v5.8h\n"
+    "usubl v28.8h, v28.8b, v12.8b\n"
+    "smlal v20.4s, v27.4h, v3.4h\n"
+    "smlal2 v23.4s, v27.8h, v3.8h\n"
+    "ldr d27, [x28, x5]\n"
+    "usubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v16.4s, v26.4h, v3.4h\n"
+    "smlal2 v22.4s, v26.8h, v3.8h\n"
+    "ldr d26, [x26, x5]\n"
+    "usubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v15.4s, v25.4h, v6.4h\n"
+    "smlal2 v10.4s, v25.8h, v6.8h\n"
+    "smlal v16.4s, v25.4h, v0.4h\n"
+    "smlal2 v22.4s, v25.8h, v0.8h\n"
+    "ldr d25, [x25, x5]\n"
+    "usubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v16.4s, v29.4h, v4.4h\n"
+    "smlal2 v22.4s, v29.8h, v4.8h\n"
+    "ldr d29, [x24, x5]\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v15.4s, v24.4h, v7.4h\n"
+    "smlal2 v10.4s, v24.8h, v7.8h\n"
+    "smlal v16.4s, v24.4h, v1.4h\n"
+    "smlal2 v22.4s, v24.8h, v1.8h\n"
+    "ldr d24, [x22, x5]\n"
+    "usubl v24.8h, v24.8b, v12.8b\n"
+    "smlal v17.4s, v27.4h, v4.4h\n"
+    "smlal2 v18.4s, v27.8h, v4.8h\n"
+    "ldr d27, [x23, x5]\n"
+    "usubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v20.4s, v28.4h, v7.4h\n"
+    "smlal2 v23.4s, v28.8h, v7.8h\n"
+    "smlal v17.4s, v28.4h, v1.4h\n"
+    "smlal2 v18.4s, v28.8h, v1.8h\n"
+    "smlal v16.4s, v25.4h, v6.4h\n"
+    "smlal2 v22.4s, v25.8h, v6.8h\n"
+    "ldr d25, [x20, x5]\n"
+    "usubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v17.4s, v26.4h, v5.4h\n"
+    "smlal2 v18.4s, v26.8h, v5.8h\n"
+    "ldr d26, [x21, x5]\n"
+    "usubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v20.4s, v29.4h, v8.4h\n"
+    "smlal2 v23.4s, v29.8h, v8.8h\n"
+    "smlal v17.4s, v29.4h, v2.4h\n"
+    "smlal2 v18.4s, v29.8h, v2.8h\n"
+    "ldr d29, [x19, x5]\n"
+    "add x5, x5, #0x8\n"
+    "smlal v16.4s, v27.4h, v7.4h\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal2 v22.4s, v27.8h, v7.8h\n"
+    "smlal v17.4s, v24.4h, v3.4h\n"
+    "smlal v16.4s, v24.4h, v5.4h\n"
+    "smlal2 v18.4s, v24.8h, v3.8h\n"
+    "sqrdmulh v15.4s, v15.4s, v21.4s\n"
+    "smlal2 v22.4s, v24.8h, v5.8h\n"
+    "smlal v17.4s, v26.4h, v7.4h\n"
+    "smlal2 v18.4s, v26.8h, v7.8h\n"
+    "smlal v16.4s, v25.4h, v8.4h\n"
+    "smlal2 v22.4s, v25.8h, v8.8h\n"
+    "smlal v17.4s, v25.4h, v6.4h\n"
+    "smlal2 v18.4s, v25.8h, v6.8h\n"
+    "and v26.16b, v15.16b, v30.16b\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "smlal v17.4s, v29.4h, v8.4h\n"
+    "smlal2 v18.4s, v29.8h, v8.8h\n"
+    "sqrdmulh v10.4s, v10.4s, v31.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v21.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v31.4s\n"
+    "sqrdmulh v16.4s, v16.4s, v21.4s\n"
+    "sqadd v15.4s, v15.4s, v26.4s\n"
+    "and v8.16b, v10.16b, v9.16b\n"
+    "sshr v8.4s, v8.4s, #0x1f\n"
+    "srshl v15.4s, v15.4s, v30.4s\n"
+    "and v4.16b, v20.16b, v30.16b\n"
+    "sshr v4.4s, v4.4s, #0x1f\n"
+    "and v2.16b, v23.16b, v9.16b\n"
+    "and v1.16b, v16.16b, v30.16b\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "add v15.4s, v15.4s, v11.4s\n"
+    "sqadd v10.4s, v10.4s, v8.4s\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+    "sqadd v20.4s, v20.4s, v4.4s\n"
+    "smin v15.4s, v15.4s, v14.4s\n"
+    "srshl v10.4s, v10.4s, v9.4s\n"
+    "sqadd v23.4s, v23.4s, v2.4s\n"
+    "smax v15.4s, v15.4s, v19.4s\n"
+    "srshl v20.4s, v20.4s, v30.4s\n"
+    "add v10.4s, v10.4s, v11.4s\n"
+    "srshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v16.4s, v16.4s, v1.4s\n"
+    "smin v10.4s, v10.4s, v14.4s\n"
+    "add v20.4s, v20.4s, v11.4s\n"
+    "add v23.4s, v23.4s, v11.4s\n"
+    "smax v10.4s, v10.4s, v19.4s\n"
+    "smin v20.4s, v20.4s, v14.4s\n"
+    "smin v23.4s, v23.4s, v14.4s\n"
+    "uzp1 v15.16b, v15.16b, v10.16b\n"
+    "smax v20.4s, v20.4s, v19.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "str d15, [x14, x7]\n"
+    "smax v23.4s, v23.4s, v19.4s\n"
+    "srshl v16.4s, v16.4s, v30.4s\n"
+    "and v24.16b, v22.16b, v9.16b\n"
+    "sshr v24.4s, v24.4s, #0x1f\n"
+    "uzp1 v20.16b, v20.16b, v23.16b\n"
+    "add v16.4s, v16.4s, v11.4s\n"
+    "sqrdmulh v17.4s, v17.4s, v21.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str d20, [x13, x7]\n"
+    "smin v16.4s, v16.4s, v14.4s\n"
+    "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+    "sqadd v22.4s, v22.4s, v24.4s\n"
+    "and v2.16b, v17.16b, v30.16b\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "smax v16.4s, v16.4s, v19.4s\n"
+    "srshl v22.4s, v22.4s, v9.4s\n"
+    "and v31.16b, v18.16b, v9.16b\n"
+    "sshr v31.4s, v31.4s, #0x1f\n"
+    "add v22.4s, v22.4s, v11.4s\n"
+    "sqadd v17.4s, v17.4s, v2.4s\n"
+    "smin v22.4s, v22.4s, v14.4s\n"
+    "srshl v17.4s, v17.4s, v30.4s\n"
+    "sqadd v18.4s, v18.4s, v31.4s\n"
+    "smax v22.4s, v22.4s, v19.4s\n"
+    "uzp1 v16.16b, v16.16b, v22.16b\n"
+    "add v17.4s, v17.4s, v11.4s\n"
+    "srshl v18.4s, v18.4s, v9.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "str d16, [x12, x7]\n"
+    "smin v17.4s, v17.4s, v14.4s\n"
+    "add v18.4s, v18.4s, v11.4s\n"
+    "smax v17.4s, v17.4s, v19.4s\n"
+    "smin v18.4s, v18.4s, v14.4s\n"
+    "smax v18.4s, v18.4s, v19.4s\n"
+    "uzp1 v17.16b, v17.16b, v18.16b\n"
+    "uzp1 v17.16b, v17.16b, v17.16b\n"
+    "str d17, [x11, x7]\n"
+    "add x7, x7, #0x8\n"
+    "beq 88f\n"
+    "add x6, x6, #0x48\n"
+    "3:"  // Oddments
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "tbz x4, #2, 5f\n"
+    "ld1 { v15.4s }, [x19], #0x10\n"
+    "tbz x4, #1, 4f\n"
+    "ld1 { v10.d }[0], [x19], #0x8\n"
+    "tbz x4, #0, 7f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 7f\n"
+    "4:"  // Oddments: Load bias: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 7f\n"
+    "ld1 { v10.s }[0], [x19]\n"
+    "b 7f\n"
+    "5:"  // Oddments: Load bias: Bit 2: Unset
+    "tbz x4, #1, 6f\n"
+    "ld1 { v15.d }[0], [x19], #0x8\n"
+    "tbz x4, #0, 7f\n"
+    "ld1 { v15.s }[2], [x19]\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 7f\n"
+    "ld1 { v15.s }[0], [x19]\n"
+    "7:"  // Oddments: Load bias: Bit 2: End
+    "mov v20.16b, v15.16b\n"
+    "ldr d0, [x6, #0x0]\n"
+    "mov v23.16b, v10.16b\n"
+    "ldr d1, [x6, #0x8]\n"
+    "mov v16.16b, v15.16b\n"
+    "ldr d2, [x6, #0x10]\n"
+    "mov v22.16b, v10.16b\n"
+    "ldr d3, [x6, #0x18]\n"
+    "mov v17.16b, v15.16b\n"
+    "ldr d4, [x6, #0x20]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "mov v18.16b, v10.16b\n"
+    "ldr d5, [x6, #0x28]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "ldr d6, [x6, #0x30]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "ldr d7, [x6, #0x38]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "ldr d8, [x6, #0x40]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "ldp x26, x25, [x8, #0x0]\n"
+    "usubl v5.8h, v5.8b, v13.8b\n"
+    "ldp x24, x23, [x8, #0x10]\n"
+    "usubl v6.8h, v6.8b, v13.8b\n"
+    "usubl v7.8h, v7.8b, v13.8b\n"
+    "ldp x22, x21, [x8, #0x20]\n"
+    "usubl v8.8h, v8.8b, v13.8b\n"
+    "ldp x20, x19, [x8, #0x30]\n"
+    "add x26, x26, x5\n"
+    "add x25, x25, x5\n"
+    "add x24, x24, x5\n"
+    "add x23, x23, x5\n"
+    "add x22, x22, x5\n"
+    "add x21, x21, x5\n"
+    "add x20, x20, x5\n"
+    "add x19, x19, x5\n"
+    "tbz x4, #2, 9f\n"
+    "ld1 { v31.s }[0], [x26], #0x4\n"
+    "ld1 { v30.s }[0], [x25], #0x4\n"
+    "ld1 { v29.s }[0], [x24], #0x4\n"
+    "ld1 { v28.s }[0], [x23], #0x4\n"
+    "ld1 { v27.s }[0], [x22], #0x4\n"
+    "ld1 { v26.s }[0], [x21], #0x4\n"
+    "ld1 { v25.s }[0], [x20], #0x4\n"
+    "ld1 { v24.s }[0], [x19], #0x4\n"
+    "tbz x4, #1, 8f\n"
+    "ld1 { v31.h }[2], [x26], #0x2\n"
+    "ld1 { v30.h }[2], [x25], #0x2\n"
+    "ld1 { v29.h }[2], [x24], #0x2\n"
+    "ld1 { v28.h }[2], [x23], #0x2\n"
+    "ld1 { v27.h }[2], [x22], #0x2\n"
+    "ld1 { v26.h }[2], [x21], #0x2\n"
+    "ld1 { v25.h }[2], [x20], #0x2\n"
+    "ld1 { v24.h }[2], [x19], #0x2\n"
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[6], [x26]\n"
+    "ld1 { v30.b }[6], [x25]\n"
+    "ld1 { v29.b }[6], [x24]\n"
+    "ld1 { v28.b }[6], [x23]\n"
+    "ld1 { v27.b }[6], [x22]\n"
+    "ld1 { v26.b }[6], [x21]\n"
+    "ld1 { v25.b }[6], [x20]\n"
+    "ld1 { v24.b }[6], [x19]\n"
+    "b 11f\n"
+    "8:"  // Oddments: Initial loads: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[4], [x26]\n"
+    "ld1 { v30.b }[4], [x25]\n"
+    "ld1 { v29.b }[4], [x24]\n"
+    "ld1 { v28.b }[4], [x23]\n"
+    "ld1 { v27.b }[4], [x22]\n"
+    "ld1 { v26.b }[4], [x21]\n"
+    "ld1 { v25.b }[4], [x20]\n"
+    "ld1 { v24.b }[4], [x19]\n"
+    "b 11f\n"
+    "9:"  // Oddments: Initial loads: Bit 2: Unset
+    "tbz x4, #1, 10f\n"
+    "ld1 { v31.h }[0], [x26], #0x2\n"
+    "ld1 { v30.h }[0], [x25], #0x2\n"
+    "ld1 { v29.h }[0], [x24], #0x2\n"
+    "ld1 { v28.h }[0], [x23], #0x2\n"
+    "ld1 { v27.h }[0], [x22], #0x2\n"
+    "ld1 { v26.h }[0], [x21], #0x2\n"
+    "ld1 { v25.h }[0], [x20], #0x2\n"
+    "ld1 { v24.h }[0], [x19], #0x2\n"
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[2], [x26]\n"
+    "ld1 { v30.b }[2], [x25]\n"
+    "ld1 { v29.b }[2], [x24]\n"
+    "ld1 { v28.b }[2], [x23]\n"
+    "ld1 { v27.b }[2], [x22]\n"
+    "ld1 { v26.b }[2], [x21]\n"
+    "ld1 { v25.b }[2], [x20]\n"
+    "ld1 { v24.b }[2], [x19]\n"
+    "b 11f\n"
+    "10:"  // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[0], [x26]\n"
+    "ld1 { v30.b }[0], [x25]\n"
+    "ld1 { v29.b }[0], [x24]\n"
+    "ld1 { v28.b }[0], [x23]\n"
+    "ld1 { v27.b }[0], [x22]\n"
+    "ld1 { v26.b }[0], [x21]\n"
+    "ld1 { v25.b }[0], [x20]\n"
+    "ld1 { v24.b }[0], [x19]\n"
+    "11:"  // Oddments: Initial loads: Bit 2: End
+    "ldr x23, [x8, #0x40]\n"
+    "usubl v31.8h, v31.8b, v12.8b\n"
+    "smlal v15.4s, v31.4h, v8.4h\n"
+    "usubl v30.8h, v30.8b, v12.8b\n"
+    "smlal2 v10.4s, v31.8h, v8.8h\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v31.4h, v6.4h\n"
+    "usubl v28.8h, v28.8b, v12.8b\n"
+    "smlal2 v23.4s, v31.8h, v6.8h\n"
+    "usubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v16.4s, v31.4h, v2.4h\n"
+    "usubl v26.8h, v26.8b, v12.8b\n"
+    "smlal2 v22.4s, v31.8h, v2.8h\n"
+    "usubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v17.4s, v31.4h, v0.4h\n"
+    "usubl v24.8h, v24.8b, v12.8b\n"
+    "smlal2 v18.4s, v31.8h, v0.8h\n"
+    "add x23, x23, x5\n"
+    "smlal v15.4s, v30.4h, v0.4h\n"
+    "smlal2 v10.4s, v30.8h, v0.8h\n"
+    "smlal v20.4s, v28.4h, v1.4h\n"
+    "smlal2 v23.4s, v28.8h, v1.8h\n"
+    "smlal v15.4s, v29.4h, v1.4h\n"
+    "smlal2 v10.4s, v29.8h, v1.8h\n"
+    "smlal v20.4s, v27.4h, v2.4h\n"
+    "smlal2 v23.4s, v27.8h, v2.8h\n"
+    "smlal v15.4s, v26.4h, v3.4h\n"
+    "smlal2 v10.4s, v26.8h, v3.8h\n"
+    "smlal v20.4s, v24.4h, v0.4h\n"
+    "smlal2 v23.4s, v24.8h, v0.8h\n"
+    "smlal v15.4s, v25.4h, v4.4h\n"
+    "smlal2 v10.4s, v25.8h, v4.8h\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "smlal2 v10.4s, v24.8h, v2.8h\n"
+    "tbz x4, #2, 13f\n"
+    "ld1 { v29.s }[0], [x23], #0x4\n"
+    "tbz x4, #1, 12f\n"
+    "ld1 { v29.h }[2], [x23], #0x2\n"
+    "tbz x4, #0, 15f\n"
+    "ld1 { v29.b }[6], [x23]\n"
+    "b 15f\n"
+    "12:"  // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 15f\n"
+    "ld1 { v29.b }[4], [x23]\n"
+    "b 15f\n"
+    "13:"  // Oddments: Load (1, 3): Bit 2: Unset
+    "tbz x4, #1, 14f\n"
+    "ld1 { v29.h }[0], [x23], #0x2\n"
+    "tbz x4, #0, 15f\n"
+    "ld1 { v29.b }[2], [x23]\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 15f\n"
+    "ld1 { v29.b }[0], [x23]\n"
+    "15:"  // Oddments: Load (1, 3): Bit 2: End
+    "ldr x22, [x8, #0x48]\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v29.4h, v4.4h\n"
+    "smlal2 v23.4s, v29.8h, v4.8h\n"
+    "add x22, x22, x5\n"
+    "tbz x4, #2, 17f\n"
+    "ld1 { v28.s }[0], [x22], #0x4\n"
+    "tbz x4, #1, 16f\n"
+    "ld1 { v28.h }[2], [x22], #0x2\n"
+    "tbz x4, #0, 19f\n"
+    "ld1 { v28.b }[6], [x22]\n"
+    "b 19f\n"
+    "16:"  // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 19f\n"
+    "ld1 { v28.b }[4], [x22]\n"
+    "b 19f\n"
+    "17:"  // Oddments: Load (1, 4): Bit 2: Unset
+    "tbz x4, #1, 18f\n"
+    "ld1 { v28.h }[0], [x22], #0x2\n"
+    "tbz x4, #0, 19f\n"
+    "ld1 { v28.b }[2], [x22]\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 19f\n"
+    "ld1 { v28.b }[0], [x22]\n"
+    "19:"  // Oddments: Load (1, 4): Bit 2: End
+    "ldr x21, [x8, #0x50]\n"
+    "usubl v28.8h, v28.8b, v12.8b\n"
+    "smlal v20.4s, v28.4h, v5.4h\n"
+    "smlal2 v23.4s, v28.8h, v5.8h\n"
+    "add x21, x21, x5\n"
+    "tbz x4, #2, 21f\n"
+    "ld1 { v27.s }[0], [x21], #0x4\n"
+    "tbz x4, #1, 20f\n"
+    "ld1 { v27.h }[2], [x21], #0x2\n"
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[6], [x21]\n"
+    "b 23f\n"
+    "20:"  // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[4], [x21]\n"
+    "b 23f\n"
+    "21:"  // Oddments: Load (1, 2): Bit 2: Unset
+    "tbz x4, #1, 22f\n"
+    "ld1 { v27.h }[0], [x21], #0x2\n"
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[2], [x21]\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[0], [x21]\n"
+    "23:"  // Oddments: Load (1, 2): Bit 2: End
+    "ldr x20, [x8, #0x58]\n"
+    "usubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v15.4s, v27.4h, v5.4h\n"
+    "smlal2 v10.4s, v27.8h, v5.8h\n"
+    "add x20, x20, x5\n"
+    "smlal v20.4s, v27.4h, v3.4h\n"
+    "smlal2 v23.4s, v27.8h, v3.8h\n"
+    "tbz x4, #2, 25f\n"
+    "ld1 { v26.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 24f\n"
+    "ld1 { v26.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 27f\n"
+    "ld1 { v26.b }[6], [x20]\n"
+    "b 27f\n"
+    "24:"  // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 27f\n"
+    "ld1 { v26.b }[4], [x20]\n"
+    "b 27f\n"
+    "25:"  // Oddments: Load (3, 0): Bit 2: Unset
+    "tbz x4, #1, 26f\n"
+    "ld1 { v26.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 27f\n"
+    "ld1 { v26.b }[2], [x20]\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 27f\n"
+    "ld1 { v26.b }[0], [x20]\n"
+    "27:"  // Oddments: Load (3, 0): Bit 2: End
+    "ldr x19, [x8, #0x60]\n"
+    "usubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v16.4s, v26.4h, v3.4h\n"
+    "smlal2 v22.4s, v26.8h, v3.8h\n"
+    "add x19, x19, x5\n"
+    "tbz x4, #2, 29f\n"
+    "ld1 { v25.s }[0], [x19], #0x4\n"
+    "tbz x4, #1, 28f\n"
+    "ld1 { v25.h }[2], [x19], #0x2\n"
+    "tbz x4, #0, 31f\n"
+    "ld1 { v25.b }[6], [x19]\n"
+    "b 31f\n"
+    "28:"  // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 31f\n"
+    "ld1 { v25.b }[4], [x19]\n"
+    "b 31f\n"
+    "29:"  // Oddments: Load (2, 0): Bit 2: Unset
+    "tbz x4, #1, 30f\n"
+    "ld1 { v25.h }[0], [x19], #0x2\n"
+    "tbz x4, #0, 31f\n"
+    "ld1 { v25.b }[2], [x19]\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 31f\n"
+    "ld1 { v25.b }[0], [x19]\n"
+    "31:"  // Oddments: Load (2, 0): Bit 2: End
+    "ldr x10, [x8, #0x68]\n"
+    "usubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v15.4s, v25.4h, v6.4h\n"
+    "smlal2 v10.4s, v25.8h, v6.8h\n"
+    "add x10, x10, x5\n"
+    "smlal v16.4s, v25.4h, v0.4h\n"
+    "smlal2 v22.4s, v25.8h, v0.8h\n"
+    "tbz x4, #2, 33f\n"
+    "ld1 { v29.s }[0], [x10], #0x4\n"
+    "tbz x4, #1, 32f\n"
+    "ld1 { v29.h }[2], [x10], #0x2\n"
+    "tbz x4, #0, 35f\n"
+    "ld1 { v29.b }[6], [x10]\n"
+    "b 35f\n"
+    "32:"  // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 35f\n"
+    "ld1 { v29.b }[4], [x10]\n"
+    "b 35f\n"
+    "33:"  // Oddments: Load (3, 1): Bit 2: Unset
+    "tbz x4, #1, 34f\n"
+    "ld1 { v29.h }[0], [x10], #0x2\n"
+    "tbz x4, #0, 35f\n"
+    "ld1 { v29.b }[2], [x10]\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 35f\n"
+    "ld1 { v29.b }[0], [x10]\n"
+    "35:"  // Oddments: Load (3, 1): Bit 2: End
+    "ldr x9, [x8, #0x70]\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v16.4s, v29.4h, v4.4h\n"
+    "smlal2 v22.4s, v29.8h, v4.8h\n"
+    "add x9, x9, x5\n"
+    "tbz x4, #2, 37f\n"
+    "ld1 { v24.s }[0], [x9], #0x4\n"
+    "tbz x4, #1, 36f\n"
+    "ld1 { v24.h }[2], [x9], #0x2\n"
+    "tbz x4, #0, 39f\n"
+    "ld1 { v24.b }[6], [x9]\n"
+    "b 39f\n"
+    "36:"  // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 39f\n"
+    "ld1 { v24.b }[4], [x9]\n"
+    "b 39f\n"
+    "37:"  // Oddments: Load (2, 1): Bit 2: Unset
+    "tbz x4, #1, 38f\n"
+    "ld1 { v24.h }[0], [x9], #0x2\n"
+    "tbz x4, #0, 39f\n"
+    "ld1 { v24.b }[2], [x9]\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 39f\n"
+    "ld1 { v24.b }[0], [x9]\n"
+    "39:"  // Oddments: Load (2, 1): Bit 2: End
+    "ldr x28, [x8, #0x78]\n"
+    "usubl v24.8h, v24.8b, v12.8b\n"
+    "smlal v15.4s, v24.4h, v7.4h\n"
+    "smlal2 v10.4s, v24.8h, v7.8h\n"
+    "add x28, x28, x5\n"
+    "smlal v16.4s, v24.4h, v1.4h\n"
+    "smlal2 v22.4s, v24.8h, v1.8h\n"
+    "tbz x4, #2, 41f\n"
+    "ld1 { v27.s }[0], [x28], #0x4\n"
+    "tbz x4, #1, 40f\n"
+    "ld1 { v27.h }[2], [x28], #0x2\n"
+    "tbz x4, #0, 43f\n"
+    "ld1 { v27.b }[6], [x28]\n"
+    "b 43f\n"
+    "40:"  // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 43f\n"
+    "ld1 { v27.b }[4], [x28]\n"
+    "b 43f\n"
+    "41:"  // Oddments: Load (3, 3): Bit 2: Unset
+    "tbz x4, #1, 42f\n"
+    "ld1 { v27.h }[0], [x28], #0x2\n"
+    "tbz x4, #0, 43f\n"
+    "ld1 { v27.b }[2], [x28]\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 43f\n"
+    "ld1 { v27.b }[0], [x28]\n"
+    "43:"  // Oddments: Load (3, 3): Bit 2: End
+    "ldr x27, [x8, #0x80]\n"
+    "usubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v17.4s, v27.4h, v4.4h\n"
+    "smlal2 v18.4s, v27.8h, v4.8h\n"
+    "add x27, x27, x5\n"
+    "tbz x4, #2, 45f\n"
+    "ld1 { v28.s }[0], [x27], #0x4\n"
+    "tbz x4, #1, 44f\n"
+    "ld1 { v28.h }[2], [x27], #0x2\n"
+    "tbz x4, #0, 47f\n"
+    "ld1 { v28.b }[6], [x27]\n"
+    "b 47f\n"
+    "44:"  // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 47f\n"
+    "ld1 { v28.b }[4], [x27]\n"
+    "b 47f\n"
+    "45:"  // Oddments: Load (2, 3): Bit 2: Unset
+    "tbz x4, #1, 46f\n"
+    "ld1 { v28.h }[0], [x27], #0x2\n"
+    "tbz x4, #0, 47f\n"
+    "ld1 { v28.b }[2], [x27]\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 47f\n"
+    "ld1 { v28.b }[0], [x27]\n"
+    "47:"  // Oddments: Load (2, 3): Bit 2: End
+    "ldr x26, [x8, #0x88]\n"
+    "usubl v28.8h, v28.8b, v12.8b\n"
+    "smlal v20.4s, v28.4h, v7.4h\n"
+    "smlal2 v23.4s, v28.8h, v7.8h\n"
+    "add x26, x26, x5\n"
+    "smlal v17.4s, v28.4h, v1.4h\n"
+    "smlal2 v18.4s, v28.8h, v1.8h\n"
+    "tbz x4, #2, 49f\n"
+    "ld1 { v26.s }[0], [x26], #0x4\n"
+    "tbz x4, #1, 48f\n"
+    "ld1 { v26.h }[2], [x26], #0x2\n"
+    "tbz x4, #0, 51f\n"
+    "ld1 { v26.b }[6], [x26]\n"
+    "b 51f\n"
+    "48:"  // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 51f\n"
+    "ld1 { v26.b }[4], [x26]\n"
+    "b 51f\n"
+    "49:"  // Oddments: Load (3, 4): Bit 2: Unset
+    "tbz x4, #1, 50f\n"
+    "ld1 { v26.h }[0], [x26], #0x2\n"
+    "tbz x4, #0, 51f\n"
+    "ld1 { v26.b }[2], [x26]\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 51f\n"
+    "ld1 { v26.b }[0], [x26]\n"
+    "51:"  // Oddments: Load (3, 4): Bit 2: End
+    "ldr x25, [x8, #0x90]\n"
+    "usubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v17.4s, v26.4h, v5.4h\n"
+    "smlal2 v18.4s, v26.8h, v5.8h\n"
+    "add x25, x25, x5\n"
+    "tbz x4, #2, 53f\n"
+    "ld1 { v25.s }[0], [x25], #0x4\n"
+    "tbz x4, #1, 52f\n"
+    "ld1 { v25.h }[2], [x25], #0x2\n"
+    "tbz x4, #0, 55f\n"
+    "ld1 { v25.b }[6], [x25]\n"
+    "b 55f\n"
+    "52:"  // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 55f\n"
+    "ld1 { v25.b }[4], [x25]\n"
+    "b 55f\n"
+    "53:"  // Oddments: Load (4, 0): Bit 2: Unset
+    "tbz x4, #1, 54f\n"
+    "ld1 { v25.h }[0], [x25], #0x2\n"
+    "tbz x4, #0, 55f\n"
+    "ld1 { v25.b }[2], [x25]\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 55f\n"
+    "ld1 { v25.b }[0], [x25]\n"
+    "55:"  // Oddments: Load (4, 0): Bit 2: End
+    "ldr x24, [x8, #0x98]\n"
+    "usubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v16.4s, v25.4h, v6.4h\n"
+    "smlal2 v22.4s, v25.8h, v6.8h\n"
+    "add x24, x24, x5\n"
+    "tbz x4, #2, 57f\n"
+    "ld1 { v29.s }[0], [x24], #0x4\n"
+    "tbz x4, #1, 56f\n"
+    "ld1 { v29.h }[2], [x24], #0x2\n"
+    "tbz x4, #0, 59f\n"
+    "ld1 { v29.b }[6], [x24]\n"
+    "b 59f\n"
+    "56:"  // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 59f\n"
+    "ld1 { v29.b }[4], [x24]\n"
+    "b 59f\n"
+    "57:"  // Oddments: Load (2, 4): Bit 2: Unset
+    "tbz x4, #1, 58f\n"
+    "ld1 { v29.h }[0], [x24], #0x2\n"
+    "tbz x4, #0, 59f\n"
+    "ld1 { v29.b }[2], [x24]\n"
+    "b 59f\n"
+    "58:"  // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 59f\n"
+    "ld1 { v29.b }[0], [x24]\n"
+    "59:"  // Oddments: Load (2, 4): Bit 2: End
+    "ldr x23, [x8, #0xa0]\n"
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v20.4s, v29.4h, v8.4h\n"
+    "smlal2 v23.4s, v29.8h, v8.8h\n"
+    "add x23, x23, x5\n"
+    "smlal v17.4s, v29.4h, v2.4h\n"
+    "smlal2 v18.4s, v29.8h, v2.8h\n"
+    "tbz x4, #2, 61f\n"
+    "ld1 { v27.s }[0], [x23], #0x4\n"
+    "tbz x4, #1, 60f\n"
+    "ld1 { v27.h }[2], [x23], #0x2\n"
+    "tbz x4, #0, 63f\n"
+    "ld1 { v27.b }[6], [x23]\n"
+    "b 63f\n"
+    "60:"  // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 63f\n"
+    "ld1 { v27.b }[4], [x23]\n"
+    "b 63f\n"
+    "61:"  // Oddments: Load (4, 1): Bit 2: Unset
+    "tbz x4, #1, 62f\n"
+    "ld1 { v27.h }[0], [x23], #0x2\n"
+    "tbz x4, #0, 63f\n"
+    "ld1 { v27.b }[2], [x23]\n"
+    "b 63f\n"
+    "62:"  // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 63f\n"
+    "ld1 { v27.b }[0], [x23]\n"
+    "63:"  // Oddments: Load (4, 1): Bit 2: End
+    "ldr x22, [x8, #0xa8]\n"
+    "usubl v27.8h, v27.8b, v12.8b\n"
+    "smlal v16.4s, v27.4h, v7.4h\n"
+    "smlal2 v22.4s, v27.8h, v7.8h\n"
+    "add x22, x22, x5\n"
+    "tbz x4, #2, 65f\n"
+    "ld1 { v24.s }[0], [x22], #0x4\n"
+    "tbz x4, #1, 64f\n"
+    "ld1 { v24.h }[2], [x22], #0x2\n"
+    "tbz x4, #0, 67f\n"
+    "ld1 { v24.b }[6], [x22]\n"
+    "b 67f\n"
+    "64:"  // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 67f\n"
+    "ld1 { v24.b }[4], [x22]\n"
+    "b 67f\n"
+    "65:"  // Oddments: Load (3, 2): Bit 2: Unset
+    "tbz x4, #1, 66f\n"
+    "ld1 { v24.h }[0], [x22], #0x2\n"
+    "tbz x4, #0, 67f\n"
+    "ld1 { v24.b }[2], [x22]\n"
+    "b 67f\n"
+    "66:"  // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 67f\n"
+    "ld1 { v24.b }[0], [x22]\n"
+    "67:"  // Oddments: Load (3, 2): Bit 2: End
+    "ldr x21, [x8, #0xb0]\n"
+    "usubl v24.8h, v24.8b, v12.8b\n"
+    "smlal v16.4s, v24.4h, v5.4h\n"
+    "smlal2 v22.4s, v24.8h, v5.8h\n"
+    "add x21, x21, x5\n"
+    "smlal v17.4s, v24.4h, v3.4h\n"
+    "smlal2 v18.4s, v24.8h, v3.8h\n"
+    "tbz x4, #2, 69f\n"
+    "ld1 { v26.s }[0], [x21], #0x4\n"
+    "tbz x4, #1, 68f\n"
+    "ld1 { v26.h }[2], [x21], #0x2\n"
+    "tbz x4, #0, 71f\n"
+    "ld1 { v26.b }[6], [x21]\n"
+    "b 71f\n"
+    "68:"  // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 71f\n"
+    "ld1 { v26.b }[4], [x21]\n"
+    "b 71f\n"
+    "69:"  // Oddments: Load (4, 3): Bit 2: Unset
+    "tbz x4, #1, 70f\n"
+    "ld1 { v26.h }[0], [x21], #0x2\n"
+    "tbz x4, #0, 71f\n"
+    "ld1 { v26.b }[2], [x21]\n"
+    "b 71f\n"
+    "70:"  // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 71f\n"
+    "ld1 { v26.b }[0], [x21]\n"
+    "71:"  // Oddments: Load (4, 3): Bit 2: End
+    "ldr x20, [x8, #0xb8]\n"
+    "usubl v26.8h, v26.8b, v12.8b\n"
+    "smlal v17.4s, v26.4h, v7.4h\n"
+    "smlal2 v18.4s, v26.8h, v7.8h\n"
+    "add x20, x20, x5\n"
+    "tbz x4, #2, 73f\n"
+    "ld1 { v25.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 72f\n"
+    "ld1 { v25.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 75f\n"
+    "ld1 { v25.b }[6], [x20]\n"
+    "b 75f\n"
+    "72:"  // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 75f\n"
+    "ld1 { v25.b }[4], [x20]\n"
+    "b 75f\n"
+    "73:"  // Oddments: Load (4, 2): Bit 2: Unset
+    "tbz x4, #1, 74f\n"
+    "ld1 { v25.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 75f\n"
+    "ld1 { v25.b }[2], [x20]\n"
+    "b 75f\n"
+    "74:"  // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 75f\n"
+    "ld1 { v25.b }[0], [x20]\n"
+    "75:"  // Oddments: Load (4, 2): Bit 2: End
+    "ldr x19, [x8, #0xc0]\n"
+    "usubl v25.8h, v25.8b, v12.8b\n"
+    "smlal v16.4s, v25.4h, v8.4h\n"
+    "smlal2 v22.4s, v25.8h, v8.8h\n"
+    "add x19, x19, x5\n"
+    "smlal v17.4s, v25.4h, v6.4h\n"
+    "smlal2 v18.4s, v25.8h, v6.8h\n"
+    "tbz x4, #2, 77f\n"
+    "ld1 { v29.s }[0], [x19], #0x4\n"
+    "tbz x4, #1, 76f\n"
+    "ld1 { v29.h }[2], [x19], #0x2\n"
+    "tbz x4, #0, 79f\n"
+    "ld1 { v29.b }[6], [x19]\n"
+    "b 79f\n"
+    "76:"  // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 79f\n"
+    "ld1 { v29.b }[4], [x19]\n"
+    "b 79f\n"
+    "77:"  // Oddments: Load (4, 4): Bit 2: Unset
+    "tbz x4, #1, 78f\n"
+    "ld1 { v29.h }[0], [x19], #0x2\n"
+    "tbz x4, #0, 79f\n"
+    "ld1 { v29.b }[2], [x19]\n"
+    "b 79f\n"
+    "78:"  // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 79f\n"
+    "ld1 { v29.b }[0], [x19]\n"
+    "79:"  // Oddments: Load (4, 4): Bit 2: End
+    "usubl v29.8h, v29.8b, v12.8b\n"
+    "smlal v17.4s, v29.4h, v8.4h\n"
+    "smlal2 v18.4s, v29.8h, v8.8h\n"
+    "tbz x4, #2, 81f\n"
+    "ld1 { v21.4s }, [x17], #0x10\n"
+    "ld1 { v30.4s }, [x15], #0x10\n"
+    "tbz x4, #1, 80f\n"
+    "ld1 { v31.d }[0], [x17], #0x8\n"
+    "ld1 { v9.d }[0], [x15], #0x8\n"
+    "tbz x4, #0, 83f\n"
+    "ld1 { v31.s }[2], [x17]\n"
+    "ld1 { v9.s }[2], [x15]\n"
+    "b 83f\n"
+    "80:"  // Oddments: Load requant params: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 83f\n"
+    "ld1 { v31.s }[0], [x17]\n"
+    "ld1 { v9.s }[0], [x15]\n"
+    "b 83f\n"
+    "81:"  // Oddments: Load requant params: Bit 2: Unset
+    "tbz x4, #1, 82f\n"
+    "ld1 { v21.d }[0], [x17], #0x8\n"
+    "ld1 { v30.d }[0], [x15], #0x8\n"
+    "tbz x4, #0, 83f\n"
+    "ld1 { v21.s }[2], [x17]\n"
+    "ld1 { v30.s }[2], [x15]\n"
+    "b 83f\n"
+    "82:"  // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 83f\n"
+    "ld1 { v21.s }[0], [x17]\n"
+    "ld1 { v30.s }[0], [x15]\n"
+    "83:"  // Oddments: Load requant params: Bit 2: End
+    "sqrdmulh v15.4s, v15.4s, v21.4s\n"
+    "add x14, x14, x7\n"
+    "sqrdmulh v10.4s, v10.4s, v31.4s\n"
+    "add x13, x13, x7\n"
+    "sqrdmulh v20.4s, v20.4s, v21.4s\n"
+    "add x12, x12, x7\n"
+    "sqrdmulh v23.4s, v23.4s, v31.4s\n"
+    "add x11, x11, x7\n"
+    "sqrdmulh v16.4s, v16.4s, v21.4s\n"
+    "and v26.16b, v15.16b, v30.16b\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "and v8.16b, v10.16b, v9.16b\n"
+    "and v4.16b, v20.16b, v30.16b\n"
+    "sshr v8.4s, v8.4s, #0x1f\n"
+    "and v2.16b, v23.16b, v9.16b\n"
+    "and v1.16b, v16.16b, v30.16b\n"
+    "sshr v4.4s, v4.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "sqadd v15.4s, v15.4s, v26.4s\n"
+    "sqrdmulh v17.4s, v17.4s, v21.4s\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+    "sqadd v10.4s, v10.4s, v8.4s\n"
+    "sqadd v20.4s, v20.4s, v4.4s\n"
+    "srshl v15.4s, v15.4s, v30.4s\n"
+    "sqadd v23.4s, v23.4s, v2.4s\n"
+    "srshl v10.4s, v10.4s, v9.4s\n"
+    "srshl v20.4s, v20.4s, v30.4s\n"
+    "add v15.4s, v15.4s, v11.4s\n"
+    "srshl v23.4s, v23.4s, v9.4s\n"
+    "add v10.4s, v10.4s, v11.4s\n"
+    "smin v15.4s, v15.4s, v14.4s\n"
+    "add v20.4s, v20.4s, v11.4s\n"
+    "smin v10.4s, v10.4s, v14.4s\n"
+    "smax v15.4s, v15.4s, v19.4s\n"
+    "smin v20.4s, v20.4s, v14.4s\n"
+    "smax v10.4s, v10.4s, v19.4s\n"
+    "add v23.4s, v23.4s, v11.4s\n"
+    "smax v20.4s, v20.4s, v19.4s\n"
+    "uzp1 v15.16b, v15.16b, v10.16b\n"
+    "smin v23.4s, v23.4s, v14.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "sqadd v16.4s, v16.4s, v1.4s\n"
+    "smax v23.4s, v23.4s, v19.4s\n"
+    "and v24.16b, v22.16b, v9.16b\n"
+    "sshr v24.4s, v24.4s, #0x1f\n"
+    "uzp1 v20.16b, v20.16b, v23.16b\n"
+    "srshl v16.4s, v16.4s, v30.4s\n"
+    "and v2.16b, v17.16b, v30.16b\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "add v16.4s, v16.4s, v11.4s\n"
+    "sqadd v22.4s, v22.4s, v24.4s\n"
+    "and v31.16b, v18.16b, v9.16b\n"
+    "sshr v31.4s, v31.4s, #0x1f\n"
+    "smin v16.4s, v16.4s, v14.4s\n"
+    "srshl v22.4s, v22.4s, v9.4s\n"
+    "sqadd v17.4s, v17.4s, v2.4s\n"
+    "smax v16.4s, v16.4s, v19.4s\n"
+    "add v22.4s, v22.4s, v11.4s\n"
+    "srshl v17.4s, v17.4s, v30.4s\n"
+    "sqadd v18.4s, v18.4s, v31.4s\n"
+    "smin v22.4s, v22.4s, v14.4s\n"
+    "add v17.4s, v17.4s, v11.4s\n"
+    "srshl v18.4s, v18.4s, v9.4s\n"
+    "smax v22.4s, v22.4s, v19.4s\n"
+    "smin v17.4s, v17.4s, v14.4s\n"
+    "uzp1 v16.16b, v16.16b, v22.16b\n"
+    "add v18.4s, v18.4s, v11.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "smax v17.4s, v17.4s, v19.4s\n"
+    "smin v18.4s, v18.4s, v14.4s\n"
+    "smax v18.4s, v18.4s, v19.4s\n"
+    "uzp1 v17.16b, v17.16b, v18.16b\n"
+    "uzp1 v17.16b, v17.16b, v17.16b\n"
+    "tbz x4, #2, 85f\n"
+    "st1 { v15.s }[0], [x14], #0x4\n"
+    "st1 { v20.s }[0], [x13], #0x4\n"
+    "st1 { v16.s }[0], [x12], #0x4\n"
+    "st1 { v17.s }[0], [x11], #0x4\n"
+    "tbz x4, #1, 84f\n"
+    "st1 { v15.h }[2], [x14], #0x2\n"
+    "st1 { v20.h }[2], [x13], #0x2\n"
+    "st1 { v16.h }[2], [x12], #0x2\n"
+    "st1 { v17.h }[2], [x11], #0x2\n"
+    "tbz x4, #0, 87f\n"
+    "st1 { v15.b }[6], [x14], #0x1\n"
+    "st1 { v20.b }[6], [x13], #0x1\n"
+    "st1 { v16.b }[6], [x12], #0x1\n"
+    "st1 { v17.b }[6], [x11], #0x1\n"
+    "b 87f\n"
+    "84:"  // Oddments: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 87f\n"
+    "st1 { v15.b }[4], [x14], #0x1\n"
+    "st1 { v20.b }[4], [x13], #0x1\n"
+    "st1 { v16.b }[4], [x12], #0x1\n"
+    "st1 { v17.b }[4], [x11], #0x1\n"
+    "b 87f\n"
+    "85:"  // Oddments: Bit 2: Unset
+    "tbz x4, #1, 86f\n"
+    "st1 { v15.h }[0], [x14], #0x2\n"
+    "st1 { v20.h }[0], [x13], #0x2\n"
+    "st1 { v16.h }[0], [x12], #0x2\n"
+    "st1 { v17.h }[0], [x11], #0x2\n"
+    "tbz x4, #0, 87f\n"
+    "st1 { v15.b }[2], [x14], #0x1\n"
+    "st1 { v20.b }[2], [x13], #0x1\n"
+    "st1 { v16.b }[2], [x12], #0x1\n"
+    "st1 { v17.b }[2], [x11], #0x1\n"
+    "b 87f\n"
+    "86:"  // Oddments: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 87f\n"
+    "st1 { v15.b }[0], [x14], #0x1\n"
+    "st1 { v20.b }[0], [x13], #0x1\n"
+    "st1 { v16.b }[0], [x12], #0x1\n"
+    "st1 { v17.b }[0], [x11], #0x1\n"
+    "87:"  // Oddments: Bit 2: End
+
+    "88:"  // End
+
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
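
The "Oddments" blocks above all follow one pattern: the leftover channel count (x4 & 0x7, always below the 8-byte vector width) is decomposed into its bits, and each set bit gates a partial load or store of 4, 2 or 1 elements. A minimal C++ sketch of the same idea, assuming plain byte copies stand in for the vector-lane ld1/st1 accesses:

    #include <cstdint>
    #include <cstring>

    // Copy n (0..7) leftover bytes using the bit decomposition the assembly
    // uses: test bit 2, then bit 1, then bit 0 of the remainder.
    void copy_oddments(uint8_t *dst, const uint8_t *src, unsigned int n)
    {
      if (n & 4) { std::memcpy(dst, src, 4); dst += 4; src += 4; }  // tbz x4, #2
      if (n & 2) { std::memcpy(dst, src, 2); dst += 2; src += 2; }  // tbz x4, #1
      if (n & 1) { *dst = *src; }                                   // tbz x4, #0
    }

At most three partial accesses therefore cover any remainder, with no per-element loop.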
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..73de965
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_5x5_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_5x5_mla::get_packed_size;
+
+  kern_type kernel = a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+  a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
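
The tile constants in this strategy are linked: for a dense kernel the input tile must span (output_rows - 1) * stride_rows + kernel_rows rows, and likewise for columns. A short compile-time check of the values declared above, assuming the standard valid-convolution relation:

    constexpr unsigned int output_rows = 2, stride_rows = 1, kernel_rows = 5;
    constexpr unsigned int input_rows  = (output_rows - 1) * stride_rows + kernel_rows;
    static_assert(input_rows == 6, "matches the input_rows declared by the strategy");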
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..699cc6c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2213 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const uint8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const uint8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[36];
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const uint8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
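+      // Remap the raw input pointers into the order in which the assembly
+      // consumes them; only the first few entries are permuted (to follow
+      // this kernel's traversal of its 6x6 input tile), the rest keep their
+      // original positions.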
+      inptrs[0] = inptrs_raw[0];
+      inptrs[1] = inptrs_raw[1];
+      inptrs[2] = inptrs_raw[6];
+      inptrs[3] = inptrs_raw[7];
+      inptrs[4] = inptrs_raw[2];
+      inptrs[5] = inptrs_raw[8];
+      inptrs[6] = inptrs_raw[3];
+      inptrs[7] = inptrs_raw[4];
+      inptrs[8] = inptrs_raw[11];
+      inptrs[9] = inptrs_raw[12];
+      inptrs[10] = inptrs_raw[9];
+      inptrs[11] = inptrs_raw[10];
+      inptrs[12] = inptrs_raw[5];
+      inptrs[13] = inptrs_raw[13];
+      inptrs[14] = inptrs_raw[14];
+      inptrs[15] = inptrs_raw[15];
+      inptrs[16] = inptrs_raw[16];
+      inptrs[17] = inptrs_raw[17];
+      inptrs[18] = inptrs_raw[18];
+      inptrs[19] = inptrs_raw[19];
+      inptrs[20] = inptrs_raw[20];
+      inptrs[21] = inptrs_raw[21];
+      inptrs[22] = inptrs_raw[22];
+      inptrs[23] = inptrs_raw[23];
+      inptrs[24] = inptrs_raw[24];
+      inptrs[25] = inptrs_raw[25];
+      inptrs[26] = inptrs_raw[26];
+      inptrs[27] = inptrs_raw[27];
+      inptrs[28] = inptrs_raw[28];
+      inptrs[29] = inptrs_raw[29];
+      inptrs[30] = inptrs_raw[30];
+      inptrs[31] = inptrs_raw[31];
+      inptrs[32] = inptrs_raw[32];
+      inptrs[33] = inptrs_raw[33];
+      inptrs[34] = inptrs_raw[34];
+      inptrs[35] = inptrs_raw[35];
+
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
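+  // The assembly receives a single pointer to Params; every field is accessed
+  // through an "I"-constrained offsetof() immediate (see the operand list at
+  // the end of the asm block), so struct layout and offsets cannot drift apart.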
+  __asm__ __volatile__(
+    "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "mov x10, #0x0\n"
+    "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x1, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "add x25, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "lsr x19, x4, #0x3\n"
+    "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x13, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v7.16b }, [x13]\n"
+    "add x8, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v13.16b }, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1r { v19.4s }, [x8]\n"
+    "add x8, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v16.4s }, [x20]\n"
+    "ld1r { v12.4s }, [x8]\n"
+    "ldp x17, x16, [x21, #0x0]\n"
+    "ldp x6, x8, [x21, #0x10]\n"
+    "cbz x19, 3f\n"
+    "subs x19, x19, #0x1\n"
+    "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q15, [x12, #0x0]\n"
+    "mov v18.16b, v15.16b\n"
+    "ldr q20, [x12, #0x10]\n"
+    "add x12, x12, #0x20\n"
+    "mov v11.16b, v15.16b\n"
+    "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v10.16b, v15.16b\n"
+    "ldr d0, [x3, #0x0]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "mov v5.16b, v20.16b\n"
+    "ldr d1, [x3, #0x8]\n"
+    "mov v8.16b, v20.16b\n"
+    "ldr d2, [x3, #0x10]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "mov v9.16b, v20.16b\n"
+    "ldr d3, [x3, #0x18]\n"
+    "ldr d4, [x3, #0x20]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "ldp x28, x27, [x25, #0x0]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "ldp x26, x13, [x25, #0x10]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "ldp x24, x23, [x25, #0x20]\n"
+    "ldp x22, x21, [x25, #0x30]\n"
+    "ldp x20, x0, [x25, #0x40]\n"
+    "ldr d31, [x28, x10]\n"
+    "usubl v31.8h, v31.8b, v7.8b\n"
+    "ldr d30, [x27, x10]\n"
+    "ldr d29, [x26, x10]\n"
+    "usubl v30.8h, v30.8b, v7.8b\n"
+    "ldr d28, [x13, x10]\n"
+    "ldr d27, [x24, x10]\n"
+    "usubl v29.8h, v29.8b, v7.8b\n"
+    "ldr d23, [x23, x10]\n"
+    "usubl v28.8h, v28.8b, v7.8b\n"
+    "ldr d25, [x22, x10]\n"
+    "ldr d24, [x21, x10]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "ldr d26, [x20, x10]\n"
+    "usubl v23.8h, v23.8b, v7.8b\n"
+    "ldr d22, [x0, x10]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "usubl v26.8h, v26.8b, v7.8b\n"
+    "usubl v22.8h, v22.8b, v7.8b\n"
+    "beq 2f\n"
+    "1:"  // Loop
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "ldr x20, [x25, #0x50]\n"
+    "subs x19, x19, #0x1\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ldr x28, [x25, #0x58]\n"
+    "smlal v18.4s, v30.4h, v0.4h\n"
+    "ldr x0, [x25, #0x60]\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "ldr d31, [x20, x10]\n"
+    "usubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v11.4s, v29.4h, v0.4h\n"
+    "ldr x7, [x25, #0x68]\n"
+    "smlal2 v8.4s, v29.8h, v0.8h\n"
+    "ldr x26, [x25, #0x70]\n"
+    "smlal v10.4s, v28.4h, v0.4h\n"
+    "ldr x23, [x25, #0x78]\n"
+    "smlal2 v9.4s, v28.8h, v0.8h\n"
+    "ldr d0, [x3, #0x28]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "ldr x20, [x25, #0x80]\n"
+    "smlal2 v20.4s, v30.8h, v1.8h\n"
+    "ldr d30, [x28, x10]\n"
+    "usubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v18.4s, v27.4h, v1.4h\n"
+    "ldr x22, [x25, #0x88]\n"
+    "smlal2 v5.4s, v27.8h, v1.8h\n"
+    "ldr x13, [x25, #0x90]\n"
+    "smlal v11.4s, v28.4h, v1.4h\n"
+    "ldr x21, [x25, #0x98]\n"
+    "smlal2 v8.4s, v28.8h, v1.8h\n"
+    "ldr x14, [x25, #0xa0]\n"
+    "smlal v10.4s, v23.4h, v1.4h\n"
+    "ldr x11, [x25, #0xa8]\n"
+    "smlal2 v9.4s, v23.8h, v1.8h\n"
+    "ldr d1, [x3, #0x30]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v27.4h, v2.4h\n"
+    "ldr x24, [x25, #0xb0]\n"
+    "smlal2 v20.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x0, x10]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v18.4s, v25.4h, v2.4h\n"
+    "ldr x0, [x25, #0xb8]\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "ldr x15, [x25, #0xc0]\n"
+    "smlal v11.4s, v23.4h, v2.4h\n"
+    "ldr x9, [x25, #0xc8]\n"
+    "smlal2 v8.4s, v23.8h, v2.8h\n"
+    "ldr x27, [x25, #0xd0]\n"
+    "smlal v10.4s, v31.4h, v2.4h\n"
+    "ldr x28, [x25, #0xd8]\n"
+    "smlal2 v9.4s, v31.8h, v2.8h\n"
+    "ldr d2, [x3, #0x38]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "ldr q6, [x2, #0x0]\n"
+    "smlal2 v20.4s, v25.8h, v3.8h\n"
+    "ldr d25, [x7, x10]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "ldr x12, [x25, #0xe0]\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "ldr q21, [x5, #0x0]\n"
+    "smlal v11.4s, v31.4h, v3.4h\n"
+    "ldr q17, [x2, #0x10]\n"
+    "add x2, x2, #0x20\n"
+    "smlal2 v8.4s, v31.8h, v3.8h\n"
+    "ldr q14, [x5, #0x10]\n"
+    "add x5, x5, #0x20\n"
+    "smlal v10.4s, v30.4h, v3.4h\n"
+    "smlal2 v9.4s, v30.8h, v3.8h\n"
+    "ldr d3, [x3, #0x40]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v20.4s, v24.8h, v4.8h\n"
+    "ldr d24, [x26, x10]\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v18.4s, v27.4h, v4.4h\n"
+    "ldr x7, [x25, #0xe8]\n"
+    "smlal2 v5.4s, v27.8h, v4.8h\n"
+    "ldr d27, [x23, x10]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v11.4s, v30.4h, v4.4h\n"
+    "ldr x26, [x25, #0xf0]\n"
+    "smlal2 v8.4s, v30.8h, v4.8h\n"
+    "smlal v10.4s, v26.4h, v4.4h\n"
+    "smlal2 v9.4s, v26.8h, v4.8h\n"
+    "ldr d4, [x3, #0x48]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v29.4h, v0.4h\n"
+    "smlal2 v20.4s, v29.8h, v0.8h\n"
+    "smlal v18.4s, v28.4h, v0.4h\n"
+    "smlal2 v5.4s, v28.8h, v0.8h\n"
+    "smlal v11.4s, v22.4h, v0.4h\n"
+    "smlal2 v8.4s, v22.8h, v0.8h\n"
+    "smlal v10.4s, v25.4h, v0.4h\n"
+    "smlal2 v9.4s, v25.8h, v0.8h\n"
+    "ldr d0, [x3, #0x50]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x22, x10]\n"
+    "usubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v18.4s, v23.4h, v1.4h\n"
+    "ldr x23, [x25, #0xf8]\n"
+    "smlal2 v5.4s, v23.8h, v1.8h\n"
+    "smlal v11.4s, v25.4h, v1.4h\n"
+    "smlal2 v8.4s, v25.8h, v1.8h\n"
+    "smlal v10.4s, v24.4h, v1.4h\n"
+    "smlal2 v9.4s, v24.8h, v1.8h\n"
+    "ldr d1, [x3, #0x58]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v23.4h, v2.4h\n"
+    "smlal2 v20.4s, v23.8h, v2.8h\n"
+    "ldr d23, [x20, x10]\n"
+    "usubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v18.4s, v31.4h, v2.4h\n"
+    "ldr x22, [x25, #0x100]\n"
+    "smlal2 v5.4s, v31.8h, v2.8h\n"
+    "smlal v11.4s, v24.4h, v2.4h\n"
+    "smlal2 v8.4s, v24.8h, v2.8h\n"
+    "smlal v10.4s, v27.4h, v2.4h\n"
+    "smlal2 v9.4s, v27.8h, v2.8h\n"
+    "ldr d2, [x3, #0x60]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v31.4h, v3.4h\n"
+    "smlal2 v20.4s, v31.8h, v3.8h\n"
+    "ldr d31, [x13, x10]\n"
+    "usubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v18.4s, v30.4h, v3.4h\n"
+    "ldr x20, [x25, #0x108]\n"
+    "smlal2 v5.4s, v30.8h, v3.8h\n"
+    "smlal v11.4s, v27.4h, v3.4h\n"
+    "smlal2 v8.4s, v27.8h, v3.8h\n"
+    "smlal v10.4s, v23.4h, v3.4h\n"
+    "smlal2 v9.4s, v23.8h, v3.8h\n"
+    "ldr d3, [x3, #0x68]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v30.4h, v4.4h\n"
+    "smlal2 v20.4s, v30.8h, v4.8h\n"
+    "ldr d30, [x21, x10]\n"
+    "usubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v18.4s, v26.4h, v4.4h\n"
+    "ldr x13, [x25, #0x110]\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "ldr d26, [x14, x10]\n"
+    "usubl v26.8h, v26.8b, v7.8b\n"
+    "smlal v11.4s, v23.4h, v4.4h\n"
+    "ldr x21, [x25, #0x118]\n"
+    "smlal2 v8.4s, v23.8h, v4.8h\n"
+    "smlal v10.4s, v28.4h, v4.4h\n"
+    "smlal2 v9.4s, v28.8h, v4.8h\n"
+    "ldr d4, [x3, #0x70]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v22.4h, v0.4h\n"
+    "smlal2 v20.4s, v22.8h, v0.8h\n"
+    "ldr d22, [x0, x10]\n"
+    "usubl v22.8h, v22.8b, v7.8b\n"
+    "smlal v18.4s, v25.4h, v0.4h\n"
+    "smlal2 v5.4s, v25.8h, v0.8h\n"
+    "smlal v11.4s, v31.4h, v0.4h\n"
+    "smlal2 v8.4s, v31.8h, v0.8h\n"
+    "smlal v10.4s, v30.4h, v0.4h\n"
+    "smlal2 v9.4s, v30.8h, v0.8h\n"
+    "ldr d0, [x3, #0x78]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v25.4h, v1.4h\n"
+    "smlal2 v20.4s, v25.8h, v1.8h\n"
+    "ldr d25, [x11, x10]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v18.4s, v24.4h, v1.4h\n"
+    "smlal2 v5.4s, v24.8h, v1.8h\n"
+    "smlal v11.4s, v30.4h, v1.4h\n"
+    "smlal2 v8.4s, v30.8h, v1.8h\n"
+    "smlal v10.4s, v26.4h, v1.4h\n"
+    "smlal2 v9.4s, v26.8h, v1.8h\n"
+    "ldr d1, [x3, #0x80]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "smlal2 v20.4s, v24.8h, v2.8h\n"
+    "ldr d24, [x24, x10]\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v18.4s, v27.4h, v2.4h\n"
+    "smlal2 v5.4s, v27.8h, v2.8h\n"
+    "smlal v11.4s, v26.4h, v2.4h\n"
+    "smlal2 v8.4s, v26.8h, v2.8h\n"
+    "smlal v10.4s, v25.4h, v2.4h\n"
+    "smlal2 v9.4s, v25.8h, v2.8h\n"
+    "ldr d2, [x3, #0x88]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "ldr d27, [x15, x10]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v18.4s, v23.4h, v3.4h\n"
+    "smlal2 v5.4s, v23.8h, v3.8h\n"
+    "smlal v11.4s, v25.4h, v3.4h\n"
+    "smlal2 v8.4s, v25.8h, v3.8h\n"
+    "smlal v10.4s, v24.4h, v3.4h\n"
+    "smlal2 v9.4s, v24.8h, v3.8h\n"
+    "ldr d3, [x3, #0x90]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v23.4h, v4.4h\n"
+    "smlal2 v20.4s, v23.8h, v4.8h\n"
+    "ldr d23, [x9, x10]\n"
+    "usubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v18.4s, v28.4h, v4.4h\n"
+    "smlal2 v5.4s, v28.8h, v4.8h\n"
+    "ldr d28, [x12, x10]\n"
+    "usubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v11.4s, v24.4h, v4.4h\n"
+    "smlal2 v8.4s, v24.8h, v4.8h\n"
+    "smlal v10.4s, v22.4h, v4.4h\n"
+    "smlal2 v9.4s, v22.8h, v4.8h\n"
+    "ldr d4, [x3, #0x98]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ldr d31, [x27, x10]\n"
+    "usubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v18.4s, v30.4h, v0.4h\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "smlal v11.4s, v27.4h, v0.4h\n"
+    "smlal2 v8.4s, v27.8h, v0.8h\n"
+    "smlal v10.4s, v23.4h, v0.4h\n"
+    "smlal2 v9.4s, v23.8h, v0.8h\n"
+    "ldr d0, [x3, #0xa0]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "smlal2 v20.4s, v30.8h, v1.8h\n"
+    "ldr d30, [x28, x10]\n"
+    "usubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v18.4s, v26.4h, v1.4h\n"
+    "smlal2 v5.4s, v26.8h, v1.8h\n"
+    "smlal v11.4s, v23.4h, v1.4h\n"
+    "smlal2 v8.4s, v23.8h, v1.8h\n"
+    "smlal v10.4s, v31.4h, v1.4h\n"
+    "smlal2 v9.4s, v31.8h, v1.8h\n"
+    "ldr d1, [x3, #0xa8]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v26.4h, v2.4h\n"
+    "smlal2 v20.4s, v26.8h, v2.8h\n"
+    "ldr d26, [x7, x10]\n"
+    "usubl v26.8h, v26.8b, v7.8b\n"
+    "smlal v18.4s, v25.4h, v2.4h\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "smlal v11.4s, v31.4h, v2.4h\n"
+    "smlal2 v8.4s, v31.8h, v2.8h\n"
+    "smlal v10.4s, v30.4h, v2.4h\n"
+    "smlal2 v9.4s, v30.8h, v2.8h\n"
+    "ldr d2, [x3, #0xb0]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "smlal2 v20.4s, v25.8h, v3.8h\n"
+    "ldr d25, [x26, x10]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "smlal v11.4s, v30.4h, v3.4h\n"
+    "smlal2 v8.4s, v30.8h, v3.8h\n"
+    "smlal v10.4s, v28.4h, v3.4h\n"
+    "smlal2 v9.4s, v28.8h, v3.8h\n"
+    "ldr d3, [x3, #0xb8]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v20.4s, v24.8h, v4.8h\n"
+    "ldr d24, [x23, x10]\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v18.4s, v22.4h, v4.4h\n"
+    "smlal2 v5.4s, v22.8h, v4.8h\n"
+    "smlal v11.4s, v28.4h, v4.4h\n"
+    "smlal2 v8.4s, v28.8h, v4.8h\n"
+    "smlal v10.4s, v26.4h, v4.4h\n"
+    "smlal2 v9.4s, v26.8h, v4.8h\n"
+    "ldr d4, [x3, #0xc0]\n"
+    "add x3, x3, #0xc8\n"
+    "smlal v15.4s, v27.4h, v0.4h\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "smlal2 v20.4s, v27.8h, v0.8h\n"
+    "ldr d27, [x22, x10]\n"
+    "smlal v18.4s, v23.4h, v0.4h\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal2 v5.4s, v23.8h, v0.8h\n"
+    "smlal v11.4s, v25.4h, v0.4h\n"
+    "smlal2 v8.4s, v25.8h, v0.8h\n"
+    "ldr d25, [x20, x10]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v10.4s, v24.4h, v0.4h\n"
+    "smlal2 v9.4s, v24.8h, v0.8h\n"
+    "smlal v15.4s, v23.4h, v1.4h\n"
+    "smlal2 v20.4s, v23.8h, v1.8h\n"
+    "smlal v18.4s, v31.4h, v1.4h\n"
+    "smlal2 v5.4s, v31.8h, v1.8h\n"
+    "smlal v11.4s, v24.4h, v1.4h\n"
+    "smlal2 v8.4s, v24.8h, v1.8h\n"
+    "ldr d24, [x13, x10]\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v27.4h, v1.4h\n"
+    "smlal2 v9.4s, v27.8h, v1.8h\n"
+    "smlal v15.4s, v31.4h, v2.4h\n"
+    "smlal2 v20.4s, v31.8h, v2.8h\n"
+    "smlal v18.4s, v30.4h, v2.4h\n"
+    "smlal2 v5.4s, v30.8h, v2.8h\n"
+    "smlal v11.4s, v27.4h, v2.4h\n"
+    "smlal2 v8.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x21, x10]\n"
+    "add x10, x10, #0x8\n"
+    "smlal v10.4s, v25.4h, v2.4h\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal2 v9.4s, v25.8h, v2.8h\n"
+    "smlal v15.4s, v30.4h, v3.4h\n"
+    "smlal2 v20.4s, v30.8h, v3.8h\n"
+    "smlal v18.4s, v28.4h, v3.4h\n"
+    "smlal2 v5.4s, v28.8h, v3.8h\n"
+    "smlal v11.4s, v25.4h, v3.4h\n"
+    "smlal2 v8.4s, v25.8h, v3.8h\n"
+    "smlal v10.4s, v24.4h, v3.4h\n"
+    "smlal2 v9.4s, v24.8h, v3.8h\n"
+    "smlal v15.4s, v28.4h, v4.4h\n"
+    "smlal2 v20.4s, v28.8h, v4.8h\n"
+    "smlal v18.4s, v26.4h, v4.4h\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "smlal v11.4s, v24.4h, v4.4h\n"
+    "smlal2 v8.4s, v24.8h, v4.8h\n"
+    "smlal v10.4s, v27.4h, v4.4h\n"
+    "smlal2 v9.4s, v27.8h, v4.8h\n"
+    "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+    "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+    "and v1.16b, v15.16b, v21.16b\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "and v29.16b, v20.16b, v14.16b\n"
+    "and v3.16b, v18.16b, v21.16b\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "and v2.16b, v5.16b, v14.16b\n"
+    "sqrdmulh v11.4s, v11.4s, v6.4s\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "sqadd v15.4s, v15.4s, v1.4s\n"
+    "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+    "and v0.16b, v11.16b, v21.16b\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "srshl v15.4s, v15.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v29.4s\n"
+    "sqadd v18.4s, v18.4s, v3.4s\n"
+    "sqadd v5.4s, v5.4s, v2.4s\n"
+    "and v27.16b, v8.16b, v14.16b\n"
+    "sshr v27.4s, v27.4s, #0x1f\n"
+    "add v15.4s, v15.4s, v19.4s\n"
+    "srshl v20.4s, v20.4s, v14.4s\n"
+    "srshl v18.4s, v18.4s, v21.4s\n"
+    "srshl v5.4s, v5.4s, v14.4s\n"
+    "smin v15.4s, v15.4s, v12.4s\n"
+    "add v20.4s, v20.4s, v19.4s\n"
+    "add v18.4s, v18.4s, v19.4s\n"
+    "smax v15.4s, v15.4s, v16.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smin v18.4s, v18.4s, v12.4s\n"
+    "add v5.4s, v5.4s, v19.4s\n"
+    "smax v20.4s, v20.4s, v16.4s\n"
+    "smax v18.4s, v18.4s, v16.4s\n"
+    "smin v5.4s, v5.4s, v12.4s\n"
+    "uzp1 v15.16b, v15.16b, v20.16b\n"
+    "sqadd v11.4s, v11.4s, v0.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "str d15, [x17, x1]\n"
+    "smax v5.4s, v5.4s, v16.4s\n"
+    "sqadd v8.4s, v8.4s, v27.4s\n"
+    "srshl v11.4s, v11.4s, v21.4s\n"
+    "and v30.16b, v10.16b, v21.16b\n"
+    "sshr v30.4s, v30.4s, #0x1f\n"
+    "uzp1 v18.16b, v18.16b, v5.16b\n"
+    "add v11.4s, v11.4s, v19.4s\n"
+    "srshl v8.4s, v8.4s, v14.4s\n"
+    "uzp1 v18.16b, v18.16b, v18.16b\n"
+    "str d18, [x16, x1]\n"
+    "smin v11.4s, v11.4s, v12.4s\n"
+    "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+    "add v8.4s, v8.4s, v19.4s\n"
+    "sqadd v10.4s, v10.4s, v30.4s\n"
+    "smax v11.4s, v11.4s, v16.4s\n"
+    "smin v8.4s, v8.4s, v12.4s\n"
+    "and v6.16b, v9.16b, v14.16b\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "smax v8.4s, v8.4s, v16.4s\n"
+    "srshl v10.4s, v10.4s, v21.4s\n"
+    "uzp1 v11.16b, v11.16b, v8.16b\n"
+    "add v10.4s, v10.4s, v19.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "str d11, [x6, x1]\n"
+    "smin v10.4s, v10.4s, v12.4s\n"
+    "sqadd v9.4s, v9.4s, v6.4s\n"
+    "smax v10.4s, v10.4s, v16.4s\n"
+    "srshl v9.4s, v9.4s, v14.4s\n"
+    "add v9.4s, v9.4s, v19.4s\n"
+    "smin v9.4s, v9.4s, v12.4s\n"
+    "smax v9.4s, v9.4s, v16.4s\n"
+    "uzp1 v10.16b, v10.16b, v9.16b\n"
+    "uzp1 v10.16b, v10.16b, v10.16b\n"
+    "str d10, [x8, x1]\n"
+    "add x1, x1, #0x8\n"
+    "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q15, [x12, #0x0]\n"
+    "mov v18.16b, v15.16b\n"
+    "ldr q20, [x12, #0x10]\n"
+    "add x12, x12, #0x20\n"
+    "mov v11.16b, v15.16b\n"
+    "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v10.16b, v15.16b\n"
+    "ldr d0, [x3, #0x0]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "mov v5.16b, v20.16b\n"
+    "ldr d1, [x3, #0x8]\n"
+    "mov v8.16b, v20.16b\n"
+    "ldr d2, [x3, #0x10]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "mov v9.16b, v20.16b\n"
+    "ldr d3, [x3, #0x18]\n"
+    "ldr d4, [x3, #0x20]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "ldp x28, x27, [x25, #0x0]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "ldp x26, x13, [x25, #0x10]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "ldp x24, x23, [x25, #0x20]\n"
+    "ldp x22, x21, [x25, #0x30]\n"
+    "ldp x20, x0, [x25, #0x40]\n"
+    "ldr d31, [x28, x10]\n"
+    "usubl v31.8h, v31.8b, v7.8b\n"
+    "ldr d30, [x27, x10]\n"
+    "ldr d29, [x26, x10]\n"
+    "usubl v30.8h, v30.8b, v7.8b\n"
+    "ldr d28, [x13, x10]\n"
+    "ldr d27, [x24, x10]\n"
+    "usubl v29.8h, v29.8b, v7.8b\n"
+    "ldr d23, [x23, x10]\n"
+    "usubl v28.8h, v28.8b, v7.8b\n"
+    "ldr d25, [x22, x10]\n"
+    "ldr d24, [x21, x10]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "ldr d26, [x20, x10]\n"
+    "usubl v23.8h, v23.8b, v7.8b\n"
+    "ldr d22, [x0, x10]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "usubl v26.8h, v26.8b, v7.8b\n"
+    "usubl v22.8h, v22.8b, v7.8b\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "ldr x20, [x25, #0x50]\n"
+    "tst x4, #0x7\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ldr x28, [x25, #0x58]\n"
+    "smlal v18.4s, v30.4h, v0.4h\n"
+    "ldr x0, [x25, #0x60]\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "ldr d31, [x20, x10]\n"
+    "usubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v11.4s, v29.4h, v0.4h\n"
+    "ldr x7, [x25, #0x68]\n"
+    "smlal2 v8.4s, v29.8h, v0.8h\n"
+    "ldr x26, [x25, #0x70]\n"
+    "smlal v10.4s, v28.4h, v0.4h\n"
+    "ldr x23, [x25, #0x78]\n"
+    "smlal2 v9.4s, v28.8h, v0.8h\n"
+    "ldr d0, [x3, #0x28]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "ldr x20, [x25, #0x80]\n"
+    "smlal2 v20.4s, v30.8h, v1.8h\n"
+    "ldr d30, [x28, x10]\n"
+    "usubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v18.4s, v27.4h, v1.4h\n"
+    "ldr x22, [x25, #0x88]\n"
+    "smlal2 v5.4s, v27.8h, v1.8h\n"
+    "ldr x13, [x25, #0x90]\n"
+    "smlal v11.4s, v28.4h, v1.4h\n"
+    "ldr x21, [x25, #0x98]\n"
+    "smlal2 v8.4s, v28.8h, v1.8h\n"
+    "ldr x14, [x25, #0xa0]\n"
+    "smlal v10.4s, v23.4h, v1.4h\n"
+    "ldr x11, [x25, #0xa8]\n"
+    "smlal2 v9.4s, v23.8h, v1.8h\n"
+    "ldr d1, [x3, #0x30]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v27.4h, v2.4h\n"
+    "ldr x24, [x25, #0xb0]\n"
+    "smlal2 v20.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x0, x10]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v18.4s, v25.4h, v2.4h\n"
+    "ldr x0, [x25, #0xb8]\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "ldr x15, [x25, #0xc0]\n"
+    "smlal v11.4s, v23.4h, v2.4h\n"
+    "ldr x9, [x25, #0xc8]\n"
+    "smlal2 v8.4s, v23.8h, v2.8h\n"
+    "ldr x27, [x25, #0xd0]\n"
+    "smlal v10.4s, v31.4h, v2.4h\n"
+    "ldr x28, [x25, #0xd8]\n"
+    "smlal2 v9.4s, v31.8h, v2.8h\n"
+    "ldr d2, [x3, #0x38]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "ldr x12, [x25, #0xe0]\n"
+    "smlal2 v20.4s, v25.8h, v3.8h\n"
+    "ldr d25, [x7, x10]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "ldr x7, [x25, #0xe8]\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "ldr q6, [x2, #0x0]\n"
+    "smlal v11.4s, v31.4h, v3.4h\n"
+    "ldr q21, [x5, #0x0]\n"
+    "smlal2 v8.4s, v31.8h, v3.8h\n"
+    "ldr q17, [x2, #0x10]\n"
+    "add x2, x2, #0x20\n"
+    "smlal v10.4s, v30.4h, v3.4h\n"
+    "ldr q14, [x5, #0x10]\n"
+    "add x5, x5, #0x20\n"
+    "smlal2 v9.4s, v30.8h, v3.8h\n"
+    "ldr d3, [x3, #0x40]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v20.4s, v24.8h, v4.8h\n"
+    "ldr d24, [x26, x10]\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v18.4s, v27.4h, v4.4h\n"
+    "ldr x26, [x25, #0xf0]\n"
+    "smlal2 v5.4s, v27.8h, v4.8h\n"
+    "ldr d27, [x23, x10]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v11.4s, v30.4h, v4.4h\n"
+    "ldr x23, [x25, #0xf8]\n"
+    "smlal2 v8.4s, v30.8h, v4.8h\n"
+    "smlal v10.4s, v26.4h, v4.4h\n"
+    "smlal2 v9.4s, v26.8h, v4.8h\n"
+    "ldr d4, [x3, #0x48]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v29.4h, v0.4h\n"
+    "smlal2 v20.4s, v29.8h, v0.8h\n"
+    "smlal v18.4s, v28.4h, v0.4h\n"
+    "smlal2 v5.4s, v28.8h, v0.8h\n"
+    "smlal v11.4s, v22.4h, v0.4h\n"
+    "smlal2 v8.4s, v22.8h, v0.8h\n"
+    "smlal v10.4s, v25.4h, v0.4h\n"
+    "smlal2 v9.4s, v25.8h, v0.8h\n"
+    "ldr d0, [x3, #0x50]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x22, x10]\n"
+    "usubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v18.4s, v23.4h, v1.4h\n"
+    "ldr x22, [x25, #0x100]\n"
+    "smlal2 v5.4s, v23.8h, v1.8h\n"
+    "smlal v11.4s, v25.4h, v1.4h\n"
+    "smlal2 v8.4s, v25.8h, v1.8h\n"
+    "smlal v10.4s, v24.4h, v1.4h\n"
+    "smlal2 v9.4s, v24.8h, v1.8h\n"
+    "ldr d1, [x3, #0x58]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v23.4h, v2.4h\n"
+    "smlal2 v20.4s, v23.8h, v2.8h\n"
+    "ldr d23, [x20, x10]\n"
+    "usubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v18.4s, v31.4h, v2.4h\n"
+    "ldr x20, [x25, #0x108]\n"
+    "smlal2 v5.4s, v31.8h, v2.8h\n"
+    "smlal v11.4s, v24.4h, v2.4h\n"
+    "smlal2 v8.4s, v24.8h, v2.8h\n"
+    "smlal v10.4s, v27.4h, v2.4h\n"
+    "smlal2 v9.4s, v27.8h, v2.8h\n"
+    "ldr d2, [x3, #0x60]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v31.4h, v3.4h\n"
+    "smlal2 v20.4s, v31.8h, v3.8h\n"
+    "ldr d31, [x13, x10]\n"
+    "usubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v18.4s, v30.4h, v3.4h\n"
+    "ldr x13, [x25, #0x110]\n"
+    "smlal2 v5.4s, v30.8h, v3.8h\n"
+    "smlal v11.4s, v27.4h, v3.4h\n"
+    "smlal2 v8.4s, v27.8h, v3.8h\n"
+    "smlal v10.4s, v23.4h, v3.4h\n"
+    "smlal2 v9.4s, v23.8h, v3.8h\n"
+    "ldr d3, [x3, #0x68]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v30.4h, v4.4h\n"
+    "smlal2 v20.4s, v30.8h, v4.8h\n"
+    "ldr d30, [x21, x10]\n"
+    "usubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v18.4s, v26.4h, v4.4h\n"
+    "ldr x21, [x25, #0x118]\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "ldr d26, [x14, x10]\n"
+    "usubl v26.8h, v26.8b, v7.8b\n"
+    "smlal v11.4s, v23.4h, v4.4h\n"
+    "smlal2 v8.4s, v23.8h, v4.8h\n"
+    "smlal v10.4s, v28.4h, v4.4h\n"
+    "smlal2 v9.4s, v28.8h, v4.8h\n"
+    "ldr d4, [x3, #0x70]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v22.4h, v0.4h\n"
+    "smlal2 v20.4s, v22.8h, v0.8h\n"
+    "ldr d22, [x0, x10]\n"
+    "usubl v22.8h, v22.8b, v7.8b\n"
+    "smlal v18.4s, v25.4h, v0.4h\n"
+    "smlal2 v5.4s, v25.8h, v0.8h\n"
+    "smlal v11.4s, v31.4h, v0.4h\n"
+    "smlal2 v8.4s, v31.8h, v0.8h\n"
+    "smlal v10.4s, v30.4h, v0.4h\n"
+    "smlal2 v9.4s, v30.8h, v0.8h\n"
+    "ldr d0, [x3, #0x78]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v25.4h, v1.4h\n"
+    "smlal2 v20.4s, v25.8h, v1.8h\n"
+    "ldr d25, [x11, x10]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v18.4s, v24.4h, v1.4h\n"
+    "smlal2 v5.4s, v24.8h, v1.8h\n"
+    "smlal v11.4s, v30.4h, v1.4h\n"
+    "smlal2 v8.4s, v30.8h, v1.8h\n"
+    "smlal v10.4s, v26.4h, v1.4h\n"
+    "smlal2 v9.4s, v26.8h, v1.8h\n"
+    "ldr d1, [x3, #0x80]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "smlal2 v20.4s, v24.8h, v2.8h\n"
+    "ldr d24, [x24, x10]\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v18.4s, v27.4h, v2.4h\n"
+    "smlal2 v5.4s, v27.8h, v2.8h\n"
+    "smlal v11.4s, v26.4h, v2.4h\n"
+    "smlal2 v8.4s, v26.8h, v2.8h\n"
+    "smlal v10.4s, v25.4h, v2.4h\n"
+    "smlal2 v9.4s, v25.8h, v2.8h\n"
+    "ldr d2, [x3, #0x88]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "ldr d27, [x15, x10]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v18.4s, v23.4h, v3.4h\n"
+    "smlal2 v5.4s, v23.8h, v3.8h\n"
+    "smlal v11.4s, v25.4h, v3.4h\n"
+    "smlal2 v8.4s, v25.8h, v3.8h\n"
+    "smlal v10.4s, v24.4h, v3.4h\n"
+    "smlal2 v9.4s, v24.8h, v3.8h\n"
+    "ldr d3, [x3, #0x90]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v23.4h, v4.4h\n"
+    "smlal2 v20.4s, v23.8h, v4.8h\n"
+    "ldr d23, [x9, x10]\n"
+    "usubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v18.4s, v28.4h, v4.4h\n"
+    "smlal2 v5.4s, v28.8h, v4.8h\n"
+    "ldr d28, [x12, x10]\n"
+    "usubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v11.4s, v24.4h, v4.4h\n"
+    "smlal2 v8.4s, v24.8h, v4.8h\n"
+    "smlal v10.4s, v22.4h, v4.4h\n"
+    "smlal2 v9.4s, v22.8h, v4.8h\n"
+    "ldr d4, [x3, #0x98]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "ldr d31, [x27, x10]\n"
+    "usubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v18.4s, v30.4h, v0.4h\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "smlal v11.4s, v27.4h, v0.4h\n"
+    "smlal2 v8.4s, v27.8h, v0.8h\n"
+    "smlal v10.4s, v23.4h, v0.4h\n"
+    "smlal2 v9.4s, v23.8h, v0.8h\n"
+    "ldr d0, [x3, #0xa0]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "smlal2 v20.4s, v30.8h, v1.8h\n"
+    "ldr d30, [x28, x10]\n"
+    "usubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v18.4s, v26.4h, v1.4h\n"
+    "smlal2 v5.4s, v26.8h, v1.8h\n"
+    "smlal v11.4s, v23.4h, v1.4h\n"
+    "smlal2 v8.4s, v23.8h, v1.8h\n"
+    "smlal v10.4s, v31.4h, v1.4h\n"
+    "smlal2 v9.4s, v31.8h, v1.8h\n"
+    "ldr d1, [x3, #0xa8]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "smlal v15.4s, v26.4h, v2.4h\n"
+    "smlal2 v20.4s, v26.8h, v2.8h\n"
+    "ldr d26, [x7, x10]\n"
+    "usubl v26.8h, v26.8b, v7.8b\n"
+    "smlal v18.4s, v25.4h, v2.4h\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "smlal v11.4s, v31.4h, v2.4h\n"
+    "smlal2 v8.4s, v31.8h, v2.8h\n"
+    "smlal v10.4s, v30.4h, v2.4h\n"
+    "smlal2 v9.4s, v30.8h, v2.8h\n"
+    "ldr d2, [x3, #0xb0]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "smlal2 v20.4s, v25.8h, v3.8h\n"
+    "ldr d25, [x26, x10]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "smlal v11.4s, v30.4h, v3.4h\n"
+    "smlal2 v8.4s, v30.8h, v3.8h\n"
+    "smlal v10.4s, v28.4h, v3.4h\n"
+    "smlal2 v9.4s, v28.8h, v3.8h\n"
+    "ldr d3, [x3, #0xb8]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v20.4s, v24.8h, v4.8h\n"
+    "ldr d24, [x23, x10]\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v18.4s, v22.4h, v4.4h\n"
+    "smlal2 v5.4s, v22.8h, v4.8h\n"
+    "smlal v11.4s, v28.4h, v4.4h\n"
+    "smlal2 v8.4s, v28.8h, v4.8h\n"
+    "smlal v10.4s, v26.4h, v4.4h\n"
+    "smlal2 v9.4s, v26.8h, v4.8h\n"
+    "ldr d4, [x3, #0xc0]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "smlal v15.4s, v27.4h, v0.4h\n"
+    "smlal2 v20.4s, v27.8h, v0.8h\n"
+    "ldr d27, [x22, x10]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v18.4s, v23.4h, v0.4h\n"
+    "smlal2 v5.4s, v23.8h, v0.8h\n"
+    "smlal v11.4s, v25.4h, v0.4h\n"
+    "smlal2 v8.4s, v25.8h, v0.8h\n"
+    "ldr d25, [x20, x10]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v10.4s, v24.4h, v0.4h\n"
+    "smlal2 v9.4s, v24.8h, v0.8h\n"
+    "smlal v15.4s, v23.4h, v1.4h\n"
+    "smlal2 v20.4s, v23.8h, v1.8h\n"
+    "smlal v18.4s, v31.4h, v1.4h\n"
+    "smlal2 v5.4s, v31.8h, v1.8h\n"
+    "smlal v11.4s, v24.4h, v1.4h\n"
+    "smlal2 v8.4s, v24.8h, v1.8h\n"
+    "ldr d24, [x13, x10]\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v27.4h, v1.4h\n"
+    "smlal2 v9.4s, v27.8h, v1.8h\n"
+    "smlal v15.4s, v31.4h, v2.4h\n"
+    "smlal2 v20.4s, v31.8h, v2.8h\n"
+    "smlal v18.4s, v30.4h, v2.4h\n"
+    "smlal2 v5.4s, v30.8h, v2.8h\n"
+    "smlal v11.4s, v27.4h, v2.4h\n"
+    "smlal2 v8.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x21, x10]\n"
+    "add x10, x10, #0x8\n"
+    "smlal v10.4s, v25.4h, v2.4h\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal2 v9.4s, v25.8h, v2.8h\n"
+    "smlal v15.4s, v30.4h, v3.4h\n"
+    "smlal2 v20.4s, v30.8h, v3.8h\n"
+    "smlal v18.4s, v28.4h, v3.4h\n"
+    "smlal2 v5.4s, v28.8h, v3.8h\n"
+    "smlal v11.4s, v25.4h, v3.4h\n"
+    "smlal2 v8.4s, v25.8h, v3.8h\n"
+    "smlal v10.4s, v24.4h, v3.4h\n"
+    "smlal2 v9.4s, v24.8h, v3.8h\n"
+    "smlal v15.4s, v28.4h, v4.4h\n"
+    "smlal2 v20.4s, v28.8h, v4.8h\n"
+    "smlal v18.4s, v26.4h, v4.4h\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "smlal v11.4s, v24.4h, v4.4h\n"
+    "smlal2 v8.4s, v24.8h, v4.8h\n"
+    "smlal v10.4s, v27.4h, v4.4h\n"
+    "smlal2 v9.4s, v27.8h, v4.8h\n"
+    "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+    "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+    "and v1.16b, v15.16b, v21.16b\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "and v29.16b, v20.16b, v14.16b\n"
+    "and v3.16b, v18.16b, v21.16b\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "and v2.16b, v5.16b, v14.16b\n"
+    "sqrdmulh v11.4s, v11.4s, v6.4s\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "sqadd v15.4s, v15.4s, v1.4s\n"
+    "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+    "and v0.16b, v11.16b, v21.16b\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "srshl v15.4s, v15.4s, v21.4s\n"
+    "sqadd v20.4s, v20.4s, v29.4s\n"
+    "sqadd v18.4s, v18.4s, v3.4s\n"
+    "sqadd v5.4s, v5.4s, v2.4s\n"
+    "and v27.16b, v8.16b, v14.16b\n"
+    "sshr v27.4s, v27.4s, #0x1f\n"
+    "add v15.4s, v15.4s, v19.4s\n"
+    "srshl v20.4s, v20.4s, v14.4s\n"
+    "srshl v18.4s, v18.4s, v21.4s\n"
+    "srshl v5.4s, v5.4s, v14.4s\n"
+    "smin v15.4s, v15.4s, v12.4s\n"
+    "add v20.4s, v20.4s, v19.4s\n"
+    "add v18.4s, v18.4s, v19.4s\n"
+    "smax v15.4s, v15.4s, v16.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smin v18.4s, v18.4s, v12.4s\n"
+    "add v5.4s, v5.4s, v19.4s\n"
+    "smax v20.4s, v20.4s, v16.4s\n"
+    "smax v18.4s, v18.4s, v16.4s\n"
+    "smin v5.4s, v5.4s, v12.4s\n"
+    "uzp1 v15.16b, v15.16b, v20.16b\n"
+    "sqadd v11.4s, v11.4s, v0.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "str d15, [x17, x1]\n"
+    "smax v5.4s, v5.4s, v16.4s\n"
+    "sqadd v8.4s, v8.4s, v27.4s\n"
+    "srshl v11.4s, v11.4s, v21.4s\n"
+    "and v30.16b, v10.16b, v21.16b\n"
+    "sshr v30.4s, v30.4s, #0x1f\n"
+    "uzp1 v18.16b, v18.16b, v5.16b\n"
+    "add v11.4s, v11.4s, v19.4s\n"
+    "srshl v8.4s, v8.4s, v14.4s\n"
+    "uzp1 v18.16b, v18.16b, v18.16b\n"
+    "str d18, [x16, x1]\n"
+    "smin v11.4s, v11.4s, v12.4s\n"
+    "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+    "add v8.4s, v8.4s, v19.4s\n"
+    "sqadd v10.4s, v10.4s, v30.4s\n"
+    "smax v11.4s, v11.4s, v16.4s\n"
+    "smin v8.4s, v8.4s, v12.4s\n"
+    "and v6.16b, v9.16b, v14.16b\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "smax v8.4s, v8.4s, v16.4s\n"
+    "srshl v10.4s, v10.4s, v21.4s\n"
+    "uzp1 v11.16b, v11.16b, v8.16b\n"
+    "add v10.4s, v10.4s, v19.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "str d11, [x6, x1]\n"
+    "smin v10.4s, v10.4s, v12.4s\n"
+    "sqadd v9.4s, v9.4s, v6.4s\n"
+    "smax v10.4s, v10.4s, v16.4s\n"
+    "srshl v9.4s, v9.4s, v14.4s\n"
+    "add v9.4s, v9.4s, v19.4s\n"
+    "smin v9.4s, v9.4s, v12.4s\n"
+    "smax v9.4s, v9.4s, v16.4s\n"
+    "uzp1 v10.16b, v10.16b, v9.16b\n"
+    "uzp1 v10.16b, v10.16b, v10.16b\n"
+    "str d10, [x8, x1]\n"
+    "add x1, x1, #0x8\n"
+    "beq 124f\n"
+    "add x3, x3, #0xc8\n"
+    "3:"  // Oddments
+    "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "tbz x4, #2, 5f\n"
+    "ld1 { v15.4s }, [x12], #0x10\n"
+    "tbz x4, #1, 4f\n"
+    "ld1 { v20.d }[0], [x12], #0x8\n"
+    "tbz x4, #0, 7f\n"
+    "ld1 { v20.s }[2], [x12]\n"
+    "b 7f\n"
+    "4:"  // Oddments: Load bias: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 7f\n"
+    "ld1 { v20.s }[0], [x12]\n"
+    "b 7f\n"
+    "5:"  // Oddments: Load bias: Bit 2: Unset
+    "tbz x4, #1, 6f\n"
+    "ld1 { v15.d }[0], [x12], #0x8\n"
+    "tbz x4, #0, 7f\n"
+    "ld1 { v15.s }[2], [x12]\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 7f\n"
+    "ld1 { v15.s }[0], [x12]\n"
+    "7:"  // Oddments: Load bias: Bit 2: End
+    "mov v18.16b, v15.16b\n"
+    "ldr d0, [x3, #0x0]\n"
+    "mov v5.16b, v20.16b\n"
+    "ldr d1, [x3, #0x8]\n"
+    "mov v11.16b, v15.16b\n"
+    "ldr d2, [x3, #0x10]\n"
+    "mov v8.16b, v20.16b\n"
+    "ldr d3, [x3, #0x18]\n"
+    "mov v10.16b, v15.16b\n"
+    "ldr d4, [x3, #0x20]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "mov v9.16b, v20.16b\n"
+    "ldp x28, x27, [x25, #0x0]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "ldp x26, x13, [x25, #0x10]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "ldp x24, x23, [x25, #0x20]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "ldp x22, x21, [x25, #0x30]\n"
+    "ldp x20, x0, [x25, #0x40]\n"
+    "add x28, x28, x10\n"
+    "add x27, x27, x10\n"
+    "add x26, x26, x10\n"
+    "add x13, x13, x10\n"
+    "add x24, x24, x10\n"
+    "add x23, x23, x10\n"
+    "add x22, x22, x10\n"
+    "add x21, x21, x10\n"
+    "add x20, x20, x10\n"
+    "add x0, x0, x10\n"
+    "tbz x4, #2, 9f\n"
+    "ld1 { v31.s }[0], [x28], #0x4\n"
+    "ld1 { v30.s }[0], [x27], #0x4\n"
+    "ld1 { v29.s }[0], [x26], #0x4\n"
+    "ld1 { v28.s }[0], [x13], #0x4\n"
+    "ld1 { v27.s }[0], [x24], #0x4\n"
+    "ld1 { v23.s }[0], [x23], #0x4\n"
+    "ld1 { v25.s }[0], [x22], #0x4\n"
+    "ld1 { v24.s }[0], [x21], #0x4\n"
+    "ld1 { v26.s }[0], [x20], #0x4\n"
+    "ld1 { v22.s }[0], [x0], #0x4\n"
+    "tbz x4, #1, 8f\n"
+    "ld1 { v31.h }[2], [x28], #0x2\n"
+    "ld1 { v30.h }[2], [x27], #0x2\n"
+    "ld1 { v29.h }[2], [x26], #0x2\n"
+    "ld1 { v28.h }[2], [x13], #0x2\n"
+    "ld1 { v27.h }[2], [x24], #0x2\n"
+    "ld1 { v23.h }[2], [x23], #0x2\n"
+    "ld1 { v25.h }[2], [x22], #0x2\n"
+    "ld1 { v24.h }[2], [x21], #0x2\n"
+    "ld1 { v26.h }[2], [x20], #0x2\n"
+    "ld1 { v22.h }[2], [x0], #0x2\n"
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[6], [x28]\n"
+    "ld1 { v30.b }[6], [x27]\n"
+    "ld1 { v29.b }[6], [x26]\n"
+    "ld1 { v28.b }[6], [x13]\n"
+    "ld1 { v27.b }[6], [x24]\n"
+    "ld1 { v23.b }[6], [x23]\n"
+    "ld1 { v25.b }[6], [x22]\n"
+    "ld1 { v24.b }[6], [x21]\n"
+    "ld1 { v26.b }[6], [x20]\n"
+    "ld1 { v22.b }[6], [x0]\n"
+    "b 11f\n"
+    "8:"  // Oddments: Initial loads: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[4], [x28]\n"
+    "ld1 { v30.b }[4], [x27]\n"
+    "ld1 { v29.b }[4], [x26]\n"
+    "ld1 { v28.b }[4], [x13]\n"
+    "ld1 { v27.b }[4], [x24]\n"
+    "ld1 { v23.b }[4], [x23]\n"
+    "ld1 { v25.b }[4], [x22]\n"
+    "ld1 { v24.b }[4], [x21]\n"
+    "ld1 { v26.b }[4], [x20]\n"
+    "ld1 { v22.b }[4], [x0]\n"
+    "b 11f\n"
+    "9:"  // Oddments: Initial loads: Bit 2: Unset
+    "tbz x4, #1, 10f\n"
+    "ld1 { v31.h }[0], [x28], #0x2\n"
+    "ld1 { v30.h }[0], [x27], #0x2\n"
+    "ld1 { v29.h }[0], [x26], #0x2\n"
+    "ld1 { v28.h }[0], [x13], #0x2\n"
+    "ld1 { v27.h }[0], [x24], #0x2\n"
+    "ld1 { v23.h }[0], [x23], #0x2\n"
+    "ld1 { v25.h }[0], [x22], #0x2\n"
+    "ld1 { v24.h }[0], [x21], #0x2\n"
+    "ld1 { v26.h }[0], [x20], #0x2\n"
+    "ld1 { v22.h }[0], [x0], #0x2\n"
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[2], [x28]\n"
+    "ld1 { v30.b }[2], [x27]\n"
+    "ld1 { v29.b }[2], [x26]\n"
+    "ld1 { v28.b }[2], [x13]\n"
+    "ld1 { v27.b }[2], [x24]\n"
+    "ld1 { v23.b }[2], [x23]\n"
+    "ld1 { v25.b }[2], [x22]\n"
+    "ld1 { v24.b }[2], [x21]\n"
+    "ld1 { v26.b }[2], [x20]\n"
+    "ld1 { v22.b }[2], [x0]\n"
+    "b 11f\n"
+    "10:"  // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[0], [x28]\n"
+    "ld1 { v30.b }[0], [x27]\n"
+    "ld1 { v29.b }[0], [x26]\n"
+    "ld1 { v28.b }[0], [x13]\n"
+    "ld1 { v27.b }[0], [x24]\n"
+    "ld1 { v23.b }[0], [x23]\n"
+    "ld1 { v25.b }[0], [x22]\n"
+    "ld1 { v24.b }[0], [x21]\n"
+    "ld1 { v26.b }[0], [x20]\n"
+    "ld1 { v22.b }[0], [x0]\n"
+    "11:"  // Oddments: Initial loads: Bit 2: End
+    "ldr x20, [x25, #0x50]\n"
+    "usubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "usubl v30.8h, v30.8b, v7.8b\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "usubl v29.8h, v29.8b, v7.8b\n"
+    "usubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v18.4s, v30.4h, v0.4h\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "usubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v11.4s, v29.4h, v0.4h\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal2 v8.4s, v29.8h, v0.8h\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v28.4h, v0.4h\n"
+    "usubl v26.8h, v26.8b, v7.8b\n"
+    "smlal2 v9.4s, v28.8h, v0.8h\n"
+    "usubl v22.8h, v22.8b, v7.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "smlal2 v20.4s, v30.8h, v1.8h\n"
+    "add x20, x20, x10\n"
+    "smlal v18.4s, v27.4h, v1.4h\n"
+    "smlal2 v5.4s, v27.8h, v1.8h\n"
+    "smlal v11.4s, v28.4h, v1.4h\n"
+    "smlal2 v8.4s, v28.8h, v1.8h\n"
+    "smlal v10.4s, v23.4h, v1.4h\n"
+    "smlal2 v9.4s, v23.8h, v1.8h\n"
+    "smlal v15.4s, v27.4h, v2.4h\n"
+    "smlal2 v20.4s, v27.8h, v2.8h\n"
+    "smlal v18.4s, v25.4h, v2.4h\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "smlal v11.4s, v23.4h, v2.4h\n"
+    "smlal2 v8.4s, v23.8h, v2.8h\n"
+    "tbz x4, #2, 13f\n"
+    "ld1 { v31.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 12f\n"
+    "ld1 { v31.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 15f\n"
+    "ld1 { v31.b }[6], [x20]\n"
+    "b 15f\n"
+    "12:"  // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 15f\n"
+    "ld1 { v31.b }[4], [x20]\n"
+    "b 15f\n"
+    "13:"  // Oddments: Load (1, 3): Bit 2: Unset
+    "tbz x4, #1, 14f\n"
+    "ld1 { v31.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 15f\n"
+    "ld1 { v31.b }[2], [x20]\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 15f\n"
+    "ld1 { v31.b }[0], [x20]\n"
+    "15:"  // Oddments: Load (1, 3): Bit 2: End
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "ldr x28, [x25, #0x58]\n"
+    "usubl v31.8h, v31.8b, v7.8b\n"
+    "smlal2 v20.4s, v25.8h, v3.8h\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "add x28, x28, x10\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "smlal v10.4s, v31.4h, v2.4h\n"
+    "smlal2 v9.4s, v31.8h, v2.8h\n"
+    "smlal v11.4s, v31.4h, v3.4h\n"
+    "smlal2 v8.4s, v31.8h, v3.8h\n"
+    "tbz x4, #2, 17f\n"
+    "ld1 { v30.s }[0], [x28], #0x4\n"
+    "tbz x4, #1, 16f\n"
+    "ld1 { v30.h }[2], [x28], #0x2\n"
+    "tbz x4, #0, 19f\n"
+    "ld1 { v30.b }[6], [x28]\n"
+    "b 19f\n"
+    "16:"  // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 19f\n"
+    "ld1 { v30.b }[4], [x28]\n"
+    "b 19f\n"
+    "17:"  // Oddments: Load (1, 4): Bit 2: Unset
+    "tbz x4, #1, 18f\n"
+    "ld1 { v30.h }[0], [x28], #0x2\n"
+    "tbz x4, #0, 19f\n"
+    "ld1 { v30.b }[2], [x28]\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 19f\n"
+    "ld1 { v30.b }[0], [x28]\n"
+    "19:"  // Oddments: Load (1, 4): Bit 2: End
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "ldr x0, [x25, #0x60]\n"
+    "usubl v30.8h, v30.8b, v7.8b\n"
+    "smlal2 v20.4s, v24.8h, v4.8h\n"
+    "add x0, x0, x10\n"
+    "smlal v10.4s, v30.4h, v3.4h\n"
+    "smlal2 v9.4s, v30.8h, v3.8h\n"
+    "tbz x4, #2, 21f\n"
+    "ld1 { v27.s }[0], [x0], #0x4\n"
+    "tbz x4, #1, 20f\n"
+    "ld1 { v27.h }[2], [x0], #0x2\n"
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[6], [x0]\n"
+    "b 23f\n"
+    "20:"  // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[4], [x0]\n"
+    "b 23f\n"
+    "21:"  // Oddments: Load (0, 5): Bit 2: Unset
+    "tbz x4, #1, 22f\n"
+    "ld1 { v27.h }[0], [x0], #0x2\n"
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[2], [x0]\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[0], [x0]\n"
+    "23:"  // Oddments: Load (0, 5): Bit 2: End
+    "smlal v11.4s, v30.4h, v4.4h\n"
+    "ldr d0, [x3, #0x28]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal2 v8.4s, v30.8h, v4.8h\n"
+    "ldr x7, [x25, #0x68]\n"
+    "smlal v10.4s, v26.4h, v4.4h\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "smlal2 v9.4s, v26.8h, v4.8h\n"
+    "add x7, x7, x10\n"
+    "smlal v18.4s, v27.4h, v4.4h\n"
+    "smlal2 v5.4s, v27.8h, v4.8h\n"
+    "smlal v15.4s, v29.4h, v0.4h\n"
+    "smlal2 v20.4s, v29.8h, v0.8h\n"
+    "smlal v18.4s, v28.4h, v0.4h\n"
+    "smlal2 v5.4s, v28.8h, v0.8h\n"
+    "smlal v11.4s, v22.4h, v0.4h\n"
+    "smlal2 v8.4s, v22.8h, v0.8h\n"
+    "tbz x4, #2, 25f\n"
+    "ld1 { v25.s }[0], [x7], #0x4\n"
+    "tbz x4, #1, 24f\n"
+    "ld1 { v25.h }[2], [x7], #0x2\n"
+    "tbz x4, #0, 27f\n"
+    "ld1 { v25.b }[6], [x7]\n"
+    "b 27f\n"
+    "24:"  // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 27f\n"
+    "ld1 { v25.b }[4], [x7]\n"
+    "b 27f\n"
+    "25:"  // Oddments: Load (2, 1): Bit 2: Unset
+    "tbz x4, #1, 26f\n"
+    "ld1 { v25.h }[0], [x7], #0x2\n"
+    "tbz x4, #0, 27f\n"
+    "ld1 { v25.b }[2], [x7]\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 27f\n"
+    "ld1 { v25.b }[0], [x7]\n"
+    "27:"  // Oddments: Load (2, 1): Bit 2: End
+    "ldr d1, [x3, #0x30]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v10.4s, v25.4h, v0.4h\n"
+    "ldr x26, [x25, #0x70]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "smlal2 v9.4s, v25.8h, v0.8h\n"
+    "add x26, x26, x10\n"
+    "smlal v15.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "smlal v18.4s, v23.4h, v1.4h\n"
+    "smlal2 v5.4s, v23.8h, v1.8h\n"
+    "smlal v11.4s, v25.4h, v1.4h\n"
+    "smlal2 v8.4s, v25.8h, v1.8h\n"
+    "tbz x4, #2, 29f\n"
+    "ld1 { v24.s }[0], [x26], #0x4\n"
+    "tbz x4, #1, 28f\n"
+    "ld1 { v24.h }[2], [x26], #0x2\n"
+    "tbz x4, #0, 31f\n"
+    "ld1 { v24.b }[6], [x26]\n"
+    "b 31f\n"
+    "28:"  // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 31f\n"
+    "ld1 { v24.b }[4], [x26]\n"
+    "b 31f\n"
+    "29:"  // Oddments: Load (2, 2): Bit 2: Unset
+    "tbz x4, #1, 30f\n"
+    "ld1 { v24.h }[0], [x26], #0x2\n"
+    "tbz x4, #0, 31f\n"
+    "ld1 { v24.b }[2], [x26]\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 31f\n"
+    "ld1 { v24.b }[0], [x26]\n"
+    "31:"  // Oddments: Load (2, 2): Bit 2: End
+    "ldr d2, [x3, #0x38]\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v24.4h, v1.4h\n"
+    "ldr x23, [x25, #0x78]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "smlal2 v9.4s, v24.8h, v1.8h\n"
+    "add x23, x23, x10\n"
+    "smlal v15.4s, v23.4h, v2.4h\n"
+    "smlal2 v20.4s, v23.8h, v2.8h\n"
+    "smlal v18.4s, v31.4h, v2.4h\n"
+    "smlal2 v5.4s, v31.8h, v2.8h\n"
+    "smlal v11.4s, v24.4h, v2.4h\n"
+    "smlal2 v8.4s, v24.8h, v2.8h\n"
+    "tbz x4, #2, 33f\n"
+    "ld1 { v27.s }[0], [x23], #0x4\n"
+    "tbz x4, #1, 32f\n"
+    "ld1 { v27.h }[2], [x23], #0x2\n"
+    "tbz x4, #0, 35f\n"
+    "ld1 { v27.b }[6], [x23]\n"
+    "b 35f\n"
+    "32:"  // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 35f\n"
+    "ld1 { v27.b }[4], [x23]\n"
+    "b 35f\n"
+    "33:"  // Oddments: Load (2, 3): Bit 2: Unset
+    "tbz x4, #1, 34f\n"
+    "ld1 { v27.h }[0], [x23], #0x2\n"
+    "tbz x4, #0, 35f\n"
+    "ld1 { v27.b }[2], [x23]\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 35f\n"
+    "ld1 { v27.b }[0], [x23]\n"
+    "35:"  // Oddments: Load (2, 3): Bit 2: End
+    "ldr d3, [x3, #0x40]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v10.4s, v27.4h, v2.4h\n"
+    "ldr x20, [x25, #0x80]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "smlal2 v9.4s, v27.8h, v2.8h\n"
+    "add x20, x20, x10\n"
+    "smlal v15.4s, v31.4h, v3.4h\n"
+    "smlal2 v20.4s, v31.8h, v3.8h\n"
+    "smlal v18.4s, v30.4h, v3.4h\n"
+    "smlal2 v5.4s, v30.8h, v3.8h\n"
+    "smlal v11.4s, v27.4h, v3.4h\n"
+    "smlal2 v8.4s, v27.8h, v3.8h\n"
+    "tbz x4, #2, 37f\n"
+    "ld1 { v23.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 36f\n"
+    "ld1 { v23.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 39f\n"
+    "ld1 { v23.b }[6], [x20]\n"
+    "b 39f\n"
+    "36:"  // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 39f\n"
+    "ld1 { v23.b }[4], [x20]\n"
+    "b 39f\n"
+    "37:"  // Oddments: Load (2, 4): Bit 2: Unset
+    "tbz x4, #1, 38f\n"
+    "ld1 { v23.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 39f\n"
+    "ld1 { v23.b }[2], [x20]\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 39f\n"
+    "ld1 { v23.b }[0], [x20]\n"
+    "39:"  // Oddments: Load (2, 4): Bit 2: End
+    "ldr d4, [x3, #0x48]\n"
+    "usubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v10.4s, v23.4h, v3.4h\n"
+    "ldr x22, [x25, #0x88]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "smlal2 v9.4s, v23.8h, v3.8h\n"
+    "add x22, x22, x10\n"
+    "smlal v15.4s, v30.4h, v4.4h\n"
+    "smlal2 v20.4s, v30.8h, v4.8h\n"
+    "smlal v18.4s, v26.4h, v4.4h\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "smlal v11.4s, v23.4h, v4.4h\n"
+    "smlal2 v8.4s, v23.8h, v4.8h\n"
+    "tbz x4, #2, 41f\n"
+    "ld1 { v28.s }[0], [x22], #0x4\n"
+    "tbz x4, #1, 40f\n"
+    "ld1 { v28.h }[2], [x22], #0x2\n"
+    "tbz x4, #0, 43f\n"
+    "ld1 { v28.b }[6], [x22]\n"
+    "b 43f\n"
+    "40:"  // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 43f\n"
+    "ld1 { v28.b }[4], [x22]\n"
+    "b 43f\n"
+    "41:"  // Oddments: Load (2, 5): Bit 2: Unset
+    "tbz x4, #1, 42f\n"
+    "ld1 { v28.h }[0], [x22], #0x2\n"
+    "tbz x4, #0, 43f\n"
+    "ld1 { v28.b }[2], [x22]\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 43f\n"
+    "ld1 { v28.b }[0], [x22]\n"
+    "43:"  // Oddments: Load (2, 5): Bit 2: End
+    "ldr d0, [x3, #0x50]\n"
+    "usubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v10.4s, v28.4h, v4.4h\n"
+    "ldr x13, [x25, #0x90]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "smlal2 v9.4s, v28.8h, v4.8h\n"
+    "add x13, x13, x10\n"
+    "smlal v15.4s, v22.4h, v0.4h\n"
+    "smlal2 v20.4s, v22.8h, v0.8h\n"
+    "smlal v18.4s, v25.4h, v0.4h\n"
+    "smlal2 v5.4s, v25.8h, v0.8h\n"
+    "tbz x4, #2, 45f\n"
+    "ld1 { v31.s }[0], [x13], #0x4\n"
+    "tbz x4, #1, 44f\n"
+    "ld1 { v31.h }[2], [x13], #0x2\n"
+    "tbz x4, #0, 47f\n"
+    "ld1 { v31.b }[6], [x13]\n"
+    "b 47f\n"
+    "44:"  // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 47f\n"
+    "ld1 { v31.b }[4], [x13]\n"
+    "b 47f\n"
+    "45:"  // Oddments: Load (3, 0): Bit 2: Unset
+    "tbz x4, #1, 46f\n"
+    "ld1 { v31.h }[0], [x13], #0x2\n"
+    "tbz x4, #0, 47f\n"
+    "ld1 { v31.b }[2], [x13]\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 47f\n"
+    "ld1 { v31.b }[0], [x13]\n"
+    "47:"  // Oddments: Load (3, 0): Bit 2: End
+    "ldr x21, [x25, #0x98]\n"
+    "usubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v11.4s, v31.4h, v0.4h\n"
+    "smlal2 v8.4s, v31.8h, v0.8h\n"
+    "add x21, x21, x10\n"
+    "tbz x4, #2, 49f\n"
+    "ld1 { v30.s }[0], [x21], #0x4\n"
+    "tbz x4, #1, 48f\n"
+    "ld1 { v30.h }[2], [x21], #0x2\n"
+    "tbz x4, #0, 51f\n"
+    "ld1 { v30.b }[6], [x21]\n"
+    "b 51f\n"
+    "48:"  // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 51f\n"
+    "ld1 { v30.b }[4], [x21]\n"
+    "b 51f\n"
+    "49:"  // Oddments: Load (3, 1): Bit 2: Unset
+    "tbz x4, #1, 50f\n"
+    "ld1 { v30.h }[0], [x21], #0x2\n"
+    "tbz x4, #0, 51f\n"
+    "ld1 { v30.b }[2], [x21]\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 51f\n"
+    "ld1 { v30.b }[0], [x21]\n"
+    "51:"  // Oddments: Load (3, 1): Bit 2: End
+    "ldr d1, [x3, #0x58]\n"
+    "usubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v10.4s, v30.4h, v0.4h\n"
+    "ldr x14, [x25, #0xa0]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "smlal2 v9.4s, v30.8h, v0.8h\n"
+    "add x14, x14, x10\n"
+    "smlal v15.4s, v25.4h, v1.4h\n"
+    "smlal2 v20.4s, v25.8h, v1.8h\n"
+    "smlal v18.4s, v24.4h, v1.4h\n"
+    "smlal2 v5.4s, v24.8h, v1.8h\n"
+    "smlal v11.4s, v30.4h, v1.4h\n"
+    "smlal2 v8.4s, v30.8h, v1.8h\n"
+    "tbz x4, #2, 53f\n"
+    "ld1 { v26.s }[0], [x14], #0x4\n"
+    "tbz x4, #1, 52f\n"
+    "ld1 { v26.h }[2], [x14], #0x2\n"
+    "tbz x4, #0, 55f\n"
+    "ld1 { v26.b }[6], [x14]\n"
+    "b 55f\n"
+    "52:"  // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 55f\n"
+    "ld1 { v26.b }[4], [x14]\n"
+    "b 55f\n"
+    "53:"  // Oddments: Load (3, 2): Bit 2: Unset
+    "tbz x4, #1, 54f\n"
+    "ld1 { v26.h }[0], [x14], #0x2\n"
+    "tbz x4, #0, 55f\n"
+    "ld1 { v26.b }[2], [x14]\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 55f\n"
+    "ld1 { v26.b }[0], [x14]\n"
+    "55:"  // Oddments: Load (3, 2): Bit 2: End
+    "ldr d2, [x3, #0x60]\n"
+    "usubl v26.8h, v26.8b, v7.8b\n"
+    "smlal v10.4s, v26.4h, v1.4h\n"
+    "ldr x11, [x25, #0xa8]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "smlal2 v9.4s, v26.8h, v1.8h\n"
+    "add x11, x11, x10\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "smlal2 v20.4s, v24.8h, v2.8h\n"
+    "smlal v18.4s, v27.4h, v2.4h\n"
+    "smlal2 v5.4s, v27.8h, v2.8h\n"
+    "smlal v11.4s, v26.4h, v2.4h\n"
+    "smlal2 v8.4s, v26.8h, v2.8h\n"
+    "tbz x4, #2, 57f\n"
+    "ld1 { v25.s }[0], [x11], #0x4\n"
+    "tbz x4, #1, 56f\n"
+    "ld1 { v25.h }[2], [x11], #0x2\n"
+    "tbz x4, #0, 59f\n"
+    "ld1 { v25.b }[6], [x11]\n"
+    "b 59f\n"
+    "56:"  // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 59f\n"
+    "ld1 { v25.b }[4], [x11]\n"
+    "b 59f\n"
+    "57:"  // Oddments: Load (3, 3): Bit 2: Unset
+    "tbz x4, #1, 58f\n"
+    "ld1 { v25.h }[0], [x11], #0x2\n"
+    "tbz x4, #0, 59f\n"
+    "ld1 { v25.b }[2], [x11]\n"
+    "b 59f\n"
+    "58:"  // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 59f\n"
+    "ld1 { v25.b }[0], [x11]\n"
+    "59:"  // Oddments: Load (3, 3): Bit 2: End
+    "ldr d3, [x3, #0x68]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v10.4s, v25.4h, v2.4h\n"
+    "ldr x24, [x25, #0xb0]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "smlal2 v9.4s, v25.8h, v2.8h\n"
+    "add x24, x24, x10\n"
+    "smlal v15.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "smlal v18.4s, v23.4h, v3.4h\n"
+    "smlal2 v5.4s, v23.8h, v3.8h\n"
+    "smlal v11.4s, v25.4h, v3.4h\n"
+    "smlal2 v8.4s, v25.8h, v3.8h\n"
+    "tbz x4, #2, 61f\n"
+    "ld1 { v24.s }[0], [x24], #0x4\n"
+    "tbz x4, #1, 60f\n"
+    "ld1 { v24.h }[2], [x24], #0x2\n"
+    "tbz x4, #0, 63f\n"
+    "ld1 { v24.b }[6], [x24]\n"
+    "b 63f\n"
+    "60:"  // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 63f\n"
+    "ld1 { v24.b }[4], [x24]\n"
+    "b 63f\n"
+    "61:"  // Oddments: Load (3, 4): Bit 2: Unset
+    "tbz x4, #1, 62f\n"
+    "ld1 { v24.h }[0], [x24], #0x2\n"
+    "tbz x4, #0, 63f\n"
+    "ld1 { v24.b }[2], [x24]\n"
+    "b 63f\n"
+    "62:"  // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 63f\n"
+    "ld1 { v24.b }[0], [x24]\n"
+    "63:"  // Oddments: Load (3, 4): Bit 2: End
+    "ldr d4, [x3, #0x70]\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v24.4h, v3.4h\n"
+    "ldr x0, [x25, #0xb8]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "smlal2 v9.4s, v24.8h, v3.8h\n"
+    "add x0, x0, x10\n"
+    "smlal v15.4s, v23.4h, v4.4h\n"
+    "smlal2 v20.4s, v23.8h, v4.8h\n"
+    "smlal v18.4s, v28.4h, v4.4h\n"
+    "smlal2 v5.4s, v28.8h, v4.8h\n"
+    "smlal v11.4s, v24.4h, v4.4h\n"
+    "smlal2 v8.4s, v24.8h, v4.8h\n"
+    "tbz x4, #2, 65f\n"
+    "ld1 { v22.s }[0], [x0], #0x4\n"
+    "tbz x4, #1, 64f\n"
+    "ld1 { v22.h }[2], [x0], #0x2\n"
+    "tbz x4, #0, 67f\n"
+    "ld1 { v22.b }[6], [x0]\n"
+    "b 67f\n"
+    "64:"  // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 67f\n"
+    "ld1 { v22.b }[4], [x0]\n"
+    "b 67f\n"
+    "65:"  // Oddments: Load (3, 5): Bit 2: Unset
+    "tbz x4, #1, 66f\n"
+    "ld1 { v22.h }[0], [x0], #0x2\n"
+    "tbz x4, #0, 67f\n"
+    "ld1 { v22.b }[2], [x0]\n"
+    "b 67f\n"
+    "66:"  // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 67f\n"
+    "ld1 { v22.b }[0], [x0]\n"
+    "67:"  // Oddments: Load (3, 5): Bit 2: End
+    "ldr d0, [x3, #0x78]\n"
+    "usubl v22.8h, v22.8b, v7.8b\n"
+    "smlal v10.4s, v22.4h, v4.4h\n"
+    "ldr x15, [x25, #0xc0]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "smlal2 v9.4s, v22.8h, v4.8h\n"
+    "add x15, x15, x10\n"
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "smlal2 v20.4s, v31.8h, v0.8h\n"
+    "smlal v18.4s, v30.4h, v0.4h\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "tbz x4, #2, 69f\n"
+    "ld1 { v27.s }[0], [x15], #0x4\n"
+    "tbz x4, #1, 68f\n"
+    "ld1 { v27.h }[2], [x15], #0x2\n"
+    "tbz x4, #0, 71f\n"
+    "ld1 { v27.b }[6], [x15]\n"
+    "b 71f\n"
+    "68:"  // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 71f\n"
+    "ld1 { v27.b }[4], [x15]\n"
+    "b 71f\n"
+    "69:"  // Oddments: Load (4, 0): Bit 2: Unset
+    "tbz x4, #1, 70f\n"
+    "ld1 { v27.h }[0], [x15], #0x2\n"
+    "tbz x4, #0, 71f\n"
+    "ld1 { v27.b }[2], [x15]\n"
+    "b 71f\n"
+    "70:"  // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 71f\n"
+    "ld1 { v27.b }[0], [x15]\n"
+    "71:"  // Oddments: Load (4, 0): Bit 2: End
+    "ldr x9, [x25, #0xc8]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v11.4s, v27.4h, v0.4h\n"
+    "smlal2 v8.4s, v27.8h, v0.8h\n"
+    "add x9, x9, x10\n"
+    "tbz x4, #2, 73f\n"
+    "ld1 { v23.s }[0], [x9], #0x4\n"
+    "tbz x4, #1, 72f\n"
+    "ld1 { v23.h }[2], [x9], #0x2\n"
+    "tbz x4, #0, 75f\n"
+    "ld1 { v23.b }[6], [x9]\n"
+    "b 75f\n"
+    "72:"  // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 75f\n"
+    "ld1 { v23.b }[4], [x9]\n"
+    "b 75f\n"
+    "73:"  // Oddments: Load (4, 1): Bit 2: Unset
+    "tbz x4, #1, 74f\n"
+    "ld1 { v23.h }[0], [x9], #0x2\n"
+    "tbz x4, #0, 75f\n"
+    "ld1 { v23.b }[2], [x9]\n"
+    "b 75f\n"
+    "74:"  // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 75f\n"
+    "ld1 { v23.b }[0], [x9]\n"
+    "75:"  // Oddments: Load (4, 1): Bit 2: End
+    "ldr d1, [x3, #0x80]\n"
+    "usubl v23.8h, v23.8b, v7.8b\n"
+    "smlal v10.4s, v23.4h, v0.4h\n"
+    "ldr x27, [x25, #0xd0]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "smlal2 v9.4s, v23.8h, v0.8h\n"
+    "add x27, x27, x10\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "smlal2 v20.4s, v30.8h, v1.8h\n"
+    "smlal v18.4s, v26.4h, v1.4h\n"
+    "smlal2 v5.4s, v26.8h, v1.8h\n"
+    "smlal v11.4s, v23.4h, v1.4h\n"
+    "smlal2 v8.4s, v23.8h, v1.8h\n"
+    "tbz x4, #2, 77f\n"
+    "ld1 { v31.s }[0], [x27], #0x4\n"
+    "tbz x4, #1, 76f\n"
+    "ld1 { v31.h }[2], [x27], #0x2\n"
+    "tbz x4, #0, 79f\n"
+    "ld1 { v31.b }[6], [x27]\n"
+    "b 79f\n"
+    "76:"  // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 79f\n"
+    "ld1 { v31.b }[4], [x27]\n"
+    "b 79f\n"
+    "77:"  // Oddments: Load (4, 2): Bit 2: Unset
+    "tbz x4, #1, 78f\n"
+    "ld1 { v31.h }[0], [x27], #0x2\n"
+    "tbz x4, #0, 79f\n"
+    "ld1 { v31.b }[2], [x27]\n"
+    "b 79f\n"
+    "78:"  // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 79f\n"
+    "ld1 { v31.b }[0], [x27]\n"
+    "79:"  // Oddments: Load (4, 2): Bit 2: End
+    "ldr d2, [x3, #0x88]\n"
+    "usubl v31.8h, v31.8b, v7.8b\n"
+    "smlal v10.4s, v31.4h, v1.4h\n"
+    "ldr x28, [x25, #0xd8]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "smlal2 v9.4s, v31.8h, v1.8h\n"
+    "add x28, x28, x10\n"
+    "smlal v15.4s, v26.4h, v2.4h\n"
+    "smlal2 v20.4s, v26.8h, v2.8h\n"
+    "smlal v18.4s, v25.4h, v2.4h\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "smlal v11.4s, v31.4h, v2.4h\n"
+    "smlal2 v8.4s, v31.8h, v2.8h\n"
+    "tbz x4, #2, 81f\n"
+    "ld1 { v30.s }[0], [x28], #0x4\n"
+    "tbz x4, #1, 80f\n"
+    "ld1 { v30.h }[2], [x28], #0x2\n"
+    "tbz x4, #0, 83f\n"
+    "ld1 { v30.b }[6], [x28]\n"
+    "b 83f\n"
+    "80:"  // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 83f\n"
+    "ld1 { v30.b }[4], [x28]\n"
+    "b 83f\n"
+    "81:"  // Oddments: Load (4, 3): Bit 2: Unset
+    "tbz x4, #1, 82f\n"
+    "ld1 { v30.h }[0], [x28], #0x2\n"
+    "tbz x4, #0, 83f\n"
+    "ld1 { v30.b }[2], [x28]\n"
+    "b 83f\n"
+    "82:"  // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 83f\n"
+    "ld1 { v30.b }[0], [x28]\n"
+    "83:"  // Oddments: Load (4, 3): Bit 2: End
+    "ldr d3, [x3, #0x90]\n"
+    "usubl v30.8h, v30.8b, v7.8b\n"
+    "smlal v10.4s, v30.4h, v2.4h\n"
+    "ldr x12, [x25, #0xe0]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "smlal2 v9.4s, v30.8h, v2.8h\n"
+    "add x12, x12, x10\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "smlal2 v20.4s, v25.8h, v3.8h\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "smlal v11.4s, v30.4h, v3.4h\n"
+    "smlal2 v8.4s, v30.8h, v3.8h\n"
+    "tbz x4, #2, 85f\n"
+    "ld1 { v28.s }[0], [x12], #0x4\n"
+    "tbz x4, #1, 84f\n"
+    "ld1 { v28.h }[2], [x12], #0x2\n"
+    "tbz x4, #0, 87f\n"
+    "ld1 { v28.b }[6], [x12]\n"
+    "b 87f\n"
+    "84:"  // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 87f\n"
+    "ld1 { v28.b }[4], [x12]\n"
+    "b 87f\n"
+    "85:"  // Oddments: Load (4, 4): Bit 2: Unset
+    "tbz x4, #1, 86f\n"
+    "ld1 { v28.h }[0], [x12], #0x2\n"
+    "tbz x4, #0, 87f\n"
+    "ld1 { v28.b }[2], [x12]\n"
+    "b 87f\n"
+    "86:"  // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 87f\n"
+    "ld1 { v28.b }[0], [x12]\n"
+    "87:"  // Oddments: Load (4, 4): Bit 2: End
+    "ldr d4, [x3, #0x98]\n"
+    "usubl v28.8h, v28.8b, v7.8b\n"
+    "smlal v10.4s, v28.4h, v3.4h\n"
+    "ldr x7, [x25, #0xe8]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "smlal2 v9.4s, v28.8h, v3.8h\n"
+    "add x7, x7, x10\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v20.4s, v24.8h, v4.8h\n"
+    "smlal v18.4s, v22.4h, v4.4h\n"
+    "smlal2 v5.4s, v22.8h, v4.8h\n"
+    "smlal v11.4s, v28.4h, v4.4h\n"
+    "smlal2 v8.4s, v28.8h, v4.8h\n"
+    "tbz x4, #2, 89f\n"
+    "ld1 { v26.s }[0], [x7], #0x4\n"
+    "tbz x4, #1, 88f\n"
+    "ld1 { v26.h }[2], [x7], #0x2\n"
+    "tbz x4, #0, 91f\n"
+    "ld1 { v26.b }[6], [x7]\n"
+    "b 91f\n"
+    "88:"  // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 91f\n"
+    "ld1 { v26.b }[4], [x7]\n"
+    "b 91f\n"
+    "89:"  // Oddments: Load (4, 5): Bit 2: Unset
+    "tbz x4, #1, 90f\n"
+    "ld1 { v26.h }[0], [x7], #0x2\n"
+    "tbz x4, #0, 91f\n"
+    "ld1 { v26.b }[2], [x7]\n"
+    "b 91f\n"
+    "90:"  // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 91f\n"
+    "ld1 { v26.b }[0], [x7]\n"
+    "91:"  // Oddments: Load (4, 5): Bit 2: End
+    "ldr d0, [x3, #0xa0]\n"
+    "usubl v26.8h, v26.8b, v7.8b\n"
+    "smlal v10.4s, v26.4h, v4.4h\n"
+    "ldr x26, [x25, #0xf0]\n"
+    "usubl v0.8h, v0.8b, v13.8b\n"
+    "smlal2 v9.4s, v26.8h, v4.8h\n"
+    "add x26, x26, x10\n"
+    "smlal v15.4s, v27.4h, v0.4h\n"
+    "smlal2 v20.4s, v27.8h, v0.8h\n"
+    "smlal v18.4s, v23.4h, v0.4h\n"
+    "smlal2 v5.4s, v23.8h, v0.8h\n"
+    "tbz x4, #2, 93f\n"
+    "ld1 { v25.s }[0], [x26], #0x4\n"
+    "tbz x4, #1, 92f\n"
+    "ld1 { v25.h }[2], [x26], #0x2\n"
+    "tbz x4, #0, 95f\n"
+    "ld1 { v25.b }[6], [x26]\n"
+    "b 95f\n"
+    "92:"  // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 95f\n"
+    "ld1 { v25.b }[4], [x26]\n"
+    "b 95f\n"
+    "93:"  // Oddments: Load (5, 0): Bit 2: Unset
+    "tbz x4, #1, 94f\n"
+    "ld1 { v25.h }[0], [x26], #0x2\n"
+    "tbz x4, #0, 95f\n"
+    "ld1 { v25.b }[2], [x26]\n"
+    "b 95f\n"
+    "94:"  // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 95f\n"
+    "ld1 { v25.b }[0], [x26]\n"
+    "95:"  // Oddments: Load (5, 0): Bit 2: End
+    "ldr x23, [x25, #0xf8]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v11.4s, v25.4h, v0.4h\n"
+    "smlal2 v8.4s, v25.8h, v0.8h\n"
+    "add x23, x23, x10\n"
+    "tbz x4, #2, 97f\n"
+    "ld1 { v24.s }[0], [x23], #0x4\n"
+    "tbz x4, #1, 96f\n"
+    "ld1 { v24.h }[2], [x23], #0x2\n"
+    "tbz x4, #0, 99f\n"
+    "ld1 { v24.b }[6], [x23]\n"
+    "b 99f\n"
+    "96:"  // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 99f\n"
+    "ld1 { v24.b }[4], [x23]\n"
+    "b 99f\n"
+    "97:"  // Oddments: Load (5, 1): Bit 2: Unset
+    "tbz x4, #1, 98f\n"
+    "ld1 { v24.h }[0], [x23], #0x2\n"
+    "tbz x4, #0, 99f\n"
+    "ld1 { v24.b }[2], [x23]\n"
+    "b 99f\n"
+    "98:"  // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 99f\n"
+    "ld1 { v24.b }[0], [x23]\n"
+    "99:"  // Oddments: Load (5, 1): Bit 2: End
+    "ldr d1, [x3, #0xa8]\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v24.4h, v0.4h\n"
+    "ldr x22, [x25, #0x100]\n"
+    "usubl v1.8h, v1.8b, v13.8b\n"
+    "smlal2 v9.4s, v24.8h, v0.8h\n"
+    "add x22, x22, x10\n"
+    "smlal v15.4s, v23.4h, v1.4h\n"
+    "smlal2 v20.4s, v23.8h, v1.8h\n"
+    "smlal v18.4s, v31.4h, v1.4h\n"
+    "smlal2 v5.4s, v31.8h, v1.8h\n"
+    "smlal v11.4s, v24.4h, v1.4h\n"
+    "smlal2 v8.4s, v24.8h, v1.8h\n"
+    "tbz x4, #2, 101f\n"
+    "ld1 { v27.s }[0], [x22], #0x4\n"
+    "tbz x4, #1, 100f\n"
+    "ld1 { v27.h }[2], [x22], #0x2\n"
+    "tbz x4, #0, 103f\n"
+    "ld1 { v27.b }[6], [x22]\n"
+    "b 103f\n"
+    "100:"  // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 103f\n"
+    "ld1 { v27.b }[4], [x22]\n"
+    "b 103f\n"
+    "101:"  // Oddments: Load (5, 2): Bit 2: Unset
+    "tbz x4, #1, 102f\n"
+    "ld1 { v27.h }[0], [x22], #0x2\n"
+    "tbz x4, #0, 103f\n"
+    "ld1 { v27.b }[2], [x22]\n"
+    "b 103f\n"
+    "102:"  // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 103f\n"
+    "ld1 { v27.b }[0], [x22]\n"
+    "103:"  // Oddments: Load (5, 2): Bit 2: End
+    "ldr d2, [x3, #0xb0]\n"
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v10.4s, v27.4h, v1.4h\n"
+    "ldr x20, [x25, #0x108]\n"
+    "usubl v2.8h, v2.8b, v13.8b\n"
+    "smlal2 v9.4s, v27.8h, v1.8h\n"
+    "add x20, x20, x10\n"
+    "smlal v15.4s, v31.4h, v2.4h\n"
+    "smlal2 v20.4s, v31.8h, v2.8h\n"
+    "smlal v18.4s, v30.4h, v2.4h\n"
+    "smlal2 v5.4s, v30.8h, v2.8h\n"
+    "smlal v11.4s, v27.4h, v2.4h\n"
+    "smlal2 v8.4s, v27.8h, v2.8h\n"
+    "tbz x4, #2, 105f\n"
+    "ld1 { v25.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 104f\n"
+    "ld1 { v25.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 107f\n"
+    "ld1 { v25.b }[6], [x20]\n"
+    "b 107f\n"
+    "104:"  // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 107f\n"
+    "ld1 { v25.b }[4], [x20]\n"
+    "b 107f\n"
+    "105:"  // Oddments: Load (5, 3): Bit 2: Unset
+    "tbz x4, #1, 106f\n"
+    "ld1 { v25.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 107f\n"
+    "ld1 { v25.b }[2], [x20]\n"
+    "b 107f\n"
+    "106:"  // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 107f\n"
+    "ld1 { v25.b }[0], [x20]\n"
+    "107:"  // Oddments: Load (5, 3): Bit 2: End
+    "ldr d3, [x3, #0xb8]\n"
+    "usubl v25.8h, v25.8b, v7.8b\n"
+    "smlal v10.4s, v25.4h, v2.4h\n"
+    "ldr x13, [x25, #0x110]\n"
+    "usubl v3.8h, v3.8b, v13.8b\n"
+    "smlal2 v9.4s, v25.8h, v2.8h\n"
+    "add x13, x13, x10\n"
+    "smlal v15.4s, v30.4h, v3.4h\n"
+    "smlal2 v20.4s, v30.8h, v3.8h\n"
+    "smlal v18.4s, v28.4h, v3.4h\n"
+    "smlal2 v5.4s, v28.8h, v3.8h\n"
+    "smlal v11.4s, v25.4h, v3.4h\n"
+    "smlal2 v8.4s, v25.8h, v3.8h\n"
+    "tbz x4, #2, 109f\n"
+    "ld1 { v24.s }[0], [x13], #0x4\n"
+    "tbz x4, #1, 108f\n"
+    "ld1 { v24.h }[2], [x13], #0x2\n"
+    "tbz x4, #0, 111f\n"
+    "ld1 { v24.b }[6], [x13]\n"
+    "b 111f\n"
+    "108:"  // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 111f\n"
+    "ld1 { v24.b }[4], [x13]\n"
+    "b 111f\n"
+    "109:"  // Oddments: Load (5, 4): Bit 2: Unset
+    "tbz x4, #1, 110f\n"
+    "ld1 { v24.h }[0], [x13], #0x2\n"
+    "tbz x4, #0, 111f\n"
+    "ld1 { v24.b }[2], [x13]\n"
+    "b 111f\n"
+    "110:"  // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 111f\n"
+    "ld1 { v24.b }[0], [x13]\n"
+    "111:"  // Oddments: Load (5, 4): Bit 2: End
+    "ldr d4, [x3, #0xc0]\n"
+    "usubl v24.8h, v24.8b, v7.8b\n"
+    "smlal v10.4s, v24.4h, v3.4h\n"
+    "ldr x21, [x25, #0x118]\n"
+    "usubl v4.8h, v4.8b, v13.8b\n"
+    "smlal2 v9.4s, v24.8h, v3.8h\n"
+    "add x21, x21, x10\n"
+    "smlal v15.4s, v28.4h, v4.4h\n"
+    "smlal2 v20.4s, v28.8h, v4.8h\n"
+    "smlal v18.4s, v26.4h, v4.4h\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "smlal v11.4s, v24.4h, v4.4h\n"
+    "smlal2 v8.4s, v24.8h, v4.8h\n"
+    "tbz x4, #2, 113f\n"
+    "ld1 { v27.s }[0], [x21], #0x4\n"
+    "tbz x4, #1, 112f\n"
+    "ld1 { v27.h }[2], [x21], #0x2\n"
+    "tbz x4, #0, 115f\n"
+    "ld1 { v27.b }[6], [x21]\n"
+    "b 115f\n"
+    "112:"  // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 115f\n"
+    "ld1 { v27.b }[4], [x21]\n"
+    "b 115f\n"
+    "113:"  // Oddments: Load (5, 5): Bit 2: Unset
+    "tbz x4, #1, 114f\n"
+    "ld1 { v27.h }[0], [x21], #0x2\n"
+    "tbz x4, #0, 115f\n"
+    "ld1 { v27.b }[2], [x21]\n"
+    "b 115f\n"
+    "114:"  // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 115f\n"
+    "ld1 { v27.b }[0], [x21]\n"
+    "115:"  // Oddments: Load (5, 5): Bit 2: End
+    "usubl v27.8h, v27.8b, v7.8b\n"
+    "smlal v10.4s, v27.4h, v4.4h\n"
+    "smlal2 v9.4s, v27.8h, v4.8h\n"
+    "tbz x4, #2, 117f\n"
+    "ld1 { v6.4s }, [x2], #0x10\n"
+    "ld1 { v21.4s }, [x5], #0x10\n"
+    "tbz x4, #1, 116f\n"
+    "ld1 { v17.d }[0], [x2], #0x8\n"
+    "ld1 { v14.d }[0], [x5], #0x8\n"
+    "tbz x4, #0, 119f\n"
+    "ld1 { v17.s }[2], [x2]\n"
+    "ld1 { v14.s }[2], [x5]\n"
+    "b 119f\n"
+    "116:"  // Oddments: Load requant params: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 119f\n"
+    "ld1 { v17.s }[0], [x2]\n"
+    "ld1 { v14.s }[0], [x5]\n"
+    "b 119f\n"
+    "117:"  // Oddments: Load requant params: Bit 2: Unset
+    "tbz x4, #1, 118f\n"
+    "ld1 { v6.d }[0], [x2], #0x8\n"
+    "ld1 { v21.d }[0], [x5], #0x8\n"
+    "tbz x4, #0, 119f\n"
+    "ld1 { v6.s }[2], [x2]\n"
+    "ld1 { v21.s }[2], [x5]\n"
+    "b 119f\n"
+    "118:"  // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 119f\n"
+    "ld1 { v6.s }[0], [x2]\n"
+    "ld1 { v21.s }[0], [x5]\n"
+    "119:"  // Oddments: Load requant params: Bit 2: End
+    "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+    "add x17, x17, x1\n"
+    "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+    "add x16, x16, x1\n"
+    "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+    "add x6, x6, x1\n"
+    "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+    "add x8, x8, x1\n"
+    "sqrdmulh v11.4s, v11.4s, v6.4s\n"
+    "and v1.16b, v15.16b, v21.16b\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "and v29.16b, v20.16b, v14.16b\n"
+    "and v3.16b, v18.16b, v21.16b\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "and v2.16b, v5.16b, v14.16b\n"
+    "and v0.16b, v11.16b, v21.16b\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+    "sshr v2.4s, v2.4s, #0x1f\n"
+    "sqadd v15.4s, v15.4s, v1.4s\n"
+    "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+    "sqadd v20.4s, v20.4s, v29.4s\n"
+    "sqadd v18.4s, v18.4s, v3.4s\n"
+    "srshl v15.4s, v15.4s, v21.4s\n"
+    "sqadd v5.4s, v5.4s, v2.4s\n"
+    "srshl v20.4s, v20.4s, v14.4s\n"
+    "srshl v18.4s, v18.4s, v21.4s\n"
+    "add v15.4s, v15.4s, v19.4s\n"
+    "srshl v5.4s, v5.4s, v14.4s\n"
+    "add v20.4s, v20.4s, v19.4s\n"
+    "smin v15.4s, v15.4s, v12.4s\n"
+    "add v18.4s, v18.4s, v19.4s\n"
+    "smin v20.4s, v20.4s, v12.4s\n"
+    "smax v15.4s, v15.4s, v16.4s\n"
+    "smin v18.4s, v18.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v16.4s\n"
+    "add v5.4s, v5.4s, v19.4s\n"
+    "smax v18.4s, v18.4s, v16.4s\n"
+    "uzp1 v15.16b, v15.16b, v20.16b\n"
+    "smin v5.4s, v5.4s, v12.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "sqadd v11.4s, v11.4s, v0.4s\n"
+    "smax v5.4s, v5.4s, v16.4s\n"
+    "and v27.16b, v8.16b, v14.16b\n"
+    "sshr v27.4s, v27.4s, #0x1f\n"
+    "uzp1 v18.16b, v18.16b, v5.16b\n"
+    "srshl v11.4s, v11.4s, v21.4s\n"
+    "and v30.16b, v10.16b, v21.16b\n"
+    "sshr v30.4s, v30.4s, #0x1f\n"
+    "uzp1 v18.16b, v18.16b, v18.16b\n"
+    "add v11.4s, v11.4s, v19.4s\n"
+    "sqadd v8.4s, v8.4s, v27.4s\n"
+    "and v6.16b, v9.16b, v14.16b\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "smin v11.4s, v11.4s, v12.4s\n"
+    "srshl v8.4s, v8.4s, v14.4s\n"
+    "sqadd v10.4s, v10.4s, v30.4s\n"
+    "smax v11.4s, v11.4s, v16.4s\n"
+    "add v8.4s, v8.4s, v19.4s\n"
+    "srshl v10.4s, v10.4s, v21.4s\n"
+    "sqadd v9.4s, v9.4s, v6.4s\n"
+    "smin v8.4s, v8.4s, v12.4s\n"
+    "add v10.4s, v10.4s, v19.4s\n"
+    "srshl v9.4s, v9.4s, v14.4s\n"
+    "smax v8.4s, v8.4s, v16.4s\n"
+    "smin v10.4s, v10.4s, v12.4s\n"
+    "uzp1 v11.16b, v11.16b, v8.16b\n"
+    "add v9.4s, v9.4s, v19.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "smax v10.4s, v10.4s, v16.4s\n"
+    "smin v9.4s, v9.4s, v12.4s\n"
+    "smax v9.4s, v9.4s, v16.4s\n"
+    "uzp1 v10.16b, v10.16b, v9.16b\n"
+    "uzp1 v10.16b, v10.16b, v10.16b\n"
+    "tbz x4, #2, 121f\n"
+    "st1 { v15.s }[0], [x17], #0x4\n"
+    "st1 { v18.s }[0], [x16], #0x4\n"
+    "st1 { v11.s }[0], [x6], #0x4\n"
+    "st1 { v10.s }[0], [x8], #0x4\n"
+    "tbz x4, #1, 120f\n"
+    "st1 { v15.h }[2], [x17], #0x2\n"
+    "st1 { v18.h }[2], [x16], #0x2\n"
+    "st1 { v11.h }[2], [x6], #0x2\n"
+    "st1 { v10.h }[2], [x8], #0x2\n"
+    "tbz x4, #0, 123f\n"
+    "st1 { v15.b }[6], [x17], #0x1\n"
+    "st1 { v18.b }[6], [x16], #0x1\n"
+    "st1 { v11.b }[6], [x6], #0x1\n"
+    "st1 { v10.b }[6], [x8], #0x1\n"
+    "b 123f\n"
+    "120:"  // Oddments: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 123f\n"
+    "st1 { v15.b }[4], [x17], #0x1\n"
+    "st1 { v18.b }[4], [x16], #0x1\n"
+    "st1 { v11.b }[4], [x6], #0x1\n"
+    "st1 { v10.b }[4], [x8], #0x1\n"
+    "b 123f\n"
+    "121:"  // Oddments: Bit 2: Unset
+    "tbz x4, #1, 122f\n"
+    "st1 { v15.h }[0], [x17], #0x2\n"
+    "st1 { v18.h }[0], [x16], #0x2\n"
+    "st1 { v11.h }[0], [x6], #0x2\n"
+    "st1 { v10.h }[0], [x8], #0x2\n"
+    "tbz x4, #0, 123f\n"
+    "st1 { v15.b }[2], [x17], #0x1\n"
+    "st1 { v18.b }[2], [x16], #0x1\n"
+    "st1 { v11.b }[2], [x6], #0x1\n"
+    "st1 { v10.b }[2], [x8], #0x1\n"
+    "b 123f\n"
+    "122:"  // Oddments: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 123f\n"
+    "st1 { v15.b }[0], [x17], #0x1\n"
+    "st1 { v18.b }[0], [x16], #0x1\n"
+    "st1 { v11.b }[0], [x6], #0x1\n"
+    "st1 { v10.b }[0], [x8], #0x1\n"
+    "123:"  // Oddments: Bit 2: End
+
+    "124:"  // End
+
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000..f5459c2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
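+// Strategy descriptor for the generic u8 quantized depthwise kernel: plain
+// A64 code (VLType::None) that computes nine output points per call.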
+struct a64_u8q_nhwc_generic_output9_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int n_output_points = 9;
+
+  kern_type kernel = a64_u8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+  a64_u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..42d9b2f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
+  const uint8_t *const *const inptrs,
+  uint8_t *const *const outptrs,
+  const void *params,
+  const arm_gemm::Requantize32& qp,
+  const unsigned int n_points,
+  const unsigned int n_channels
+)
+{
+  __asm__ __volatile__(
+    "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ld1r { v12.4s }, [x19]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+    "ld1r { v11.4s }, [x20]\n"
+    "ld1r { v10.16b }, [x19]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v9.16b }, [x20]\n"
+    "ld1r { v8.4s }, [x19]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+    "ld1r { v7.4s }, [x20]\n"
+    "ld1r { v6.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+    "mov x11, #0x0\n"
+    "ld1r { v5.4s }, [x19]\n"
+    "lsr x10, %x[n_channels], #0x2\n"
+    "cbz x10, 6f\n"
+    "1:"  // Channel loop
+    "movi v27.4s, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "lsl x19, x11, #0x2\n"
+    "ldr q27, [%x[bias], x19]\n"
+    "2:"  // Channel loop: Load bias: Done
+    "mov v26.16b, v27.16b\n"
+    "ldr s16, [%x[params]], #0x4\n"
+    "mov x20, %x[inptrs]\n"
+    "mov v25.16b, v27.16b\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "subs x19, %x[n_points], #0x1\n"
+    "mov v24.16b, v27.16b\n"
+    "ldr s4, [x9, x11]\n"
+    "mov v23.16b, v27.16b\n"
+    "mov v22.16b, v27.16b\n"
+    "ldr s3, [x28, x11]\n"
+    "mov v21.16b, v27.16b\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "mov v20.16b, v27.16b\n"
+    "ldr s2, [x27, x11]\n"
+    "mov v19.16b, v27.16b\n"
+    "usubl v16.8h, v16.8b, v9.8b\n"
+    "ldr s1, [x26, x11]\n"
+    "usubl v4.8h, v4.8b, v10.8b\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "usubl v3.8h, v3.8b, v10.8b\n"
+    "ldr s0, [x25, x11]\n"
+    "usubl v2.8h, v2.8b, v10.8b\n"
+    "usubl v1.8h, v1.8b, v10.8b\n"
+    "ldr s31, [x24, x11]\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "usubl v0.8h, v0.8b, v10.8b\n"
+    "ldr s30, [x23, x11]\n"
+    "ldr s29, [x22, x11]\n"
+    "usubl v31.8h, v31.8b, v10.8b\n"
+    "ldr x21, [x20], #0x8\n"
+    "usubl v30.8h, v30.8b, v10.8b\n"
+    "ldr s28, [x21, x11]\n"
+    "usubl v29.8h, v29.8b, v10.8b\n"
+    "usubl v28.8h, v28.8b, v10.8b\n"
+    "ble 4f\n"
+    "3:"  // Channel loop: Planar loop
+    "smlal v27.4s, v4.4h, v16.4h\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "subs x19, x19, #0x1\n"
+    "smlal v26.4s, v3.4h, v16.4h\n"
+    "ldr s4, [x9, x11]\n"
+    "smlal v25.4s, v2.4h, v16.4h\n"
+    "smlal v24.4s, v1.4h, v16.4h\n"
+    "ldr s3, [x28, x11]\n"
+    "smlal v23.4s, v0.4h, v16.4h\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "smlal v22.4s, v31.4h, v16.4h\n"
+    "smlal v21.4s, v30.4h, v16.4h\n"
+    "ldr s2, [x27, x11]\n"
+    "smlal v20.4s, v29.4h, v16.4h\n"
+    "smlal v19.4s, v28.4h, v16.4h\n"
+    "ldr s16, [%x[params]], #0x4\n"
+    "usubl v4.8h, v4.8b, v10.8b\n"
+    "ldr s1, [x26, x11]\n"
+    "usubl v3.8h, v3.8b, v10.8b\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "usubl v2.8h, v2.8b, v10.8b\n"
+    "ldr s0, [x25, x11]\n"
+    "usubl v16.8h, v16.8b, v9.8b\n"
+    "usubl v1.8h, v1.8b, v10.8b\n"
+    "ldr s31, [x24, x11]\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "usubl v0.8h, v0.8b, v10.8b\n"
+    "ldr s30, [x23, x11]\n"
+    "ldr s29, [x22, x11]\n"
+    "usubl v31.8h, v31.8b, v10.8b\n"
+    "ldr x21, [x20], #0x8\n"
+    "usubl v30.8h, v30.8b, v10.8b\n"
+    "ldr s28, [x21, x11]\n"
+    "usubl v29.8h, v29.8b, v10.8b\n"
+    "usubl v28.8h, v28.8b, v10.8b\n"
+    "bgt 3b\n"
+    "4:"  // Channel loop: Planar tail
+    "smlal v27.4s, v4.4h, v16.4h\n"
+    "smlal v26.4s, v3.4h, v16.4h\n"
+    "smlal v25.4s, v2.4h, v16.4h\n"
+    "smlal v24.4s, v1.4h, v16.4h\n"
+    "smlal v23.4s, v0.4h, v16.4h\n"
+    "smlal v22.4s, v31.4h, v16.4h\n"
+    "smlal v21.4s, v30.4h, v16.4h\n"
+    "smlal v20.4s, v29.4h, v16.4h\n"
+    "smlal v19.4s, v28.4h, v16.4h\n"
+    "cbz %x[rq_mul_ptr], 5f\n"
+    "lsl x19, x11, #0x2\n"
+    "ldr q6, [%x[rq_mul_ptr], x19]\n"
+    "ldr q5, [%x[rq_right_shift_ptr], x19]\n"
+    "cbz %x[rq_left_shift_ptr], 5f\n"
+    "ldr q7, [%x[rq_left_shift_ptr], x19]\n"
+    "5:"  // Channel loop: Load quantisation parameters: Done
+    "sshl v27.4s, v27.4s, v7.4s\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    "sshl v26.4s, v26.4s, v7.4s\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "sshl v25.4s, v25.4s, v7.4s\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "sqrdmulh v25.4s, v25.4s, v6.4s\n"
+    "sshl v24.4s, v24.4s, v7.4s\n"
+    "and v16.16b, v27.16b, v5.16b\n"
+    "and v18.16b, v26.16b, v5.16b\n"
+    "and v17.16b, v25.16b, v5.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "sqadd v26.4s, v26.4s, v18.4s\n"
+    "sqadd v25.4s, v25.4s, v17.4s\n"
+    "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+    "srshl v27.4s, v27.4s, v5.4s\n"
+    "srshl v26.4s, v26.4s, v5.4s\n"
+    "srshl v25.4s, v25.4s, v5.4s\n"
+    "and v16.16b, v24.16b, v5.16b\n"
+    "add v27.4s, v27.4s, v8.4s\n"
+    "add v26.4s, v26.4s, v8.4s\n"
+    "add v25.4s, v25.4s, v8.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v27.4s, v27.4s, v12.4s\n"
+    "smax v26.4s, v26.4s, v12.4s\n"
+    "sqadd v24.4s, v24.4s, v16.4s\n"
+    "smin v27.4s, v27.4s, v11.4s\n"
+    "smin v26.4s, v26.4s, v11.4s\n"
+    "smax v25.4s, v25.4s, v12.4s\n"
+    "srshl v24.4s, v24.4s, v5.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "smin v25.4s, v25.4s, v11.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s27, [x27, x11]\n"
+    "add v24.4s, v24.4s, v8.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x26, x11]\n"
+    "smax v24.4s, v24.4s, v12.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x25, x11]\n"
+    "sshl v23.4s, v23.4s, v7.4s\n"
+    "sshl v22.4s, v22.4s, v7.4s\n"
+    "smin v24.4s, v24.4s, v11.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v6.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sshl v21.4s, v21.4s, v7.4s\n"
+    "and v17.16b, v23.16b, v5.16b\n"
+    "and v16.16b, v22.16b, v5.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x24, x11]\n"
+    "sqadd v23.4s, v23.4s, v17.4s\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "and v16.16b, v21.16b, v5.16b\n"
+    "sshl v20.4s, v20.4s, v7.4s\n"
+    "sshl v19.4s, v19.4s, v7.4s\n"
+    "srshl v23.4s, v23.4s, v5.4s\n"
+    "srshl v22.4s, v22.4s, v5.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v20.4s, v20.4s, v6.4s\n"
+    "add v23.4s, v23.4s, v8.4s\n"
+    "add v22.4s, v22.4s, v8.4s\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "and v17.16b, v20.16b, v5.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v6.4s\n"
+    "smax v23.4s, v23.4s, v12.4s\n"
+    "srshl v21.4s, v21.4s, v5.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v19.16b, v5.16b\n"
+    "smin v23.4s, v23.4s, v11.4s\n"
+    "add v21.4s, v21.4s, v8.4s\n"
+    "sqadd v20.4s, v20.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "smax v21.4s, v21.4s, v12.4s\n"
+    "srshl v20.4s, v20.4s, v5.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "smin v21.4s, v21.4s, v11.4s\n"
+    "add v20.4s, v20.4s, v8.4s\n"
+    "srshl v19.4s, v19.4s, v5.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x23, x11]\n"
+    "add v19.4s, v19.4s, v8.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x22, x11]\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x21, x11]\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x20, x11]\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x19, x11]\n"
+    "add x11, x11, #0x4\n"
+    "cmp x11, x10, LSL #2\n"
+    "blt 1b\n"
+    "6:"  // Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 24f\n"
+    "movi v27.4s, #0x0\n"
+    "cbz %x[bias], 9f\n"
+    "add x19, %x[bias], x11, LSL #2\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ld1 { v27.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v27.s }[2], [x19], #0x4\n"
+    "b 8f\n"
+    "7:"  // Oddments: Load bias: Bit 1: Unset
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v27.s }[0], [x19], #0x4\n"
+    "8:"  // Oddments: Load bias: Bit 1: End
+
+    "9:"  // Oddments: Load bias: Done
+    "mov v26.16b, v27.16b\n"
+    "ldr s16, [%x[params]], #0x4\n"
+    "mov x20, %x[inptrs]\n"
+    "mov v25.16b, v27.16b\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "add x9, x9, x11\n"
+    "mov v24.16b, v27.16b\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "mov v23.16b, v27.16b\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "mov v22.16b, v27.16b\n"
+    "add x28, x28, x11\n"
+    "mov v21.16b, v27.16b\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "mov v20.16b, v27.16b\n"
+    "add x27, x27, x11\n"
+    "mov v19.16b, v27.16b\n"
+    "ldr x21, [x20], #0x8\n"
+    "usubl v16.8h, v16.8b, v9.8b\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ldr h4, [x9], #0x2\n"
+    "ldr h3, [x28], #0x2\n"
+    "ldr h2, [x27], #0x2\n"
+    "ldr h1, [x26], #0x2\n"
+    "ldr h0, [x25], #0x2\n"
+    "ldr h31, [x24], #0x2\n"
+    "ldr h30, [x23], #0x2\n"
+    "ldr h29, [x22], #0x2\n"
+    "ldr h28, [x21], #0x2\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v4.b }[2], [x9], #0x1\n"
+    "ld1 { v3.b }[2], [x28], #0x1\n"
+    "ld1 { v2.b }[2], [x27], #0x1\n"
+    "ld1 { v1.b }[2], [x26], #0x1\n"
+    "ld1 { v0.b }[2], [x25], #0x1\n"
+    "ld1 { v31.b }[2], [x24], #0x1\n"
+    "ld1 { v30.b }[2], [x23], #0x1\n"
+    "ld1 { v29.b }[2], [x22], #0x1\n"
+    "ld1 { v28.b }[2], [x21], #0x1\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load: Bit 1: Unset
+    "tbz %x[n_channels], #0, 11f\n"
+    "ldr b4, [x9], #0x1\n"
+    "ldr b3, [x28], #0x1\n"
+    "ldr b2, [x27], #0x1\n"
+    "ldr b1, [x26], #0x1\n"
+    "ldr b0, [x25], #0x1\n"
+    "ldr b31, [x24], #0x1\n"
+    "ldr b30, [x23], #0x1\n"
+    "ldr b29, [x22], #0x1\n"
+    "ldr b28, [x21], #0x1\n"
+    "11:"  // Oddments: Load: Bit 1: End
+    "usubl v4.8h, v4.8b, v10.8b\n"
+    "subs x19, %x[n_points], #0x1\n"
+    "usubl v3.8h, v3.8b, v10.8b\n"
+    "usubl v2.8h, v2.8b, v10.8b\n"
+    "usubl v1.8h, v1.8b, v10.8b\n"
+    "usubl v0.8h, v0.8b, v10.8b\n"
+    "usubl v31.8h, v31.8b, v10.8b\n"
+    "usubl v30.8h, v30.8b, v10.8b\n"
+    "usubl v29.8h, v29.8b, v10.8b\n"
+    "usubl v28.8h, v28.8b, v10.8b\n"
+    "ble 15f\n"
+    "12:"  // Oddments: Planar loop
+    "smlal v27.4s, v4.4h, v16.4h\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "add x9, x9, x11\n"
+    "smlal v26.4s, v3.4h, v16.4h\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "smlal v25.4s, v2.4h, v16.4h\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "smlal v24.4s, v1.4h, v16.4h\n"
+    "add x28, x28, x11\n"
+    "smlal v23.4s, v0.4h, v16.4h\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "smlal v22.4s, v31.4h, v16.4h\n"
+    "add x27, x27, x11\n"
+    "smlal v21.4s, v30.4h, v16.4h\n"
+    "ldr x21, [x20], #0x8\n"
+    "smlal v20.4s, v29.4h, v16.4h\n"
+    "add x26, x26, x11\n"
+    "smlal v19.4s, v28.4h, v16.4h\n"
+    "ldr s16, [%x[params]], #0x4\n"
+    "add x25, x25, x11\n"
+    "usubl v16.8h, v16.8b, v9.8b\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr h4, [x9], #0x2\n"
+    "ldr h3, [x28], #0x2\n"
+    "ldr h2, [x27], #0x2\n"
+    "ldr h1, [x26], #0x2\n"
+    "ldr h0, [x25], #0x2\n"
+    "ldr h31, [x24], #0x2\n"
+    "ldr h30, [x23], #0x2\n"
+    "ldr h29, [x22], #0x2\n"
+    "ldr h28, [x21], #0x2\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v4.b }[2], [x9], #0x1\n"
+    "ld1 { v3.b }[2], [x28], #0x1\n"
+    "ld1 { v2.b }[2], [x27], #0x1\n"
+    "ld1 { v1.b }[2], [x26], #0x1\n"
+    "ld1 { v0.b }[2], [x25], #0x1\n"
+    "ld1 { v31.b }[2], [x24], #0x1\n"
+    "ld1 { v30.b }[2], [x23], #0x1\n"
+    "ld1 { v29.b }[2], [x22], #0x1\n"
+    "ld1 { v28.b }[2], [x21], #0x1\n"
+    "b 14f\n"
+    "13:"  // Oddments: Planar loop: Load: Bit 1: Unset
+    "tbz %x[n_channels], #0, 14f\n"
+    "ldr b4, [x9], #0x1\n"
+    "ldr b3, [x28], #0x1\n"
+    "ldr b2, [x27], #0x1\n"
+    "ldr b1, [x26], #0x1\n"
+    "ldr b0, [x25], #0x1\n"
+    "ldr b31, [x24], #0x1\n"
+    "ldr b30, [x23], #0x1\n"
+    "ldr b29, [x22], #0x1\n"
+    "ldr b28, [x21], #0x1\n"
+    "14:"  // Oddments: Planar loop: Load: Bit 1: End
+    "usubl v4.8h, v4.8b, v10.8b\n"
+    "subs x19, x19, #0x1\n"
+    "usubl v3.8h, v3.8b, v10.8b\n"
+    "usubl v2.8h, v2.8b, v10.8b\n"
+    "usubl v1.8h, v1.8b, v10.8b\n"
+    "usubl v0.8h, v0.8b, v10.8b\n"
+    "usubl v31.8h, v31.8b, v10.8b\n"
+    "usubl v30.8h, v30.8b, v10.8b\n"
+    "usubl v29.8h, v29.8b, v10.8b\n"
+    "usubl v28.8h, v28.8b, v10.8b\n"
+    "bgt 12b\n"
+    "15:"  // Oddments: Planar tail
+    "smlal v27.4s, v4.4h, v16.4h\n"
+    "smlal v26.4s, v3.4h, v16.4h\n"
+    "smlal v25.4s, v2.4h, v16.4h\n"
+    "smlal v24.4s, v1.4h, v16.4h\n"
+    "smlal v23.4s, v0.4h, v16.4h\n"
+    "smlal v22.4s, v31.4h, v16.4h\n"
+    "smlal v21.4s, v30.4h, v16.4h\n"
+    "smlal v20.4s, v29.4h, v16.4h\n"
+    "smlal v19.4s, v28.4h, v16.4h\n"
+    "cbz %x[rq_mul_ptr], 21f\n"
+    "add x21, %x[rq_mul_ptr], x11, LSL #2\n"
+    "add x20, %x[rq_right_shift_ptr], x11, LSL #2\n"
+    "add x19, %x[rq_left_shift_ptr], x11, LSL #2\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v6.d }[0], [x21], #0x8\n"
+    "ld1 { v5.d }[0], [x20], #0x8\n"
+    "cbz %x[rq_left_shift_ptr], 16f\n"
+    "ld1 { v7.d }[0], [x19], #0x8\n"
+    "16:"  // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v6.s }[2], [x21], #0x4\n"
+    "ld1 { v5.s }[2], [x20], #0x4\n"
+    "cbz %x[rq_left_shift_ptr], 17f\n"
+    "ld1 { v7.s }[2], [x19], #0x4\n"
+    "17:"  // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+    "b 20f\n"
+    "18:"  // Oddments: Load quantisation parameters: Bit 1: Unset
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v6.s }[0], [x21], #0x4\n"
+    "ld1 { v5.s }[0], [x20], #0x4\n"
+    "cbz %x[rq_left_shift_ptr], 19f\n"
+    "ld1 { v7.s }[0], [x19], #0x4\n"
+    "19:"  // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+
+    "20:"  // Oddments: Load quantisation parameters: Bit 1: End
+
+    "21:"  // Oddments: Load quantisation parameters: Done
+    "sshl v27.4s, v27.4s, v7.4s\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    "add x27, x27, x11\n"
+    "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "sshl v26.4s, v26.4s, v7.4s\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    "add x26, x26, x11\n"
+    "sshl v25.4s, v25.4s, v7.4s\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    "sshl v24.4s, v24.4s, v7.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x25, x25, x11\n"
+    "and v16.16b, v27.16b, v5.16b\n"
+    "add x24, x24, x11\n"
+    "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+    "add x23, x23, x11\n"
+    "sqrdmulh v25.4s, v25.4s, v6.4s\n"
+    "add x22, x22, x11\n"
+    "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+    "add x21, x21, x11\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add x20, x20, x11\n"
+    "and v18.16b, v26.16b, v5.16b\n"
+    "add x19, x19, x11\n"
+    "and v17.16b, v25.16b, v5.16b\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v24.16b, v5.16b\n"
+    "srshl v27.4s, v27.4s, v5.4s\n"
+    "sqadd v26.4s, v26.4s, v18.4s\n"
+    "sqadd v25.4s, v25.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v27.4s, v27.4s, v8.4s\n"
+    "srshl v26.4s, v26.4s, v5.4s\n"
+    "srshl v25.4s, v25.4s, v5.4s\n"
+    "sqadd v24.4s, v24.4s, v16.4s\n"
+    "smax v27.4s, v27.4s, v12.4s\n"
+    "add v26.4s, v26.4s, v8.4s\n"
+    "add v25.4s, v25.4s, v8.4s\n"
+    "srshl v24.4s, v24.4s, v5.4s\n"
+    "smin v27.4s, v27.4s, v11.4s\n"
+    "smax v26.4s, v26.4s, v12.4s\n"
+    "smax v25.4s, v25.4s, v12.4s\n"
+    "add v24.4s, v24.4s, v8.4s\n"
+    "smin v26.4s, v26.4s, v11.4s\n"
+    "smin v25.4s, v25.4s, v11.4s\n"
+    "smax v24.4s, v24.4s, v12.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "smin v24.4s, v24.4s, v11.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sshl v23.4s, v23.4s, v7.4s\n"
+    "sshl v22.4s, v22.4s, v7.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v6.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+    "sshl v21.4s, v21.4s, v7.4s\n"
+    "sshl v20.4s, v20.4s, v7.4s\n"
+    "and v17.16b, v23.16b, v5.16b\n"
+    "and v16.16b, v22.16b, v5.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v20.4s, v20.4s, v6.4s\n"
+    "sqadd v23.4s, v23.4s, v17.4s\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "and v16.16b, v21.16b, v5.16b\n"
+    "and v17.16b, v20.16b, v5.16b\n"
+    "srshl v23.4s, v23.4s, v5.4s\n"
+    "srshl v22.4s, v22.4s, v5.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "add v23.4s, v23.4s, v8.4s\n"
+    "add v22.4s, v22.4s, v8.4s\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "sqadd v20.4s, v20.4s, v17.4s\n"
+    "smax v23.4s, v23.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "srshl v21.4s, v21.4s, v5.4s\n"
+    "srshl v20.4s, v20.4s, v5.4s\n"
+    "smin v23.4s, v23.4s, v11.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "add v21.4s, v21.4s, v8.4s\n"
+    "add v20.4s, v20.4s, v8.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smax v21.4s, v21.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smin v21.4s, v21.4s, v11.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "sshl v19.4s, v19.4s, v7.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v6.4s\n"
+    "and v16.16b, v19.16b, v5.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "srshl v19.4s, v19.4s, v5.4s\n"
+    "add v19.4s, v19.4s, v8.4s\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "st1 { v27.h }[0], [x27], #0x2\n"
+    "st1 { v26.h }[0], [x26], #0x2\n"
+    "st1 { v25.h }[0], [x25], #0x2\n"
+    "st1 { v24.h }[0], [x24], #0x2\n"
+    "st1 { v23.h }[0], [x23], #0x2\n"
+    "st1 { v22.h }[0], [x22], #0x2\n"
+    "st1 { v21.h }[0], [x21], #0x2\n"
+    "st1 { v20.h }[0], [x20], #0x2\n"
+    "st1 { v19.h }[0], [x19], #0x2\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "st1 { v27.b }[2], [x27], #0x1\n"
+    "st1 { v26.b }[2], [x26], #0x1\n"
+    "st1 { v25.b }[2], [x25], #0x1\n"
+    "st1 { v24.b }[2], [x24], #0x1\n"
+    "st1 { v23.b }[2], [x23], #0x1\n"
+    "st1 { v22.b }[2], [x22], #0x1\n"
+    "st1 { v21.b }[2], [x21], #0x1\n"
+    "st1 { v20.b }[2], [x20], #0x1\n"
+    "st1 { v19.b }[2], [x19], #0x1\n"
+    "b 23f\n"
+    "22:"  // Oddments: Store: Bit 1: Unset
+    "tbz %x[n_channels], #0, 23f\n"
+    "st1 { v27.b }[0], [x27], #0x1\n"
+    "st1 { v26.b }[0], [x26], #0x1\n"
+    "st1 { v25.b }[0], [x25], #0x1\n"
+    "st1 { v24.b }[0], [x24], #0x1\n"
+    "st1 { v23.b }[0], [x23], #0x1\n"
+    "st1 { v22.b }[0], [x22], #0x1\n"
+    "st1 { v21.b }[0], [x21], #0x1\n"
+    "st1 { v20.b }[0], [x20], #0x1\n"
+    "st1 { v19.b }[0], [x19], #0x1\n"
+    "23:"  // Oddments: Store: Bit 1: End
+
+    "24:"  // End
+
+    : [params] "+&r" (params)
+    : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000..e8ac603
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+{
+  typedef uint32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 4;
+
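+  // The input tile follows from the output tile and the convolution geometry:
+  // input_rows = (output_rows - 1) * stride_rows + kernel_rows = (2 - 1) * 2 + 3 = 5,
+  // and likewise input_cols = (4 - 1) * 2 + 3 = 9.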
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 9;
+  constexpr static unsigned int input_col_quads = 1;
+
+  kern_type kernel = a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+
+  a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..2106cf7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,536 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+  const uint8_t *const *const inptrs,
+  uint8_t *const *const outptrs,
+  const void *params,
+  unsigned int n_output_channels,
+  const arm_gemm::Requantize32& qp
+)
+{
+  __asm__ __volatile__(
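+    // The shifted ones-vector built below (0x00010101 per 32-bit lane) makes
+    // each indexed udot sum three packed input bytes; the sums are multiplied
+    // by -b_offset and folded into the packed bias as a requantization fixup.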
+    "movi v5.16b, #0x1\n"
+    "ldr x22, [%x[inptrs], #0x0]\n"
+    "add SP, SP, #-0x80\n"
+    "ushr v5.4s, v5.4s, #0x8\n"
+    "ldr x20, [%x[inptrs], #0x8]\n"
+    "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "movi v26.4s, #0x0\n"
+    "ldr x19, [%x[inptrs], #0x10]\n"
+    "mov x11, #0x0\n"
+    "movi v1.4s, #0x0\n"
+    "ld1 { v15.16b }, [x22]\n"
+    "mov x10, #0x0\n"
+    "movi v22.4s, #0x0\n"
+    "ld1 { v29.16b }, [x20]\n"
+    "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "movi v25.4s, #0x0\n"
+    "ld1 { v0.16b }, [x19]\n"
+    "add x28, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "movi v13.4s, #0x0\n"
+    "ldr x20, [%x[inptrs], #0x18]\n"
+    "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "mov v20.16b, v15.16b\n"
+    "ldr x19, [%x[inptrs], #0x20]\n"
+    "cmp %x[n_channels], #0x4\n"
+    "ext v20.16b, v20.16b, v20.16b, #0x2\n"
+    "ld1r { v4.4s }, [x21]\n"
+    "mov v17.16b, v15.16b\n"
+    "ld1 { v2.16b }, [x20]\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+    "ld1 { v7.16b }, [x19]\n"
+    "mov v23.16b, v15.16b\n"
+    "ldp x26, x25, [%x[outptrs], #0x0]\n"
+    "ext v23.16b, v23.16b, v23.16b, #0x6\n"
+    "ldp x24, x23, [%x[outptrs], #0x10]\n"
+    "mov v18.16b, v29.16b\n"
+    "ldp x22, x21, [%x[outptrs], #0x20]\n"
+    "zip1 v15.4s, v15.4s, v17.4s\n"
+    "ldp x20, x19, [%x[outptrs], #0x30]\n"
+    "ext v18.16b, v18.16b, v18.16b, #0x2\n"
+    "ld1r { v14.4s }, [x9]\n"
+    "zip1 v20.4s, v20.4s, v23.4s\n"
+    "ld1r { v27.4s }, [x28]\n"
+    "zip1 v15.4s, v15.4s, v20.4s\n"
+    "ld1r { v23.4s }, [x27]\n"
+    "mov v17.16b, v29.16b\n"
+    "ldr q6, [%x[params], #0x0]\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+    "ldr q8, [%x[params], #0x10]\n"
+    "mov v11.16b, v29.16b\n"
+    "ldr q9, [%x[params], #0x20]\n"
+    "ext v11.16b, v11.16b, v11.16b, #0x6\n"
+    "ldr q10, [%x[params], #0x30]\n"
+    "add %x[params], %x[params], #0x40\n"
+    "zip1 v29.4s, v29.4s, v17.4s\n"
+    "mov v12.16b, v0.16b\n"
+    "ext v12.16b, v12.16b, v12.16b, #0x2\n"
+    "zip1 v18.4s, v18.4s, v11.4s\n"
+    "zip1 v29.4s, v29.4s, v18.4s\n"
+    "mov v17.16b, v0.16b\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+    "mov v11.16b, v0.16b\n"
+    "ext v11.16b, v11.16b, v11.16b, #0x6\n"
+    "mov v18.16b, v2.16b\n"
+    "zip1 v0.4s, v0.4s, v17.4s\n"
+    "ext v18.16b, v18.16b, v18.16b, #0x2\n"
+    "zip1 v12.4s, v12.4s, v11.4s\n"
+    "zip1 v0.4s, v0.4s, v12.4s\n"
+    "mov v17.16b, v2.16b\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+    "mov v19.16b, v2.16b\n"
+    "ext v19.16b, v19.16b, v19.16b, #0x6\n"
+    "mov v28.16b, v7.16b\n"
+    "zip1 v2.4s, v2.4s, v17.4s\n"
+    "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+    "zip1 v18.4s, v18.4s, v19.4s\n"
+    "zip1 v2.4s, v2.4s, v18.4s\n"
+    "mov v18.16b, v7.16b\n"
+    "ext v18.16b, v18.16b, v18.16b, #0x4\n"
+    "mov v21.16b, v7.16b\n"
+    "ext v21.16b, v21.16b, v21.16b, #0x6\n"
+    "movi v30.4s, #0x0\n"
+    "zip1 v7.4s, v7.4s, v18.4s\n"
+    "movi v3.4s, #0x0\n"
+    "zip1 v28.4s, v28.4s, v21.4s\n"
+    "zip1 v7.4s, v7.4s, v28.4s\n"
+    "movi v12.4s, #0x0\n"
+    "movi v11.4s, #0x0\n"
+    "movi v19.4s, #0x0\n"
+    "movi v21.4s, #0x0\n"
+    "movi v17.4s, #0x0\n"
+    "movi v16.4s, #0x0\n"
+    "movi v28.4s, #0x0\n"
+    "movi v18.4s, #0x0\n"
+    "movi v20.4s, #0x0\n"
+    "movi v24.4s, #0x0\n"
+    "movi v31.4s, #0x0\n"
+    ".inst 0x6f8fe0ba  // udot v26.4s, v5.16b, v15.4b[0]\n"
+    ".inst 0x6fafe0a1  // udot v1.4s, v5.16b, v15.4b[1]\n"
+    ".inst 0x6f8fe8b6  // udot v22.4s, v5.16b, v15.4b[2]\n"
+    ".inst 0x6fafe8b9  // udot v25.4s, v5.16b, v15.4b[3]\n"
+    ".inst 0x6f9de0ad  // udot v13.4s, v5.16b, v29.4b[0]\n"
+    ".inst 0x6fbde0be  // udot v30.4s, v5.16b, v29.4b[1]\n"
+    ".inst 0x6f9de8a3  // udot v3.4s, v5.16b, v29.4b[2]\n"
+    ".inst 0x6fbde8ac  // udot v12.4s, v5.16b, v29.4b[3]\n"
+    ".inst 0x6f80e0ab  // udot v11.4s, v5.16b, v0.4b[0]\n"
+    ".inst 0x6fa0e0b3  // udot v19.4s, v5.16b, v0.4b[1]\n"
+    ".inst 0x6f80e8b5  // udot v21.4s, v5.16b, v0.4b[2]\n"
+    ".inst 0x6fa0e8b1  // udot v17.4s, v5.16b, v0.4b[3]\n"
+    ".inst 0x6f82e0b0  // udot v16.4s, v5.16b, v2.4b[0]\n"
+    ".inst 0x6fa2e0bc  // udot v28.4s, v5.16b, v2.4b[1]\n"
+    ".inst 0x6f82e8b2  // udot v18.4s, v5.16b, v2.4b[2]\n"
+    ".inst 0x6fa2e8b4  // udot v20.4s, v5.16b, v2.4b[3]\n"
+    ".inst 0x6f87e0b8  // udot v24.4s, v5.16b, v7.4b[0]\n"
+    ".inst 0x6fa7e0bf  // udot v31.4s, v5.16b, v7.4b[1]\n"
+    "mov v26.16b, v26.16b\n"
+    "mov v1.16b, v1.16b\n"
+    "mov v22.16b, v22.16b\n"
+    "mov v25.16b, v25.16b\n"
+    "add v26.4s, v26.4s, v13.4s\n"
+    "movi v13.4s, #0x0\n"
+    ".inst 0x6f87e8ad  // udot v13.4s, v5.16b, v7.4b[2]\n"
+    "add v1.4s, v1.4s, v30.4s\n"
+    "movi v30.4s, #0x0\n"
+    ".inst 0x6fa7e8be  // udot v30.4s, v5.16b, v7.4b[3]\n"
+    "add v22.4s, v22.4s, v3.4s\n"
+    "add v25.4s, v25.4s, v12.4s\n"
+    "add v26.4s, v26.4s, v11.4s\n"
+    "add v1.4s, v1.4s, v19.4s\n"
+    "add v22.4s, v22.4s, v21.4s\n"
+    "add v25.4s, v25.4s, v17.4s\n"
+    "mov v11.16b, v11.16b\n"
+    "mov v3.16b, v19.16b\n"
+    "mov v19.16b, v21.16b\n"
+    "mov v21.16b, v17.16b\n"
+    "add v11.4s, v11.4s, v16.4s\n"
+    "add v3.4s, v3.4s, v28.4s\n"
+    "add v19.4s, v19.4s, v18.4s\n"
+    "add v21.4s, v21.4s, v20.4s\n"
+    "add v11.4s, v11.4s, v24.4s\n"
+    "add v3.4s, v3.4s, v31.4s\n"
+    "add v19.4s, v19.4s, v13.4s\n"
+    "add v21.4s, v21.4s, v30.4s\n"
+    "neg v4.4s, v4.4s\n"
+    "mul v26.4s, v26.4s, v4.4s\n"
+    "str q26, [SP, #0x0]\n"
+    "mul v1.4s, v1.4s, v4.4s\n"
+    "mul v22.4s, v22.4s, v4.4s\n"
+    "str q1, [SP, #0x10]\n"
+    "mul v25.4s, v25.4s, v4.4s\n"
+    "mul v11.4s, v11.4s, v4.4s\n"
+    "str q22, [SP, #0x20]\n"
+    "mul v3.4s, v3.4s, v4.4s\n"
+    "str q25, [SP, #0x30]\n"
+    "mul v19.4s, v19.4s, v4.4s\n"
+    "mul v21.4s, v21.4s, v4.4s\n"
+    "str q11, [SP, #0x40]\n"
+    "add v26.4s, v26.4s, v6.4s\n"
+    "str q3, [SP, #0x50]\n"
+    "add v1.4s, v1.4s, v6.4s\n"
+    "str q19, [SP, #0x60]\n"
+    "add v22.4s, v22.4s, v6.4s\n"
+    "add v25.4s, v25.4s, v6.4s\n"
+    "str q21, [SP, #0x70]\n"
+    "add v11.4s, v11.4s, v6.4s\n"
+    "add v3.4s, v3.4s, v6.4s\n"
+    "add v19.4s, v19.4s, v6.4s\n"
+    "add v21.4s, v21.4s, v6.4s\n"
+    "ble 2f\n"
+    "1:"  // Loop
+    ".inst 0x6f8fe11a  // udot v26.4s, v8.16b, v15.4b[0]\n"
+    "ldr q20, [%x[params], #0x0]\n"
+    "add x11, x11, #0x10\n"
+    ".inst 0x6fafe101  // udot v1.4s, v8.16b, v15.4b[1]\n"
+    "ldr q4, [%x[params], #0x10]\n"
+    "sub %x[n_channels], %x[n_channels], #0x4\n"
+    ".inst 0x6f8fe916  // udot v22.4s, v8.16b, v15.4b[2]\n"
+    "ldr q6, [%x[params], #0x20]\n"
+    "cmp %x[n_channels], #0x4\n"
+    ".inst 0x6fafe919  // udot v25.4s, v8.16b, v15.4b[3]\n"
+    ".inst 0x6f80e10b  // udot v11.4s, v8.16b, v0.4b[0]\n"
+    ".inst 0x6fa0e103  // udot v3.4s, v8.16b, v0.4b[1]\n"
+    ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
+    ".inst 0x6fa0e915  // udot v21.4s, v8.16b, v0.4b[3]\n"
+    "ldr q8, [%x[params], #0x30]\n"
+    ".inst 0x6f9de13a  // udot v26.4s, v9.16b, v29.4b[0]\n"
+    ".inst 0x6fbde121  // udot v1.4s, v9.16b, v29.4b[1]\n"
+    ".inst 0x6f9de936  // udot v22.4s, v9.16b, v29.4b[2]\n"
+    ".inst 0x6fbde939  // udot v25.4s, v9.16b, v29.4b[3]\n"
+    ".inst 0x6f82e12b  // udot v11.4s, v9.16b, v2.4b[0]\n"
+    ".inst 0x6fa2e123  // udot v3.4s, v9.16b, v2.4b[1]\n"
+    ".inst 0x6f82e933  // udot v19.4s, v9.16b, v2.4b[2]\n"
+    ".inst 0x6fa2e935  // udot v21.4s, v9.16b, v2.4b[3]\n"
+    "ldr q9, [%x[params], #0x40]\n"
+    ".inst 0x6f80e15a  // udot v26.4s, v10.16b, v0.4b[0]\n"
+    ".inst 0x6fa0e141  // udot v1.4s, v10.16b, v0.4b[1]\n"
+    ".inst 0x6f80e956  // udot v22.4s, v10.16b, v0.4b[2]\n"
+    ".inst 0x6fa0e959  // udot v25.4s, v10.16b, v0.4b[3]\n"
+    ".inst 0x6f87e14b  // udot v11.4s, v10.16b, v7.4b[0]\n"
+    ".inst 0x6fa7e143  // udot v3.4s, v10.16b, v7.4b[1]\n"
+    ".inst 0x6f87e953  // udot v19.4s, v10.16b, v7.4b[2]\n"
+    ".inst 0x6fa7e955  // udot v21.4s, v10.16b, v7.4b[3]\n"
+    "ldr q10, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x60\n"
+    "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+    "sqrdmulh v1.4s, v1.4s, v20.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+    "sqrdmulh v11.4s, v11.4s, v20.4s\n"
+    "and v30.16b, v26.16b, v4.16b\n"
+    "and v17.16b, v1.16b, v4.16b\n"
+    "and v16.16b, v22.16b, v4.16b\n"
+    "sshr v30.4s, v30.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v26.4s, v26.4s, v30.4s\n"
+    "sqadd v1.4s, v1.4s, v17.4s\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "and v16.16b, v25.16b, v4.16b\n"
+    "srshl v26.4s, v26.4s, v4.4s\n"
+    "srshl v1.4s, v1.4s, v4.4s\n"
+    "srshl v22.4s, v22.4s, v4.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v26.4s, v26.4s, v14.4s\n"
+    "add v1.4s, v1.4s, v14.4s\n"
+    "add v22.4s, v22.4s, v14.4s\n"
+    "smin v26.4s, v26.4s, v23.4s\n"
+    "smin v1.4s, v1.4s, v23.4s\n"
+    "smin v22.4s, v22.4s, v23.4s\n"
+    "smax v26.4s, v26.4s, v27.4s\n"
+    "smax v1.4s, v1.4s, v27.4s\n"
+    "smax v22.4s, v22.4s, v27.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v1.16b, v1.16b, v1.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x26, x10]\n"
+    "uzp1 v1.16b, v1.16b, v1.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "ldr q26, [SP, #0x0]\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "str s1, [x25, x10]\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "ldr q1, [SP, #0x10]\n"
+    "and v16.16b, v11.16b, v4.16b\n"
+    "str s22, [x24, x10]\n"
+    "sqrdmulh v3.4s, v3.4s, v20.4s\n"
+    "ldr q22, [SP, #0x20]\n"
+    "srshl v25.4s, v25.4s, v4.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+    "and v17.16b, v3.16b, v4.16b\n"
+    "add v25.4s, v25.4s, v14.4s\n"
+    "sqadd v11.4s, v11.4s, v16.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v25.4s, v25.4s, v23.4s\n"
+    "and v16.16b, v19.16b, v4.16b\n"
+    "srshl v11.4s, v11.4s, v4.4s\n"
+    "smax v25.4s, v25.4s, v27.4s\n"
+    "sqadd v3.4s, v3.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v11.4s, v11.4s, v14.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x23, x10]\n"
+    "smin v11.4s, v11.4s, v23.4s\n"
+    "srshl v3.4s, v3.4s, v4.4s\n"
+    "ldr q25, [SP, #0x30]\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+    "smax v11.4s, v11.4s, v27.4s\n"
+    "add v3.4s, v3.4s, v14.4s\n"
+    "srshl v19.4s, v19.4s, v4.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "smin v3.4s, v3.4s, v23.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "str s11, [x22, x10]\n"
+    "smax v3.4s, v3.4s, v27.4s\n"
+    "add v19.4s, v19.4s, v14.4s\n"
+    "ldr q11, [SP, #0x40]\n"
+    "and v16.16b, v21.16b, v4.16b\n"
+    "add v26.4s, v26.4s, v6.4s\n"
+    "uzp1 v3.16b, v3.16b, v3.16b\n"
+    "smin v19.4s, v19.4s, v23.4s\n"
+    "uzp1 v3.16b, v3.16b, v3.16b\n"
+    "str s3, [x21, x10]\n"
+    "smax v19.4s, v19.4s, v27.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr q3, [SP, #0x50]\n"
+    "add v1.4s, v1.4s, v6.4s\n"
+    "add v22.4s, v22.4s, v6.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x20, x10]\n"
+    "add v25.4s, v25.4s, v6.4s\n"
+    "add v11.4s, v11.4s, v6.4s\n"
+    "ldr q19, [SP, #0x60]\n"
+    "srshl v21.4s, v21.4s, v4.4s\n"
+    "add v3.4s, v3.4s, v6.4s\n"
+    "add v21.4s, v21.4s, v14.4s\n"
+    "add v19.4s, v19.4s, v6.4s\n"
+    "smin v21.4s, v21.4s, v23.4s\n"
+    "smax v21.4s, v21.4s, v27.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x19, x10]\n"
+    "add x10, x10, #0x4\n"
+    "ldr q21, [SP, #0x70]\n"
+    "add v21.4s, v21.4s, v6.4s\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    ".inst 0x6f8fe11a  // udot v26.4s, v8.16b, v15.4b[0]\n"
+    "ldr q20, [%x[params], #0x0]\n"
+    "add x26, x26, x10\n"
+    ".inst 0x6fafe101  // udot v1.4s, v8.16b, v15.4b[1]\n"
+    "ldr q4, [%x[params], #0x10]\n"
+    "add x25, x25, x10\n"
+    ".inst 0x6f8fe916  // udot v22.4s, v8.16b, v15.4b[2]\n"
+    "add x24, x24, x10\n"
+    ".inst 0x6fafe919  // udot v25.4s, v8.16b, v15.4b[3]\n"
+    "add x23, x23, x10\n"
+    ".inst 0x6f80e10b  // udot v11.4s, v8.16b, v0.4b[0]\n"
+    "add x22, x22, x10\n"
+    ".inst 0x6fa0e103  // udot v3.4s, v8.16b, v0.4b[1]\n"
+    "add x21, x21, x10\n"
+    ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
+    "add x20, x20, x10\n"
+    ".inst 0x6fa0e915  // udot v21.4s, v8.16b, v0.4b[3]\n"
+    "add x19, x19, x10\n"
+    ".inst 0x6f9de13a  // udot v26.4s, v9.16b, v29.4b[0]\n"
+    "cmp %x[n_channels], #0x4\n"
+    ".inst 0x6fbde121  // udot v1.4s, v9.16b, v29.4b[1]\n"
+    "add %x[params], %x[params], #0x20\n"
+    ".inst 0x6f9de936  // udot v22.4s, v9.16b, v29.4b[2]\n"
+    ".inst 0x6fbde939  // udot v25.4s, v9.16b, v29.4b[3]\n"
+    ".inst 0x6f82e12b  // udot v11.4s, v9.16b, v2.4b[0]\n"
+    ".inst 0x6fa2e123  // udot v3.4s, v9.16b, v2.4b[1]\n"
+    ".inst 0x6f82e933  // udot v19.4s, v9.16b, v2.4b[2]\n"
+    ".inst 0x6fa2e935  // udot v21.4s, v9.16b, v2.4b[3]\n"
+    ".inst 0x6f80e15a  // udot v26.4s, v10.16b, v0.4b[0]\n"
+    ".inst 0x6fa0e141  // udot v1.4s, v10.16b, v0.4b[1]\n"
+    ".inst 0x6f80e956  // udot v22.4s, v10.16b, v0.4b[2]\n"
+    ".inst 0x6fa0e959  // udot v25.4s, v10.16b, v0.4b[3]\n"
+    ".inst 0x6f87e14b  // udot v11.4s, v10.16b, v7.4b[0]\n"
+    ".inst 0x6fa7e143  // udot v3.4s, v10.16b, v7.4b[1]\n"
+    ".inst 0x6f87e953  // udot v19.4s, v10.16b, v7.4b[2]\n"
+    ".inst 0x6fa7e955  // udot v21.4s, v10.16b, v7.4b[3]\n"
+    "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+    "sqrdmulh v1.4s, v1.4s, v20.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+    "and v30.16b, v26.16b, v4.16b\n"
+    "and v17.16b, v1.16b, v4.16b\n"
+    "and v16.16b, v22.16b, v4.16b\n"
+    "sshr v30.4s, v30.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v26.4s, v26.4s, v30.4s\n"
+    "sqadd v1.4s, v1.4s, v17.4s\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "and v16.16b, v25.16b, v4.16b\n"
+    "srshl v26.4s, v26.4s, v4.4s\n"
+    "srshl v1.4s, v1.4s, v4.4s\n"
+    "srshl v22.4s, v22.4s, v4.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v26.4s, v26.4s, v14.4s\n"
+    "add v1.4s, v1.4s, v14.4s\n"
+    "add v22.4s, v22.4s, v14.4s\n"
+    "smin v26.4s, v26.4s, v23.4s\n"
+    "smin v1.4s, v1.4s, v23.4s\n"
+    "smin v22.4s, v22.4s, v23.4s\n"
+    "smax v26.4s, v26.4s, v27.4s\n"
+    "smax v1.4s, v1.4s, v27.4s\n"
+    "smax v22.4s, v22.4s, v27.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v1.16b, v1.16b, v1.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v1.16b, v1.16b, v1.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqrdmulh v11.4s, v11.4s, v20.4s\n"
+    "sqrdmulh v3.4s, v3.4s, v20.4s\n"
+    "srshl v25.4s, v25.4s, v4.4s\n"
+    "sqrdmulh v19.4s, v19.4s, v20.4s\n"
+    "and v16.16b, v11.16b, v4.16b\n"
+    "and v17.16b, v3.16b, v4.16b\n"
+    "add v25.4s, v25.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v25.4s, v25.4s, v23.4s\n"
+    "sqadd v11.4s, v11.4s, v16.4s\n"
+    "sqadd v3.4s, v3.4s, v17.4s\n"
+    "smax v25.4s, v25.4s, v27.4s\n"
+    "and v16.16b, v19.16b, v4.16b\n"
+    "srshl v11.4s, v11.4s, v4.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "srshl v3.4s, v3.4s, v4.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v11.4s, v11.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v3.4s, v3.4s, v14.4s\n"
+    "smin v11.4s, v11.4s, v23.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smin v3.4s, v3.4s, v23.4s\n"
+    "smax v11.4s, v11.4s, v27.4s\n"
+    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+    "smax v3.4s, v3.4s, v27.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "srshl v19.4s, v19.4s, v4.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "uzp1 v3.16b, v3.16b, v3.16b\n"
+    "and v16.16b, v21.16b, v4.16b\n"
+    "uzp1 v3.16b, v3.16b, v3.16b\n"
+    "add v19.4s, v19.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v19.4s, v19.4s, v23.4s\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "smax v19.4s, v19.4s, v27.4s\n"
+    "srshl v21.4s, v21.4s, v4.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "add v21.4s, v21.4s, v14.4s\n"
+    "smin v21.4s, v21.4s, v23.4s\n"
+    "smax v21.4s, v21.4s, v27.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "blt 3f\n"
+    "str s26, [x26, #0x0]\n"
+    "str s1, [x25, #0x0]\n"
+    "str s22, [x24, #0x0]\n"
+    "str s25, [x23, #0x0]\n"
+    "str s11, [x22, #0x0]\n"
+    "str s3, [x21, #0x0]\n"
+    "str s19, [x20, #0x0]\n"
+    "str s21, [x19, #0x0]\n"
+    "b 4f\n"
+    "3:"  // Tail: Oddments
+    "st1 { v26.b }[0], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v1.b }[0], [x25], #0x1\n"
+    "st1 { v22.b }[0], [x24], #0x1\n"
+    "st1 { v25.b }[0], [x23], #0x1\n"
+    "st1 { v11.b }[0], [x22], #0x1\n"
+    "st1 { v3.b }[0], [x21], #0x1\n"
+    "st1 { v19.b }[0], [x20], #0x1\n"
+    "st1 { v21.b }[0], [x19], #0x1\n"
+    "beq 4f\n"
+    "st1 { v26.b }[1], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v1.b }[1], [x25], #0x1\n"
+    "st1 { v22.b }[1], [x24], #0x1\n"
+    "st1 { v25.b }[1], [x23], #0x1\n"
+    "st1 { v11.b }[1], [x22], #0x1\n"
+    "st1 { v3.b }[1], [x21], #0x1\n"
+    "st1 { v19.b }[1], [x20], #0x1\n"
+    "st1 { v21.b }[1], [x19], #0x1\n"
+    "beq 4f\n"
+    "st1 { v26.b }[2], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v1.b }[2], [x25], #0x1\n"
+    "st1 { v22.b }[2], [x24], #0x1\n"
+    "st1 { v25.b }[2], [x23], #0x1\n"
+    "st1 { v11.b }[2], [x22], #0x1\n"
+    "st1 { v3.b }[2], [x21], #0x1\n"
+    "st1 { v19.b }[2], [x20], #0x1\n"
+    "st1 { v21.b }[2], [x19], #0x1\n"
+    "beq 4f\n"
+    "st1 { v26.b }[3], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v1.b }[3], [x25], #0x1\n"
+    "st1 { v22.b }[3], [x24], #0x1\n"
+    "st1 { v25.b }[3], [x23], #0x1\n"
+    "st1 { v11.b }[3], [x22], #0x1\n"
+    "st1 { v3.b }[3], [x21], #0x1\n"
+    "st1 { v19.b }[3], [x20], #0x1\n"
+    "st1 { v21.b }[3], [x19], #0x1\n"
+    "4:"  // Tail: End
+    "add SP, SP, #0x80\n"
+    : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000..c5e0417
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+{
+  typedef uint32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 2;
+
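+  // The input tile follows from the convolution geometry: input_rows =
+  // (4 - 1) * 1 + 5 = 8 and input_cols = (2 - 1) * 1 + 5 = 6.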
+  constexpr static unsigned int input_rows = 8;
+  constexpr static unsigned int input_cols = 6;
+  constexpr static unsigned int input_col_quads = 1;
+
+  kern_type kernel = a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+
+  a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..8bcd682
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,665 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+  const uint8_t *const *const inptrs,
+  uint8_t *const *const outptrs,
+  const void *params,
+  unsigned int n_output_channels,
+  const arm_gemm::Requantize32& qp
+)
+{
+  __asm__ __volatile__(
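+    // For this 5-tap kernel, v15 (every byte 1) and v14 (one byte per lane)
+    // let each pair of indexed udot instructions sum 4 + 1 = 5 packed input
+    // bytes; the sums are scaled by -b_offset as the requantization fixup.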
+    "movi v15.16b, #0x1\n"
+    "ldr x21, [%x[inptrs], #0x0]\n"
+    "add SP, SP, #-0x80\n"
+    "movi v14.4s, #0x1\n"
+    "ldr x20, [%x[inptrs], #0x8]\n"
+    "add x22, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "movi v28.4s, #0x0\n"
+    "ldr x19, [%x[inptrs], #0x10]\n"
+    "mov x11, #0x0\n"
+    "movi v27.4s, #0x0\n"
+    "ld1 { v13.16b }, [x21]\n"
+    "mov x10, #0x0\n"
+    "movi v26.4s, #0x0\n"
+    "ld1 { v12.16b }, [x20]\n"
+    "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "movi v25.4s, #0x0\n"
+    "ld1 { v7.16b }, [x19]\n"
+    "add x28, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "movi v24.4s, #0x0\n"
+    "ldr x21, [%x[inptrs], #0x18]\n"
+    "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "mov v18.16b, v13.16b\n"
+    "ldr x20, [%x[inptrs], #0x20]\n"
+    "cmp %x[n_channels], #0x4\n"
+    "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+    "ldr x19, [%x[inptrs], #0x28]\n"
+    "mov v17.16b, v12.16b\n"
+    "ld1 { v6.16b }, [x21]\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+    "ld1 { v5.16b }, [x20]\n"
+    "mov v16.16b, v7.16b\n"
+    "ld1 { v4.16b }, [x19]\n"
+    "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+    "ldr x20, [%x[inptrs], #0x30]\n"
+    "zip1 v13.2d, v13.2d, v18.2d\n"
+    "ldr x19, [%x[inptrs], #0x38]\n"
+    "zip1 v12.2d, v12.2d, v17.2d\n"
+    "ld1r { v3.4s }, [x22]\n"
+    "mov v18.16b, v6.16b\n"
+    "ld1 { v2.16b }, [x20]\n"
+    "zip1 v7.2d, v7.2d, v16.2d\n"
+    "ld1 { v1.16b }, [x19]\n"
+    "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+    "ldp x26, x25, [%x[outptrs], #0x0]\n"
+    "mov v17.16b, v5.16b\n"
+    "ldp x24, x23, [%x[outptrs], #0x10]\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+    "ldp x22, x21, [%x[outptrs], #0x20]\n"
+    "mov v16.16b, v4.16b\n"
+    "ldp x20, x19, [%x[outptrs], #0x30]\n"
+    "zip1 v6.2d, v6.2d, v18.2d\n"
+    "ld1r { v0.4s }, [x9]\n"
+    "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+    "ld1r { v31.4s }, [x28]\n"
+    "zip1 v5.2d, v5.2d, v17.2d\n"
+    "ld1r { v30.4s }, [x27]\n"
+    "mov v17.16b, v2.16b\n"
+    "ldr q29, [%x[params], #0x0]\n"
+    "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+    "ldr q8, [%x[params], #0x10]\n"
+    "zip1 v4.2d, v4.2d, v16.2d\n"
+    "ldr q9, [%x[params], #0x20]\n"
+    "mov v16.16b, v1.16b\n"
+    "ldr q10, [%x[params], #0x30]\n"
+    "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+    "ldr q11, [%x[params], #0x40]\n"
+    "add %x[params], %x[params], #0x50\n"
+    "zip1 v2.2d, v2.2d, v17.2d\n"
+    "movi v23.4s, #0x0\n"
+    "movi v22.4s, #0x0\n"
+    "zip1 v1.2d, v1.2d, v16.2d\n"
+    "movi v21.4s, #0x0\n"
+    "movi v18.4s, #0x0\n"
+    "movi v17.4s, #0x0\n"
+    "movi v16.4s, #0x0\n"
+    "movi v20.4s, #0x0\n"
+    "movi v19.4s, #0x0\n"
+    ".inst 0x6f8de1fc  // udot v28.4s, v15.16b, v13.4b[0]\n"
+    ".inst 0x6f8de9fb  // udot v27.4s, v15.16b, v13.4b[2]\n"
+    ".inst 0x6f8ce1fa  // udot v26.4s, v15.16b, v12.4b[0]\n"
+    ".inst 0x6f8ce9f9  // udot v25.4s, v15.16b, v12.4b[2]\n"
+    ".inst 0x6fade1dc  // udot v28.4s, v14.16b, v13.4b[1]\n"
+    ".inst 0x6fade9db  // udot v27.4s, v14.16b, v13.4b[3]\n"
+    ".inst 0x6face1da  // udot v26.4s, v14.16b, v12.4b[1]\n"
+    ".inst 0x6face9d9  // udot v25.4s, v14.16b, v12.4b[3]\n"
+    ".inst 0x6f87e1f8  // udot v24.4s, v15.16b, v7.4b[0]\n"
+    ".inst 0x6f87e9f7  // udot v23.4s, v15.16b, v7.4b[2]\n"
+    ".inst 0x6f86e1f6  // udot v22.4s, v15.16b, v6.4b[0]\n"
+    ".inst 0x6f86e9f5  // udot v21.4s, v15.16b, v6.4b[2]\n"
+    ".inst 0x6fa7e1d8  // udot v24.4s, v14.16b, v7.4b[1]\n"
+    ".inst 0x6fa7e9d7  // udot v23.4s, v14.16b, v7.4b[3]\n"
+    ".inst 0x6fa6e1d6  // udot v22.4s, v14.16b, v6.4b[1]\n"
+    ".inst 0x6fa6e9d5  // udot v21.4s, v14.16b, v6.4b[3]\n"
+    ".inst 0x6f85e1f2  // udot v18.4s, v15.16b, v5.4b[0]\n"
+    ".inst 0x6f85e9f1  // udot v17.4s, v15.16b, v5.4b[2]\n"
+    ".inst 0x6f84e1f0  // udot v16.4s, v15.16b, v4.4b[0]\n"
+    ".inst 0x6f84e9f4  // udot v20.4s, v15.16b, v4.4b[2]\n"
+    ".inst 0x6fa5e1d2  // udot v18.4s, v14.16b, v5.4b[1]\n"
+    ".inst 0x6fa5e9d1  // udot v17.4s, v14.16b, v5.4b[3]\n"
+    ".inst 0x6fa4e1d0  // udot v16.4s, v14.16b, v4.4b[1]\n"
+    ".inst 0x6fa4e9d4  // udot v20.4s, v14.16b, v4.4b[3]\n"
+    ".inst 0x6f82e1f3  // udot v19.4s, v15.16b, v2.4b[0]\n"
+    "mov v28.16b, v28.16b\n"
+    "mov v27.16b, v27.16b\n"
+    "add v28.4s, v28.4s, v26.4s\n"
+    ".inst 0x6fa2e1d3  // udot v19.4s, v14.16b, v2.4b[1]\n"
+    "add v27.4s, v27.4s, v25.4s\n"
+    "add v28.4s, v28.4s, v24.4s\n"
+    "mov v26.16b, v26.16b\n"
+    "add v27.4s, v27.4s, v23.4s\n"
+    "add v28.4s, v28.4s, v22.4s\n"
+    "mov v25.16b, v25.16b\n"
+    "add v27.4s, v27.4s, v21.4s\n"
+    "add v28.4s, v28.4s, v18.4s\n"
+    "add v26.4s, v26.4s, v24.4s\n"
+    "add v27.4s, v27.4s, v17.4s\n"
+    "add v25.4s, v25.4s, v23.4s\n"
+    "add v26.4s, v26.4s, v22.4s\n"
+    "mov v24.16b, v24.16b\n"
+    "add v25.4s, v25.4s, v21.4s\n"
+    "add v26.4s, v26.4s, v18.4s\n"
+    "mov v23.16b, v23.16b\n"
+    "add v25.4s, v25.4s, v17.4s\n"
+    "add v26.4s, v26.4s, v16.4s\n"
+    "add v24.4s, v24.4s, v22.4s\n"
+    "add v25.4s, v25.4s, v20.4s\n"
+    "add v23.4s, v23.4s, v21.4s\n"
+    "add v24.4s, v24.4s, v18.4s\n"
+    "mov v22.16b, v22.16b\n"
+    "add v23.4s, v23.4s, v17.4s\n"
+    "add v24.4s, v24.4s, v16.4s\n"
+    "mov v21.16b, v21.16b\n"
+    "add v23.4s, v23.4s, v20.4s\n"
+    "add v24.4s, v24.4s, v19.4s\n"
+    "add v22.4s, v22.4s, v18.4s\n"
+    "movi v18.4s, #0x0\n"
+    ".inst 0x6f82e9f2  // udot v18.4s, v15.16b, v2.4b[2]\n"
+    "add v21.4s, v21.4s, v17.4s\n"
+    "movi v17.4s, #0x0\n"
+    ".inst 0x6f81e1f1  // udot v17.4s, v15.16b, v1.4b[0]\n"
+    ".inst 0x6fa2e9d2  // udot v18.4s, v14.16b, v2.4b[3]\n"
+    "add v22.4s, v22.4s, v16.4s\n"
+    "movi v16.4s, #0x0\n"
+    ".inst 0x6fa1e1d1  // udot v17.4s, v14.16b, v1.4b[1]\n"
+    ".inst 0x6f81e9f0  // udot v16.4s, v15.16b, v1.4b[2]\n"
+    "add v23.4s, v23.4s, v18.4s\n"
+    "add v21.4s, v21.4s, v20.4s\n"
+    "add v22.4s, v22.4s, v19.4s\n"
+    ".inst 0x6fa1e9d0  // udot v16.4s, v14.16b, v1.4b[3]\n"
+    "add v21.4s, v21.4s, v18.4s\n"
+    "add v22.4s, v22.4s, v17.4s\n"
+    "neg v3.4s, v3.4s\n"
+    "add v21.4s, v21.4s, v16.4s\n"
+    "mul v28.4s, v28.4s, v3.4s\n"
+    "str q28, [SP, #0x0]\n"
+    "mul v27.4s, v27.4s, v3.4s\n"
+    "mul v26.4s, v26.4s, v3.4s\n"
+    "str q27, [SP, #0x10]\n"
+    "mul v25.4s, v25.4s, v3.4s\n"
+    "mul v24.4s, v24.4s, v3.4s\n"
+    "str q26, [SP, #0x20]\n"
+    "mul v23.4s, v23.4s, v3.4s\n"
+    "str q25, [SP, #0x30]\n"
+    "mul v22.4s, v22.4s, v3.4s\n"
+    "mul v21.4s, v21.4s, v3.4s\n"
+    "str q24, [SP, #0x40]\n"
+    "add v28.4s, v28.4s, v29.4s\n"
+    "str q23, [SP, #0x50]\n"
+    "add v27.4s, v27.4s, v29.4s\n"
+    "str q22, [SP, #0x60]\n"
+    "add v26.4s, v26.4s, v29.4s\n"
+    "add v25.4s, v25.4s, v29.4s\n"
+    "str q21, [SP, #0x70]\n"
+    "add v24.4s, v24.4s, v29.4s\n"
+    "add v23.4s, v23.4s, v29.4s\n"
+    "add v22.4s, v22.4s, v29.4s\n"
+    "add v21.4s, v21.4s, v29.4s\n"
+    "ble 2f\n"
+    "1:"  // Loop
+    ".inst 0x6f8de11c  // udot v28.4s, v8.16b, v13.4b[0]\n"
+    "ldr q20, [%x[params], #0x60]\n"
+    "add x11, x11, #0x10\n"
+    ".inst 0x6f8de91b  // udot v27.4s, v8.16b, v13.4b[2]\n"
+    "ldr q19, [%x[params], #0x70]\n"
+    "sub %x[n_channels], %x[n_channels], #0x4\n"
+    ".inst 0x6f8ce11a  // udot v26.4s, v8.16b, v12.4b[0]\n"
+    "ldr q29, [%x[params], #0x80]\n"
+    "cmp %x[n_channels], #0x4\n"
+    ".inst 0x6f8ce919  // udot v25.4s, v8.16b, v12.4b[2]\n"
+    ".inst 0x6f87e118  // udot v24.4s, v8.16b, v7.4b[0]\n"
+    ".inst 0x6f87e917  // udot v23.4s, v8.16b, v7.4b[2]\n"
+    ".inst 0x6f86e116  // udot v22.4s, v8.16b, v6.4b[0]\n"
+    ".inst 0x6f86e915  // udot v21.4s, v8.16b, v6.4b[2]\n"
+    "ldr q8, [%x[params], #0x0]\n"
+    ".inst 0x6fade13c  // udot v28.4s, v9.16b, v13.4b[1]\n"
+    ".inst 0x6fade93b  // udot v27.4s, v9.16b, v13.4b[3]\n"
+    ".inst 0x6face13a  // udot v26.4s, v9.16b, v12.4b[1]\n"
+    ".inst 0x6face939  // udot v25.4s, v9.16b, v12.4b[3]\n"
+    ".inst 0x6fa7e138  // udot v24.4s, v9.16b, v7.4b[1]\n"
+    ".inst 0x6fa7e937  // udot v23.4s, v9.16b, v7.4b[3]\n"
+    ".inst 0x6fa6e136  // udot v22.4s, v9.16b, v6.4b[1]\n"
+    ".inst 0x6fa6e935  // udot v21.4s, v9.16b, v6.4b[3]\n"
+    "ldr q9, [%x[params], #0x10]\n"
+    ".inst 0x6f8ce15c  // udot v28.4s, v10.16b, v12.4b[0]\n"
+    ".inst 0x6f8ce95b  // udot v27.4s, v10.16b, v12.4b[2]\n"
+    ".inst 0x6f87e15a  // udot v26.4s, v10.16b, v7.4b[0]\n"
+    ".inst 0x6f87e959  // udot v25.4s, v10.16b, v7.4b[2]\n"
+    ".inst 0x6f86e158  // udot v24.4s, v10.16b, v6.4b[0]\n"
+    ".inst 0x6f86e957  // udot v23.4s, v10.16b, v6.4b[2]\n"
+    ".inst 0x6f85e156  // udot v22.4s, v10.16b, v5.4b[0]\n"
+    ".inst 0x6f85e955  // udot v21.4s, v10.16b, v5.4b[2]\n"
+    "ldr q10, [%x[params], #0x20]\n"
+    ".inst 0x6face17c  // udot v28.4s, v11.16b, v12.4b[1]\n"
+    ".inst 0x6face97b  // udot v27.4s, v11.16b, v12.4b[3]\n"
+    ".inst 0x6fa7e17a  // udot v26.4s, v11.16b, v7.4b[1]\n"
+    ".inst 0x6fa7e979  // udot v25.4s, v11.16b, v7.4b[3]\n"
+    ".inst 0x6fa6e178  // udot v24.4s, v11.16b, v6.4b[1]\n"
+    ".inst 0x6fa6e977  // udot v23.4s, v11.16b, v6.4b[3]\n"
+    ".inst 0x6fa5e176  // udot v22.4s, v11.16b, v5.4b[1]\n"
+    ".inst 0x6fa5e975  // udot v21.4s, v11.16b, v5.4b[3]\n"
+    "ldr q11, [%x[params], #0x30]\n"
+    ".inst 0x6f87e11c  // udot v28.4s, v8.16b, v7.4b[0]\n"
+    ".inst 0x6f87e91b  // udot v27.4s, v8.16b, v7.4b[2]\n"
+    ".inst 0x6f86e11a  // udot v26.4s, v8.16b, v6.4b[0]\n"
+    ".inst 0x6f86e919  // udot v25.4s, v8.16b, v6.4b[2]\n"
+    ".inst 0x6f85e118  // udot v24.4s, v8.16b, v5.4b[0]\n"
+    ".inst 0x6f85e917  // udot v23.4s, v8.16b, v5.4b[2]\n"
+    ".inst 0x6f84e116  // udot v22.4s, v8.16b, v4.4b[0]\n"
+    ".inst 0x6f84e915  // udot v21.4s, v8.16b, v4.4b[2]\n"
+    "ldr q8, [%x[params], #0x40]\n"
+    ".inst 0x6fa7e13c  // udot v28.4s, v9.16b, v7.4b[1]\n"
+    ".inst 0x6fa7e93b  // udot v27.4s, v9.16b, v7.4b[3]\n"
+    ".inst 0x6fa6e13a  // udot v26.4s, v9.16b, v6.4b[1]\n"
+    ".inst 0x6fa6e939  // udot v25.4s, v9.16b, v6.4b[3]\n"
+    ".inst 0x6fa5e138  // udot v24.4s, v9.16b, v5.4b[1]\n"
+    ".inst 0x6fa5e937  // udot v23.4s, v9.16b, v5.4b[3]\n"
+    ".inst 0x6fa4e136  // udot v22.4s, v9.16b, v4.4b[1]\n"
+    ".inst 0x6fa4e935  // udot v21.4s, v9.16b, v4.4b[3]\n"
+    "ldr q9, [%x[params], #0x50]\n"
+    ".inst 0x6f86e15c  // udot v28.4s, v10.16b, v6.4b[0]\n"
+    ".inst 0x6f86e95b  // udot v27.4s, v10.16b, v6.4b[2]\n"
+    ".inst 0x6f85e15a  // udot v26.4s, v10.16b, v5.4b[0]\n"
+    ".inst 0x6f85e959  // udot v25.4s, v10.16b, v5.4b[2]\n"
+    ".inst 0x6f84e158  // udot v24.4s, v10.16b, v4.4b[0]\n"
+    ".inst 0x6f84e957  // udot v23.4s, v10.16b, v4.4b[2]\n"
+    ".inst 0x6f82e156  // udot v22.4s, v10.16b, v2.4b[0]\n"
+    ".inst 0x6f82e955  // udot v21.4s, v10.16b, v2.4b[2]\n"
+    "ldr q10, [%x[params], #0xb0]\n"
+    ".inst 0x6fa6e17c  // udot v28.4s, v11.16b, v6.4b[1]\n"
+    ".inst 0x6fa6e97b  // udot v27.4s, v11.16b, v6.4b[3]\n"
+    ".inst 0x6fa5e17a  // udot v26.4s, v11.16b, v5.4b[1]\n"
+    ".inst 0x6fa5e979  // udot v25.4s, v11.16b, v5.4b[3]\n"
+    ".inst 0x6fa4e178  // udot v24.4s, v11.16b, v4.4b[1]\n"
+    ".inst 0x6fa4e977  // udot v23.4s, v11.16b, v4.4b[3]\n"
+    ".inst 0x6fa2e176  // udot v22.4s, v11.16b, v2.4b[1]\n"
+    ".inst 0x6fa2e975  // udot v21.4s, v11.16b, v2.4b[3]\n"
+    "ldr q11, [%x[params], #0xc0]\n"
+    ".inst 0x6f85e11c  // udot v28.4s, v8.16b, v5.4b[0]\n"
+    ".inst 0x6f85e91b  // udot v27.4s, v8.16b, v5.4b[2]\n"
+    ".inst 0x6f84e11a  // udot v26.4s, v8.16b, v4.4b[0]\n"
+    ".inst 0x6f84e919  // udot v25.4s, v8.16b, v4.4b[2]\n"
+    ".inst 0x6f82e118  // udot v24.4s, v8.16b, v2.4b[0]\n"
+    ".inst 0x6f82e917  // udot v23.4s, v8.16b, v2.4b[2]\n"
+    ".inst 0x6f81e116  // udot v22.4s, v8.16b, v1.4b[0]\n"
+    ".inst 0x6f81e915  // udot v21.4s, v8.16b, v1.4b[2]\n"
+    "ldr q8, [%x[params], #0x90]\n"
+    ".inst 0x6fa5e13c  // udot v28.4s, v9.16b, v5.4b[1]\n"
+    ".inst 0x6fa5e93b  // udot v27.4s, v9.16b, v5.4b[3]\n"
+    ".inst 0x6fa4e13a  // udot v26.4s, v9.16b, v4.4b[1]\n"
+    ".inst 0x6fa4e939  // udot v25.4s, v9.16b, v4.4b[3]\n"
+    ".inst 0x6fa2e138  // udot v24.4s, v9.16b, v2.4b[1]\n"
+    ".inst 0x6fa2e937  // udot v23.4s, v9.16b, v2.4b[3]\n"
+    ".inst 0x6fa1e136  // udot v22.4s, v9.16b, v1.4b[1]\n"
+    ".inst 0x6fa1e935  // udot v21.4s, v9.16b, v1.4b[3]\n"
+    "ldr q9, [%x[params], #0xa0]\n"
+    "add %x[params], %x[params], #0xd0\n"
+    "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v20.4s\n"
+    "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+    "sqrdmulh v24.4s, v24.4s, v20.4s\n"
+    "and v18.16b, v28.16b, v19.16b\n"
+    "and v17.16b, v27.16b, v19.16b\n"
+    "and v16.16b, v26.16b, v19.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v28.4s, v28.4s, v18.4s\n"
+    "sqadd v27.4s, v27.4s, v17.4s\n"
+    "sqadd v26.4s, v26.4s, v16.4s\n"
+    "and v16.16b, v25.16b, v19.16b\n"
+    "srshl v28.4s, v28.4s, v19.4s\n"
+    "srshl v27.4s, v27.4s, v19.4s\n"
+    "srshl v26.4s, v26.4s, v19.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v28.4s, v28.4s, v0.4s\n"
+    "add v27.4s, v27.4s, v0.4s\n"
+    "add v26.4s, v26.4s, v0.4s\n"
+    "smin v28.4s, v28.4s, v30.4s\n"
+    "smin v27.4s, v27.4s, v30.4s\n"
+    "smin v26.4s, v26.4s, v30.4s\n"
+    "smax v28.4s, v28.4s, v31.4s\n"
+    "smax v27.4s, v27.4s, v31.4s\n"
+    "smax v26.4s, v26.4s, v31.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s28, [x26, x10]\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "ldr q28, [SP, #0x0]\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "str s27, [x25, x10]\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "ldr q27, [SP, #0x10]\n"
+    "and v16.16b, v24.16b, v19.16b\n"
+    "str s26, [x24, x10]\n"
+    "sqrdmulh v23.4s, v23.4s, v20.4s\n"
+    "ldr q26, [SP, #0x20]\n"
+    "srshl v25.4s, v25.4s, v19.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+    "and v17.16b, v23.16b, v19.16b\n"
+    "add v25.4s, v25.4s, v0.4s\n"
+    "sqadd v24.4s, v24.4s, v16.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v25.4s, v25.4s, v30.4s\n"
+    "and v16.16b, v22.16b, v19.16b\n"
+    "srshl v24.4s, v24.4s, v19.4s\n"
+    "smax v25.4s, v25.4s, v31.4s\n"
+    "sqadd v23.4s, v23.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v0.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x23, x10]\n"
+    "smin v24.4s, v24.4s, v30.4s\n"
+    "srshl v23.4s, v23.4s, v19.4s\n"
+    "ldr q25, [SP, #0x30]\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+    "smax v24.4s, v24.4s, v31.4s\n"
+    "add v23.4s, v23.4s, v0.4s\n"
+    "srshl v22.4s, v22.4s, v19.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "smin v23.4s, v23.4s, v30.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x22, x10]\n"
+    "smax v23.4s, v23.4s, v31.4s\n"
+    "add v22.4s, v22.4s, v0.4s\n"
+    "ldr q24, [SP, #0x40]\n"
+    "and v16.16b, v21.16b, v19.16b\n"
+    "add v28.4s, v28.4s, v29.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smin v22.4s, v22.4s, v30.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x21, x10]\n"
+    "smax v22.4s, v22.4s, v31.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr q23, [SP, #0x50]\n"
+    "add v27.4s, v27.4s, v29.4s\n"
+    "add v26.4s, v26.4s, v29.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x20, x10]\n"
+    "add v25.4s, v25.4s, v29.4s\n"
+    "add v24.4s, v24.4s, v29.4s\n"
+    "ldr q22, [SP, #0x60]\n"
+    "srshl v21.4s, v21.4s, v19.4s\n"
+    "add v23.4s, v23.4s, v29.4s\n"
+    "add v21.4s, v21.4s, v0.4s\n"
+    "add v22.4s, v22.4s, v29.4s\n"
+    "smin v21.4s, v21.4s, v30.4s\n"
+    "smax v21.4s, v21.4s, v31.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x19, x10]\n"
+    "add x10, x10, #0x4\n"
+    "ldr q21, [SP, #0x70]\n"
+    "add v21.4s, v21.4s, v29.4s\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    ".inst 0x6f8de11c  // udot v28.4s, v8.16b, v13.4b[0]\n"
+    "ldr q20, [%x[params], #0x60]\n"
+    "add x26, x26, x10\n"
+    ".inst 0x6f8de91b  // udot v27.4s, v8.16b, v13.4b[2]\n"
+    "ldr q19, [%x[params], #0x70]\n"
+    "add x25, x25, x10\n"
+    ".inst 0x6f8ce11a  // udot v26.4s, v8.16b, v12.4b[0]\n"
+    "add x24, x24, x10\n"
+    ".inst 0x6f8ce919  // udot v25.4s, v8.16b, v12.4b[2]\n"
+    "add x23, x23, x10\n"
+    ".inst 0x6f87e118  // udot v24.4s, v8.16b, v7.4b[0]\n"
+    "add x22, x22, x10\n"
+    ".inst 0x6f87e917  // udot v23.4s, v8.16b, v7.4b[2]\n"
+    "add x21, x21, x10\n"
+    ".inst 0x6f86e116  // udot v22.4s, v8.16b, v6.4b[0]\n"
+    "add x20, x20, x10\n"
+    ".inst 0x6f86e915  // udot v21.4s, v8.16b, v6.4b[2]\n"
+    "ldr q8, [%x[params], #0x0]\n"
+    "add x19, x19, x10\n"
+    ".inst 0x6fade13c  // udot v28.4s, v9.16b, v13.4b[1]\n"
+    "cmp %x[n_channels], #0x4\n"
+    ".inst 0x6fade93b  // udot v27.4s, v9.16b, v13.4b[3]\n"
+    ".inst 0x6face13a  // udot v26.4s, v9.16b, v12.4b[1]\n"
+    ".inst 0x6face939  // udot v25.4s, v9.16b, v12.4b[3]\n"
+    ".inst 0x6fa7e138  // udot v24.4s, v9.16b, v7.4b[1]\n"
+    ".inst 0x6fa7e937  // udot v23.4s, v9.16b, v7.4b[3]\n"
+    ".inst 0x6fa6e136  // udot v22.4s, v9.16b, v6.4b[1]\n"
+    ".inst 0x6fa6e935  // udot v21.4s, v9.16b, v6.4b[3]\n"
+    "ldr q9, [%x[params], #0x10]\n"
+    ".inst 0x6f8ce15c  // udot v28.4s, v10.16b, v12.4b[0]\n"
+    ".inst 0x6f8ce95b  // udot v27.4s, v10.16b, v12.4b[2]\n"
+    ".inst 0x6f87e15a  // udot v26.4s, v10.16b, v7.4b[0]\n"
+    ".inst 0x6f87e959  // udot v25.4s, v10.16b, v7.4b[2]\n"
+    ".inst 0x6f86e158  // udot v24.4s, v10.16b, v6.4b[0]\n"
+    ".inst 0x6f86e957  // udot v23.4s, v10.16b, v6.4b[2]\n"
+    ".inst 0x6f85e156  // udot v22.4s, v10.16b, v5.4b[0]\n"
+    ".inst 0x6f85e955  // udot v21.4s, v10.16b, v5.4b[2]\n"
+    "ldr q10, [%x[params], #0x20]\n"
+    ".inst 0x6face17c  // udot v28.4s, v11.16b, v12.4b[1]\n"
+    ".inst 0x6face97b  // udot v27.4s, v11.16b, v12.4b[3]\n"
+    ".inst 0x6fa7e17a  // udot v26.4s, v11.16b, v7.4b[1]\n"
+    ".inst 0x6fa7e979  // udot v25.4s, v11.16b, v7.4b[3]\n"
+    ".inst 0x6fa6e178  // udot v24.4s, v11.16b, v6.4b[1]\n"
+    ".inst 0x6fa6e977  // udot v23.4s, v11.16b, v6.4b[3]\n"
+    ".inst 0x6fa5e176  // udot v22.4s, v11.16b, v5.4b[1]\n"
+    ".inst 0x6fa5e975  // udot v21.4s, v11.16b, v5.4b[3]\n"
+    "ldr q11, [%x[params], #0x30]\n"
+    ".inst 0x6f87e11c  // udot v28.4s, v8.16b, v7.4b[0]\n"
+    ".inst 0x6f87e91b  // udot v27.4s, v8.16b, v7.4b[2]\n"
+    ".inst 0x6f86e11a  // udot v26.4s, v8.16b, v6.4b[0]\n"
+    ".inst 0x6f86e919  // udot v25.4s, v8.16b, v6.4b[2]\n"
+    ".inst 0x6f85e118  // udot v24.4s, v8.16b, v5.4b[0]\n"
+    ".inst 0x6f85e917  // udot v23.4s, v8.16b, v5.4b[2]\n"
+    ".inst 0x6f84e116  // udot v22.4s, v8.16b, v4.4b[0]\n"
+    ".inst 0x6f84e915  // udot v21.4s, v8.16b, v4.4b[2]\n"
+    "ldr q8, [%x[params], #0x40]\n"
+    ".inst 0x6fa7e13c  // udot v28.4s, v9.16b, v7.4b[1]\n"
+    ".inst 0x6fa7e93b  // udot v27.4s, v9.16b, v7.4b[3]\n"
+    ".inst 0x6fa6e13a  // udot v26.4s, v9.16b, v6.4b[1]\n"
+    ".inst 0x6fa6e939  // udot v25.4s, v9.16b, v6.4b[3]\n"
+    ".inst 0x6fa5e138  // udot v24.4s, v9.16b, v5.4b[1]\n"
+    ".inst 0x6fa5e937  // udot v23.4s, v9.16b, v5.4b[3]\n"
+    ".inst 0x6fa4e136  // udot v22.4s, v9.16b, v4.4b[1]\n"
+    ".inst 0x6fa4e935  // udot v21.4s, v9.16b, v4.4b[3]\n"
+    "ldr q9, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x80\n"
+    ".inst 0x6f86e15c  // udot v28.4s, v10.16b, v6.4b[0]\n"
+    ".inst 0x6f86e95b  // udot v27.4s, v10.16b, v6.4b[2]\n"
+    ".inst 0x6f85e15a  // udot v26.4s, v10.16b, v5.4b[0]\n"
+    ".inst 0x6f85e959  // udot v25.4s, v10.16b, v5.4b[2]\n"
+    ".inst 0x6f84e158  // udot v24.4s, v10.16b, v4.4b[0]\n"
+    ".inst 0x6f84e957  // udot v23.4s, v10.16b, v4.4b[2]\n"
+    ".inst 0x6f82e156  // udot v22.4s, v10.16b, v2.4b[0]\n"
+    ".inst 0x6f82e955  // udot v21.4s, v10.16b, v2.4b[2]\n"
+    ".inst 0x6fa6e17c  // udot v28.4s, v11.16b, v6.4b[1]\n"
+    ".inst 0x6fa6e97b  // udot v27.4s, v11.16b, v6.4b[3]\n"
+    ".inst 0x6fa5e17a  // udot v26.4s, v11.16b, v5.4b[1]\n"
+    ".inst 0x6fa5e979  // udot v25.4s, v11.16b, v5.4b[3]\n"
+    ".inst 0x6fa4e178  // udot v24.4s, v11.16b, v4.4b[1]\n"
+    ".inst 0x6fa4e977  // udot v23.4s, v11.16b, v4.4b[3]\n"
+    ".inst 0x6fa2e176  // udot v22.4s, v11.16b, v2.4b[1]\n"
+    ".inst 0x6fa2e975  // udot v21.4s, v11.16b, v2.4b[3]\n"
+    ".inst 0x6f85e11c  // udot v28.4s, v8.16b, v5.4b[0]\n"
+    ".inst 0x6f85e91b  // udot v27.4s, v8.16b, v5.4b[2]\n"
+    ".inst 0x6f84e11a  // udot v26.4s, v8.16b, v4.4b[0]\n"
+    ".inst 0x6f84e919  // udot v25.4s, v8.16b, v4.4b[2]\n"
+    ".inst 0x6f82e118  // udot v24.4s, v8.16b, v2.4b[0]\n"
+    ".inst 0x6f82e917  // udot v23.4s, v8.16b, v2.4b[2]\n"
+    ".inst 0x6f81e116  // udot v22.4s, v8.16b, v1.4b[0]\n"
+    ".inst 0x6f81e915  // udot v21.4s, v8.16b, v1.4b[2]\n"
+    ".inst 0x6fa5e13c  // udot v28.4s, v9.16b, v5.4b[1]\n"
+    ".inst 0x6fa5e93b  // udot v27.4s, v9.16b, v5.4b[3]\n"
+    ".inst 0x6fa4e13a  // udot v26.4s, v9.16b, v4.4b[1]\n"
+    ".inst 0x6fa4e939  // udot v25.4s, v9.16b, v4.4b[3]\n"
+    ".inst 0x6fa2e138  // udot v24.4s, v9.16b, v2.4b[1]\n"
+    ".inst 0x6fa2e937  // udot v23.4s, v9.16b, v2.4b[3]\n"
+    ".inst 0x6fa1e136  // udot v22.4s, v9.16b, v1.4b[1]\n"
+    ".inst 0x6fa1e935  // udot v21.4s, v9.16b, v1.4b[3]\n"
+    "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v20.4s\n"
+    "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v20.4s\n"
+    "and v18.16b, v28.16b, v19.16b\n"
+    "and v17.16b, v27.16b, v19.16b\n"
+    "and v16.16b, v26.16b, v19.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v28.4s, v28.4s, v18.4s\n"
+    "sqadd v27.4s, v27.4s, v17.4s\n"
+    "sqadd v26.4s, v26.4s, v16.4s\n"
+    "and v16.16b, v25.16b, v19.16b\n"
+    "srshl v28.4s, v28.4s, v19.4s\n"
+    "srshl v27.4s, v27.4s, v19.4s\n"
+    "srshl v26.4s, v26.4s, v19.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v28.4s, v28.4s, v0.4s\n"
+    "add v27.4s, v27.4s, v0.4s\n"
+    "add v26.4s, v26.4s, v0.4s\n"
+    "smin v28.4s, v28.4s, v30.4s\n"
+    "smin v27.4s, v27.4s, v30.4s\n"
+    "smin v26.4s, v26.4s, v30.4s\n"
+    "smax v28.4s, v28.4s, v31.4s\n"
+    "smax v27.4s, v27.4s, v31.4s\n"
+    "smax v26.4s, v26.4s, v31.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "sqrdmulh v24.4s, v24.4s, v20.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v20.4s\n"
+    "srshl v25.4s, v25.4s, v19.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+    "and v16.16b, v24.16b, v19.16b\n"
+    "and v17.16b, v23.16b, v19.16b\n"
+    "add v25.4s, v25.4s, v0.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smin v25.4s, v25.4s, v30.4s\n"
+    "sqadd v24.4s, v24.4s, v16.4s\n"
+    "sqadd v23.4s, v23.4s, v17.4s\n"
+    "smax v25.4s, v25.4s, v31.4s\n"
+    "and v16.16b, v22.16b, v19.16b\n"
+    "srshl v24.4s, v24.4s, v19.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "srshl v23.4s, v23.4s, v19.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v0.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v23.4s, v23.4s, v0.4s\n"
+    "smin v24.4s, v24.4s, v30.4s\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "smin v23.4s, v23.4s, v30.4s\n"
+    "smax v24.4s, v24.4s, v31.4s\n"
+    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+    "smax v23.4s, v23.4s, v31.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "srshl v22.4s, v22.4s, v19.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "and v16.16b, v21.16b, v19.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "add v22.4s, v22.4s, v0.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v22.4s, v22.4s, v30.4s\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "smax v22.4s, v22.4s, v31.4s\n"
+    "srshl v21.4s, v21.4s, v19.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "add v21.4s, v21.4s, v0.4s\n"
+    "smin v21.4s, v21.4s, v30.4s\n"
+    "smax v21.4s, v21.4s, v31.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "blt 3f\n"
+    "str s28, [x26, #0x0]\n"
+    "str s27, [x25, #0x0]\n"
+    "str s26, [x24, #0x0]\n"
+    "str s25, [x23, #0x0]\n"
+    "str s24, [x22, #0x0]\n"
+    "str s23, [x21, #0x0]\n"
+    "str s22, [x20, #0x0]\n"
+    "str s21, [x19, #0x0]\n"
+    "b 4f\n"
+    "3:"  // Tail: Oddments
+    "st1 { v28.b }[0], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v27.b }[0], [x25], #0x1\n"
+    "st1 { v26.b }[0], [x24], #0x1\n"
+    "st1 { v25.b }[0], [x23], #0x1\n"
+    "st1 { v24.b }[0], [x22], #0x1\n"
+    "st1 { v23.b }[0], [x21], #0x1\n"
+    "st1 { v22.b }[0], [x20], #0x1\n"
+    "st1 { v21.b }[0], [x19], #0x1\n"
+    "beq 4f\n"
+    "st1 { v28.b }[1], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v27.b }[1], [x25], #0x1\n"
+    "st1 { v26.b }[1], [x24], #0x1\n"
+    "st1 { v25.b }[1], [x23], #0x1\n"
+    "st1 { v24.b }[1], [x22], #0x1\n"
+    "st1 { v23.b }[1], [x21], #0x1\n"
+    "st1 { v22.b }[1], [x20], #0x1\n"
+    "st1 { v21.b }[1], [x19], #0x1\n"
+    "beq 4f\n"
+    "st1 { v28.b }[2], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v27.b }[2], [x25], #0x1\n"
+    "st1 { v26.b }[2], [x24], #0x1\n"
+    "st1 { v25.b }[2], [x23], #0x1\n"
+    "st1 { v24.b }[2], [x22], #0x1\n"
+    "st1 { v23.b }[2], [x21], #0x1\n"
+    "st1 { v22.b }[2], [x20], #0x1\n"
+    "st1 { v21.b }[2], [x19], #0x1\n"
+    "beq 4f\n"
+    "st1 { v28.b }[3], [x26], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v27.b }[3], [x25], #0x1\n"
+    "st1 { v26.b }[3], [x24], #0x1\n"
+    "st1 { v25.b }[3], [x23], #0x1\n"
+    "st1 { v24.b }[3], [x22], #0x1\n"
+    "st1 { v23.b }[3], [x21], #0x1\n"
+    "st1 { v22.b }[3], [x20], #0x1\n"
+    "st1 { v21.b }[3], [x19], #0x1\n"
+    "4:"  // Tail: End
+    "add SP, SP, #0x80\n"
+    : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000..6b52017
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
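+// Strategy descriptor consumed by the depthwise driver. It records the
+// element types (uint8_t input, weights and output with int32_t bias), the
+// 2x8 tile of output points produced per kernel invocation, and the entry
+// point declared above. VLType::None marks this as a fixed-width (128-bit
+// NEON) implementation rather than an SVE one.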
+struct a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int output_rows(void) { return 2; }
+  constexpr static unsigned int output_cols(void) { return 8; }
+
+  constexpr static unsigned int output_col_regs(void) { return 2; }
+
+  kern_type kernel = a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+  a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..ada1818
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1484 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+  const uint8_t *const *const inptrs,
+  uint8_t *const *const outptrs,
+  const uint8_t *weights,
+  const int32_t *bias,
+  const unsigned int kernel_points,
+  const unsigned int n_output_channels,
+  const int32_t *per_channel_left_shifts,
+  const int32_t *per_channel_muls,
+  const int32_t *per_channel_right_shifts,
+  const arm_gemm::Requantize32& qp
+)
+{
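+  // Layout of the routine: the per-layer requantization parameters are
+  // loaded once up front (v14/v13 clamp bounds, v12/v11 input/weight zero
+  // points, v10 output offset, v9/v8/v7 left shift, multiplier and right
+  // shift); the main loop then produces four output channels per iteration
+  // for a 2x8 tile of output points (sixteen int32 accumulators), before a
+  // final pass handles any channel count that is not a multiple of four.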
+  __asm__ __volatile__(
+    "mov x9, #0x0\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ld1r { v14.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v13.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+    "ld1r { v12.16b }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v11.16b }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v10.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+    "ld1r { v9.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+    "ld1r { v8.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+    "ld1r { v7.4s }, [x19]\n"
+    "lsr x28, %x[n_output_channels], #0x2\n"
+    "cbz x28, 9f\n"
+    "1:"  // Output channel loop
+    "movi v16.4s, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "lsl x19, x9, #0x2\n"
+    "ldr q16, [%x[bias], x19]\n"
+    "2:"  // Output channel loop: Load bias: Done
+    "mov v6.16b, v16.16b\n"
+    "mov v5.16b, v16.16b\n"
+    "mov v4.16b, v16.16b\n"
+    "mov v31.16b, v16.16b\n"
+    "mov v30.16b, v16.16b\n"
+    "mov v29.16b, v16.16b\n"
+    "mov v28.16b, v16.16b\n"
+    "mov v27.16b, v16.16b\n"
+    "mov v26.16b, v16.16b\n"
+    "mov v25.16b, v16.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "mov v23.16b, v16.16b\n"
+    "mov v22.16b, v16.16b\n"
+    "mov v21.16b, v16.16b\n"
+    "mov v20.16b, v16.16b\n"
+    "mov v19.16b, v16.16b\n"
+    "cbz %x[rq_mul_ptr], 3f\n"
+    "lsl x19, x9, #0x2\n"
+    "ldr q8, [%x[rq_mul_ptr], x19]\n"
+    "ldr q7, [%x[rq_right_shift_ptr], x19]\n"
+    "cbz %x[rq_left_shift_ptr], 3f\n"
+    "ldr q9, [%x[rq_left_shift_ptr], x19]\n"
+    "3:"  // Output channel loop: Load quantization parameters: Done
+    "ldr s17, [%x[weights]], #0x4\n"
+    "usubl v17.8h, v17.8b, v11.8b\n"
+    "mov x19, %x[inptrs]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "lsr x20, %x[kernel_points], #0x1\n"
+    "ldr d3, [x25, #0x0]\n"
+    "usubl v3.8h, v3.8b, v12.8b\n"
+    "ldr d2, [x27, #0x0]\n"
+    "usubl v2.8h, v2.8b, v12.8b\n"
+    "cbz x20, 7f\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "ldr s16, [%x[weights]], #0x4\n"
+    "usubl v16.8h, v16.8b, v11.8b\n"
+    "ldr d1, [x25, #0x0]\n"
+    "subs x20, x20, #0x1\n"
+    "usubl v1.8h, v1.8b, v12.8b\n"
+    "ldr d0, [x27, #0x0]\n"
+    "usubl v0.8h, v0.8b, v12.8b\n"
+    "beq 5f\n"
+    "4:"  // Output channel loop: Kernel loop
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "subs x20, x20, #0x1\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr d3, [x25, #0x0]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "ldr d2, [x27, #0x0]\n"
+    "usubl v3.8h, v3.8b, v12.8b\n"
+    "ldr s17, [%x[weights]], #0x4\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "usubl v2.8h, v2.8b, v12.8b\n"
+    "usubl v17.8h, v17.8b, v11.8b\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "ldr d1, [x25, #0x0]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "ldr d0, [x27, #0x0]\n"
+    "usubl v1.8h, v1.8b, v12.8b\n"
+    "ldr s16, [%x[weights]], #0x4\n"
+    "usubl v0.8h, v0.8b, v12.8b\n"
+    "usubl v16.8h, v16.8b, v11.8b\n"
+    "bgt 4b\n"
+    "5:"  // Output channel loop: Kernel loop tail
+    "tbnz %x[kernel_points], #0, 6f\n"
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "sshl v6.4s, v6.4s, v9.4s\n"
+    "sshl v5.4s, v5.4s, v9.4s\n"
+    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+    "sshl v4.4s, v4.4s, v9.4s\n"
+    "sshl v31.4s, v31.4s, v9.4s\n"
+    "and v18.16b, v6.16b, v7.16b\n"
+    "and v16.16b, v5.16b, v7.16b\n"
+    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+    "sqadd v6.4s, v6.4s, v18.4s\n"
+    "sqadd v5.4s, v5.4s, v16.4s\n"
+    "and v17.16b, v4.16b, v7.16b\n"
+    "and v16.16b, v31.16b, v7.16b\n"
+    "srshl v6.4s, v6.4s, v7.4s\n"
+    "srshl v5.4s, v5.4s, v7.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v6.4s, v6.4s, v10.4s\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "sqadd v4.4s, v4.4s, v17.4s\n"
+    "smin v6.4s, v6.4s, v13.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "smax v6.4s, v6.4s, v14.4s\n"
+    "smax v5.4s, v5.4s, v14.4s\n"
+    "srshl v4.4s, v4.4s, v7.4s\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "str s6, [x19, x9]\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "add v4.4s, v4.4s, v10.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "srshl v31.4s, v31.4s, v7.4s\n"
+    "str s5, [x20, x9]\n"
+    "sshl v30.4s, v30.4s, v9.4s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "smin v4.4s, v4.4s, v13.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+    "add v31.4s, v31.4s, v10.4s\n"
+    "smax v4.4s, v4.4s, v14.4s\n"
+    "sshl v29.4s, v29.4s, v9.4s\n"
+    "smin v31.4s, v31.4s, v13.4s\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "and v16.16b, v30.16b, v7.16b\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "str s4, [x21, x9]\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+    "sshl v28.4s, v28.4s, v9.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s31, [x22, x9]\n"
+    "and v17.16b, v29.16b, v7.16b\n"
+    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "srshl v30.4s, v30.4s, v7.4s\n"
+    "sshl v27.4s, v27.4s, v9.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v28.16b, v7.16b\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sqadd v29.4s, v29.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v13.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+    "srshl v29.4s, v29.4s, v7.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "sqadd v28.4s, v28.4s, v16.4s\n"
+    "and v16.16b, v27.16b, v7.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v29.4s, v29.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x23, x9]\n"
+    "smin v29.4s, v29.4s, v13.4s\n"
+    "srshl v28.4s, v28.4s, v7.4s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v26.4s, v26.4s, v9.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "add v28.4s, v28.4s, v10.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "smin v28.4s, v28.4s, v13.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "str s29, [x24, x9]\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "srshl v27.4s, v27.4s, v7.4s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+    "sshl v25.4s, v25.4s, v9.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "add v27.4s, v27.4s, v10.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s28, [x25, x9]\n"
+    "smin v27.4s, v27.4s, v13.4s\n"
+    "and v17.16b, v26.16b, v7.16b\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+    "sshl v24.4s, v24.4s, v9.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v25.16b, v7.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s27, [x26, x9]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "srshl v26.4s, v26.4s, v7.4s\n"
+    "sshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v17.16b, v24.16b, v7.16b\n"
+    "add v26.4s, v26.4s, v10.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+    "srshl v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v13.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v23.16b, v7.16b\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "add v25.4s, v25.4s, v10.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "smin v25.4s, v25.4s, v13.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x19, x9]\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "srshl v24.4s, v24.4s, v7.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v22.4s, v22.4s, v9.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v10.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x20, x9]\n"
+    "smin v24.4s, v24.4s, v13.4s\n"
+    "sqadd v23.4s, v23.4s, v16.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+    "sshl v21.4s, v21.4s, v9.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "srshl v23.4s, v23.4s, v7.4s\n"
+    "and v17.16b, v22.16b, v7.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x21, x9]\n"
+    "add v23.4s, v23.4s, v10.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v21.16b, v7.16b\n"
+    "sshl v20.4s, v20.4s, v9.4s\n"
+    "smin v23.4s, v23.4s, v13.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v23.4s, v23.4s, v14.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+    "srshl v22.4s, v22.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x22, x9]\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "and v16.16b, v20.16b, v7.16b\n"
+    "srshl v21.4s, v21.4s, v7.4s\n"
+    "sshl v19.4s, v19.4s, v9.4s\n"
+    "smin v22.4s, v22.4s, v13.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "smax v22.4s, v22.4s, v14.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x9]\n"
+    "smax v21.4s, v21.4s, v14.4s\n"
+    "srshl v20.4s, v20.4s, v7.4s\n"
+    "and v16.16b, v19.16b, v7.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x24, x9]\n"
+    "smin v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v14.4s\n"
+    "srshl v19.4s, v19.4s, v7.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x25, x9]\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "smin v19.4s, v19.4s, v13.4s\n"
+    "smax v19.4s, v19.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x26, x9]\n"
+    "b 8f\n"
+    "6:"  // Output channel loop: Odd tail
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr d3, [x25, #0x0]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "ldr d2, [x27, #0x0]\n"
+    "usubl v3.8h, v3.8b, v12.8b\n"
+    "ldr s17, [%x[weights]], #0x4\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "usubl v2.8h, v2.8b, v12.8b\n"
+    "usubl v17.8h, v17.8b, v11.8b\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "sshl v6.4s, v6.4s, v9.4s\n"
+    "sshl v5.4s, v5.4s, v9.4s\n"
+    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+    "sshl v4.4s, v4.4s, v9.4s\n"
+    "sshl v31.4s, v31.4s, v9.4s\n"
+    "and v18.16b, v6.16b, v7.16b\n"
+    "and v16.16b, v5.16b, v7.16b\n"
+    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+    "sqadd v6.4s, v6.4s, v18.4s\n"
+    "sqadd v5.4s, v5.4s, v16.4s\n"
+    "and v17.16b, v4.16b, v7.16b\n"
+    "and v16.16b, v31.16b, v7.16b\n"
+    "srshl v6.4s, v6.4s, v7.4s\n"
+    "srshl v5.4s, v5.4s, v7.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v6.4s, v6.4s, v10.4s\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "sqadd v4.4s, v4.4s, v17.4s\n"
+    "smin v6.4s, v6.4s, v13.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "smax v6.4s, v6.4s, v14.4s\n"
+    "smax v5.4s, v5.4s, v14.4s\n"
+    "srshl v4.4s, v4.4s, v7.4s\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "str s6, [x19, x9]\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "add v4.4s, v4.4s, v10.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "srshl v31.4s, v31.4s, v7.4s\n"
+    "str s5, [x20, x9]\n"
+    "sshl v30.4s, v30.4s, v9.4s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "smin v4.4s, v4.4s, v13.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+    "add v31.4s, v31.4s, v10.4s\n"
+    "smax v4.4s, v4.4s, v14.4s\n"
+    "sshl v29.4s, v29.4s, v9.4s\n"
+    "smin v31.4s, v31.4s, v13.4s\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "and v16.16b, v30.16b, v7.16b\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "str s4, [x21, x9]\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+    "sshl v28.4s, v28.4s, v9.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s31, [x22, x9]\n"
+    "and v17.16b, v29.16b, v7.16b\n"
+    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "srshl v30.4s, v30.4s, v7.4s\n"
+    "sshl v27.4s, v27.4s, v9.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v28.16b, v7.16b\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sqadd v29.4s, v29.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v13.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+    "srshl v29.4s, v29.4s, v7.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "sqadd v28.4s, v28.4s, v16.4s\n"
+    "and v16.16b, v27.16b, v7.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v29.4s, v29.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x23, x9]\n"
+    "smin v29.4s, v29.4s, v13.4s\n"
+    "srshl v28.4s, v28.4s, v7.4s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v26.4s, v26.4s, v9.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "add v28.4s, v28.4s, v10.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "smin v28.4s, v28.4s, v13.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "str s29, [x24, x9]\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "srshl v27.4s, v27.4s, v7.4s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+    "sshl v25.4s, v25.4s, v9.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "add v27.4s, v27.4s, v10.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s28, [x25, x9]\n"
+    "smin v27.4s, v27.4s, v13.4s\n"
+    "and v17.16b, v26.16b, v7.16b\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+    "sshl v24.4s, v24.4s, v9.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v25.16b, v7.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s27, [x26, x9]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "srshl v26.4s, v26.4s, v7.4s\n"
+    "sshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v17.16b, v24.16b, v7.16b\n"
+    "add v26.4s, v26.4s, v10.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+    "srshl v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v13.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v23.16b, v7.16b\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "add v25.4s, v25.4s, v10.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "smin v25.4s, v25.4s, v13.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x19, x9]\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "srshl v24.4s, v24.4s, v7.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v22.4s, v22.4s, v9.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v10.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x20, x9]\n"
+    "smin v24.4s, v24.4s, v13.4s\n"
+    "sqadd v23.4s, v23.4s, v16.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+    "sshl v21.4s, v21.4s, v9.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "srshl v23.4s, v23.4s, v7.4s\n"
+    "and v17.16b, v22.16b, v7.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x21, x9]\n"
+    "add v23.4s, v23.4s, v10.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v21.16b, v7.16b\n"
+    "sshl v20.4s, v20.4s, v9.4s\n"
+    "smin v23.4s, v23.4s, v13.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v23.4s, v23.4s, v14.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+    "srshl v22.4s, v22.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x22, x9]\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "and v16.16b, v20.16b, v7.16b\n"
+    "srshl v21.4s, v21.4s, v7.4s\n"
+    "sshl v19.4s, v19.4s, v9.4s\n"
+    "smin v22.4s, v22.4s, v13.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "smax v22.4s, v22.4s, v14.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x9]\n"
+    "smax v21.4s, v21.4s, v14.4s\n"
+    "srshl v20.4s, v20.4s, v7.4s\n"
+    "and v16.16b, v19.16b, v7.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x24, x9]\n"
+    "smin v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v14.4s\n"
+    "srshl v19.4s, v19.4s, v7.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x25, x9]\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "smin v19.4s, v19.4s, v13.4s\n"
+    "smax v19.4s, v19.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x26, x9]\n"
+    "b 8f\n"
+    "7:"  // Output channel loop: Single kernel point
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "sshl v6.4s, v6.4s, v9.4s\n"
+    "sshl v5.4s, v5.4s, v9.4s\n"
+    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+    "sshl v4.4s, v4.4s, v9.4s\n"
+    "sshl v31.4s, v31.4s, v9.4s\n"
+    "and v18.16b, v6.16b, v7.16b\n"
+    "and v16.16b, v5.16b, v7.16b\n"
+    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+    "sqadd v6.4s, v6.4s, v18.4s\n"
+    "sqadd v5.4s, v5.4s, v16.4s\n"
+    "and v17.16b, v4.16b, v7.16b\n"
+    "and v16.16b, v31.16b, v7.16b\n"
+    "srshl v6.4s, v6.4s, v7.4s\n"
+    "srshl v5.4s, v5.4s, v7.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v6.4s, v6.4s, v10.4s\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "sqadd v4.4s, v4.4s, v17.4s\n"
+    "smin v6.4s, v6.4s, v13.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "smax v6.4s, v6.4s, v14.4s\n"
+    "smax v5.4s, v5.4s, v14.4s\n"
+    "srshl v4.4s, v4.4s, v7.4s\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "str s6, [x19, x9]\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "add v4.4s, v4.4s, v10.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "srshl v31.4s, v31.4s, v7.4s\n"
+    "str s5, [x20, x9]\n"
+    "sshl v30.4s, v30.4s, v9.4s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "smin v4.4s, v4.4s, v13.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+    "add v31.4s, v31.4s, v10.4s\n"
+    "smax v4.4s, v4.4s, v14.4s\n"
+    "sshl v29.4s, v29.4s, v9.4s\n"
+    "smin v31.4s, v31.4s, v13.4s\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "and v16.16b, v30.16b, v7.16b\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "str s4, [x21, x9]\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+    "sshl v28.4s, v28.4s, v9.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s31, [x22, x9]\n"
+    "and v17.16b, v29.16b, v7.16b\n"
+    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "srshl v30.4s, v30.4s, v7.4s\n"
+    "sshl v27.4s, v27.4s, v9.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v28.16b, v7.16b\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sqadd v29.4s, v29.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v13.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+    "srshl v29.4s, v29.4s, v7.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "sqadd v28.4s, v28.4s, v16.4s\n"
+    "and v16.16b, v27.16b, v7.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v29.4s, v29.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x23, x9]\n"
+    "smin v29.4s, v29.4s, v13.4s\n"
+    "srshl v28.4s, v28.4s, v7.4s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v26.4s, v26.4s, v9.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "add v28.4s, v28.4s, v10.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "smin v28.4s, v28.4s, v13.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "str s29, [x24, x9]\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "srshl v27.4s, v27.4s, v7.4s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+    "sshl v25.4s, v25.4s, v9.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "add v27.4s, v27.4s, v10.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s28, [x25, x9]\n"
+    "smin v27.4s, v27.4s, v13.4s\n"
+    "and v17.16b, v26.16b, v7.16b\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+    "sshl v24.4s, v24.4s, v9.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v25.16b, v7.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s27, [x26, x9]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "srshl v26.4s, v26.4s, v7.4s\n"
+    "sshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v17.16b, v24.16b, v7.16b\n"
+    "add v26.4s, v26.4s, v10.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+    "srshl v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v13.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v23.16b, v7.16b\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "add v25.4s, v25.4s, v10.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "smin v25.4s, v25.4s, v13.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x19, x9]\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "srshl v24.4s, v24.4s, v7.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v22.4s, v22.4s, v9.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v10.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x20, x9]\n"
+    "smin v24.4s, v24.4s, v13.4s\n"
+    "sqadd v23.4s, v23.4s, v16.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+    "sshl v21.4s, v21.4s, v9.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "srshl v23.4s, v23.4s, v7.4s\n"
+    "and v17.16b, v22.16b, v7.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x21, x9]\n"
+    "add v23.4s, v23.4s, v10.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v21.16b, v7.16b\n"
+    "sshl v20.4s, v20.4s, v9.4s\n"
+    "smin v23.4s, v23.4s, v13.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v23.4s, v23.4s, v14.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+    "srshl v22.4s, v22.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x22, x9]\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "and v16.16b, v20.16b, v7.16b\n"
+    "srshl v21.4s, v21.4s, v7.4s\n"
+    "sshl v19.4s, v19.4s, v9.4s\n"
+    "smin v22.4s, v22.4s, v13.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "smax v22.4s, v22.4s, v14.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x9]\n"
+    "smax v21.4s, v21.4s, v14.4s\n"
+    "srshl v20.4s, v20.4s, v7.4s\n"
+    "and v16.16b, v19.16b, v7.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x24, x9]\n"
+    "smin v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v14.4s\n"
+    "srshl v19.4s, v19.4s, v7.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x25, x9]\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "smin v19.4s, v19.4s, v13.4s\n"
+    "smax v19.4s, v19.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x26, x9]\n"
+    "8:"  // Output channel loop: Done
+    "add x9, x9, #0x4\n"
+    "cmp x9, x28, LSL #2\n"
+    "blt 1b\n"
+    "tst %x[n_output_channels], #0x3\n"
+    "beq 26f\n"
+    "9:"  // Output channel oddments
+    "movi v16.4s, #0x0\n"
+    "cbz %x[bias], 12f\n"
+    "add x19, %x[bias], x9, LSL #2\n"
+    "tbz %x[n_output_channels], #1, 10f\n"
+    "ld1 { v16.d }[0], [x19], #0x8\n"
+    "tbz %x[n_output_channels], #0, 11f\n"
+    "ld1 { v16.s }[2], [x19]\n"
+    "b 11f\n"
+    "10:"  // Output channel oddments: Load bias: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 11f\n"
+    "ld1 { v16.s }[0], [x19]\n"
+    "11:"  // Output channel oddments: Load bias: Bit 1: End
+
+    "12:"  // Output channel oddments: Load bias: Done
+    "mov v6.16b, v16.16b\n"
+    "mov v5.16b, v16.16b\n"
+    "mov v4.16b, v16.16b\n"
+    "mov v31.16b, v16.16b\n"
+    "mov v30.16b, v16.16b\n"
+    "mov v29.16b, v16.16b\n"
+    "mov v28.16b, v16.16b\n"
+    "mov v27.16b, v16.16b\n"
+    "mov v26.16b, v16.16b\n"
+    "mov v25.16b, v16.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "mov v23.16b, v16.16b\n"
+    "mov v22.16b, v16.16b\n"
+    "mov v21.16b, v16.16b\n"
+    "mov v20.16b, v16.16b\n"
+    "mov v19.16b, v16.16b\n"
+    "cbz %x[rq_mul_ptr], 18f\n"
+    "add x21, %x[rq_mul_ptr], x9, LSL #2\n"
+    "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n"
+    "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n"
+    "cbz %x[rq_left_shift_ptr], 15f\n"
+    "tbz %x[n_output_channels], #1, 13f\n"
+    "ld1 { v8.d }[0], [x21], #0x8\n"
+    "ld1 { v7.d }[0], [x20], #0x8\n"
+    "ld1 { v9.d }[0], [x19], #0x8\n"
+    "tbz %x[n_output_channels], #0, 14f\n"
+    "ld1 { v8.s }[2], [x21], #0x4\n"
+    "ld1 { v7.s }[2], [x20], #0x4\n"
+    "ld1 { v9.s }[2], [x19], #0x4\n"
+    "b 14f\n"
+    "13:"  // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 14f\n"
+    "ld1 { v8.s }[0], [x21], #0x4\n"
+    "ld1 { v7.s }[0], [x20], #0x4\n"
+    "ld1 { v9.s }[0], [x19], #0x4\n"
+    "14:"  // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+    "b 18f\n"
+    "15:"  // Output channel oddments: Load quantization parameters: No left shift
+    "tbz %x[n_output_channels], #1, 16f\n"
+    "ld1 { v8.d }[0], [x21], #0x8\n"
+    "ld1 { v7.d }[0], [x20], #0x8\n"
+    "tbz %x[n_output_channels], #0, 17f\n"
+    "ld1 { v8.s }[2], [x21], #0x4\n"
+    "ld1 { v7.s }[2], [x20], #0x4\n"
+    "b 17f\n"
+    "16:"  // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 17f\n"
+    "ld1 { v8.s }[0], [x21], #0x4\n"
+    "ld1 { v7.s }[0], [x20], #0x4\n"
+    "17:"  // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+
+    "18:"  // Output channel oddments: Load quantization parameters: Done
+    "ldr s17, [%x[weights]], #0x4\n"
+    "usubl v17.8h, v17.8b, v11.8b\n"
+    "mov x19, %x[inptrs]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "lsr x20, %x[kernel_points], #0x1\n"
+    "ldr d3, [x25, #0x0]\n"
+    "usubl v3.8h, v3.8b, v12.8b\n"
+    "ldr d2, [x27, #0x0]\n"
+    "usubl v2.8h, v2.8b, v12.8b\n"
+    "cbz x20, 22f\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "ldr s16, [%x[weights]], #0x4\n"
+    "usubl v16.8h, v16.8b, v11.8b\n"
+    "ldr d1, [x25, #0x0]\n"
+    "subs x20, x20, #0x1\n"
+    "usubl v1.8h, v1.8b, v12.8b\n"
+    "ldr d0, [x27, #0x0]\n"
+    "usubl v0.8h, v0.8b, v12.8b\n"
+    "beq 20f\n"
+    "19:"  // Output channel oddments: Kernel loop
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "subs x20, x20, #0x1\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr d3, [x25, #0x0]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "ldr d2, [x27, #0x0]\n"
+    "usubl v3.8h, v3.8b, v12.8b\n"
+    "ldr s17, [%x[weights]], #0x4\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "usubl v2.8h, v2.8b, v12.8b\n"
+    "usubl v17.8h, v17.8b, v11.8b\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "ldr d1, [x25, #0x0]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "ldr d0, [x27, #0x0]\n"
+    "usubl v1.8h, v1.8b, v12.8b\n"
+    "ldr s16, [%x[weights]], #0x4\n"
+    "usubl v0.8h, v0.8b, v12.8b\n"
+    "usubl v16.8h, v16.8b, v11.8b\n"
+    "bgt 19b\n"
+    "20:"  // Output channel oddments: Kernel loop tail
+    "tbnz %x[kernel_points], #0, 21f\n"
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "b 23f\n"
+    "21:"  // Output channel oddments: Odd tail
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr d3, [x25, #0x0]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "ldr d2, [x27, #0x0]\n"
+    "usubl v3.8h, v3.8b, v12.8b\n"
+    "ldr s17, [%x[weights]], #0x4\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "usubl v2.8h, v2.8b, v12.8b\n"
+    "usubl v17.8h, v17.8b, v11.8b\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "b 23f\n"
+    "22:"  // Output channel oddments: Single kernel point
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "23:"  // Output channel oddments: Done
+    "sshl v6.4s, v6.4s, v9.4s\n"
+    "sshl v5.4s, v5.4s, v9.4s\n"
+    "sshl v4.4s, v4.4s, v9.4s\n"
+    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+    "sshl v31.4s, v31.4s, v9.4s\n"
+    "and v18.16b, v6.16b, v7.16b\n"
+    "and v16.16b, v5.16b, v7.16b\n"
+    "and v17.16b, v4.16b, v7.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v6.4s, v6.4s, v18.4s\n"
+    "sqadd v5.4s, v5.4s, v16.4s\n"
+    "sqadd v4.4s, v4.4s, v17.4s\n"
+    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+    "srshl v6.4s, v6.4s, v7.4s\n"
+    "srshl v5.4s, v5.4s, v7.4s\n"
+    "srshl v4.4s, v4.4s, v7.4s\n"
+    "and v16.16b, v31.16b, v7.16b\n"
+    "add v6.4s, v6.4s, v10.4s\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "add v4.4s, v4.4s, v10.4s\n"
+    "smin v6.4s, v6.4s, v13.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "smin v4.4s, v4.4s, v13.4s\n"
+    "smax v6.4s, v6.4s, v14.4s\n"
+    "smax v5.4s, v5.4s, v14.4s\n"
+    "smax v4.4s, v4.4s, v14.4s\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "sshl v30.4s, v30.4s, v9.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+    "sshl v29.4s, v29.4s, v9.4s\n"
+    "sshl v28.4s, v28.4s, v9.4s\n"
+    "srshl v31.4s, v31.4s, v7.4s\n"
+    "and v16.16b, v30.16b, v7.16b\n"
+    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+    "add v31.4s, v31.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "and v17.16b, v29.16b, v7.16b\n"
+    "smin v31.4s, v31.4s, v13.4s\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "and v16.16b, v28.16b, v7.16b\n"
+    "srshl v30.4s, v30.4s, v7.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "sqadd v29.4s, v29.4s, v17.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "srshl v29.4s, v29.4s, v7.4s\n"
+    "smin v30.4s, v30.4s, v13.4s\n"
+    "sqadd v28.4s, v28.4s, v16.4s\n"
+    "sshl v27.4s, v27.4s, v9.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "add v29.4s, v29.4s, v10.4s\n"
+    "srshl v28.4s, v28.4s, v7.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "smin v29.4s, v29.4s, v13.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v28.4s, v28.4s, v10.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+    "smin v28.4s, v28.4s, v13.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "sshl v26.4s, v26.4s, v9.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "and v16.16b, v27.16b, v7.16b\n"
+    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "and v17.16b, v26.16b, v7.16b\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "sshl v25.4s, v25.4s, v9.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+    "srshl v27.4s, v27.4s, v7.4s\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "sshl v24.4s, v24.4s, v9.4s\n"
+    "and v16.16b, v25.16b, v7.16b\n"
+    "add v27.4s, v27.4s, v10.4s\n"
+    "srshl v26.4s, v26.4s, v7.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v27.4s, v27.4s, v13.4s\n"
+    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+    "add v26.4s, v26.4s, v10.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "smin v26.4s, v26.4s, v13.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "and v17.16b, v24.16b, v7.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "srshl v25.4s, v25.4s, v7.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "sshl v23.4s, v23.4s, v9.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "add v25.4s, v25.4s, v10.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+    "smin v25.4s, v25.4s, v13.4s\n"
+    "sshl v22.4s, v22.4s, v9.4s\n"
+    "srshl v24.4s, v24.4s, v7.4s\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "and v16.16b, v23.16b, v7.16b\n"
+    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v10.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v24.4s, v24.4s, v13.4s\n"
+    "and v17.16b, v22.16b, v7.16b\n"
+    "sqadd v23.4s, v23.4s, v16.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshl v21.4s, v21.4s, v9.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "srshl v23.4s, v23.4s, v7.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+    "add v23.4s, v23.4s, v10.4s\n"
+    "sshl v20.4s, v20.4s, v9.4s\n"
+    "srshl v22.4s, v22.4s, v7.4s\n"
+    "smin v23.4s, v23.4s, v13.4s\n"
+    "and v16.16b, v21.16b, v7.16b\n"
+    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+    "smax v23.4s, v23.4s, v14.4s\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smin v22.4s, v22.4s, v13.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "smax v22.4s, v22.4s, v14.4s\n"
+    "and v16.16b, v20.16b, v7.16b\n"
+    "sshl v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "srshl v21.4s, v21.4s, v7.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "and v16.16b, v19.16b, v7.16b\n"
+    "srshl v20.4s, v20.4s, v7.4s\n"
+    "smax v21.4s, v21.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "smin v20.4s, v20.4s, v13.4s\n"
+    "srshl v19.4s, v19.4s, v7.4s\n"
+    "smax v20.4s, v20.4s, v14.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v19.4s, v19.4s, v13.4s\n"
+    "smax v19.4s, v19.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "tbz %x[n_output_channels], #1, 24f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x9\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x9\n"
+    "st1 { v6.h }[0], [x19]\n"
+    "add x21, x21, x9\n"
+    "st1 { v5.h }[0], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x9\n"
+    "st1 { v4.h }[0], [x21]\n"
+    "add x23, x23, x9\n"
+    "st1 { v31.h }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x9\n"
+    "st1 { v30.h }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x9\n"
+    "st1 { v29.h }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x9\n"
+    "st1 { v28.h }[0], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x9\n"
+    "st1 { v27.h }[0], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x9\n"
+    "st1 { v26.h }[0], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x9\n"
+    "st1 { v25.h }[0], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x9\n"
+    "st1 { v24.h }[0], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x9\n"
+    "st1 { v23.h }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x9\n"
+    "st1 { v22.h }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x9\n"
+    "st1 { v21.h }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x9\n"
+    "st1 { v20.h }[0], [x25]\n"
+    "add x9, x9, #0x2\n"
+    "st1 { v19.h }[0], [x26]\n"
+    "tbz %x[n_output_channels], #0, 25f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x9\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x9\n"
+    "st1 { v6.b }[2], [x19]\n"
+    "add x21, x21, x9\n"
+    "st1 { v5.b }[2], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x9\n"
+    "st1 { v4.b }[2], [x21]\n"
+    "add x23, x23, x9\n"
+    "st1 { v31.b }[2], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x9\n"
+    "st1 { v30.b }[2], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x9\n"
+    "st1 { v29.b }[2], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x9\n"
+    "st1 { v28.b }[2], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x9\n"
+    "st1 { v27.b }[2], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x9\n"
+    "st1 { v26.b }[2], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x9\n"
+    "st1 { v25.b }[2], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x9\n"
+    "st1 { v24.b }[2], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x9\n"
+    "st1 { v23.b }[2], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x9\n"
+    "st1 { v22.b }[2], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x9\n"
+    "st1 { v21.b }[2], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x9\n"
+    "st1 { v20.b }[2], [x25]\n"
+    "st1 { v19.b }[2], [x26]\n"
+    "b 25f\n"
+    "24:"  // Output channel oddments: Done: Store: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 25f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x9\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x9\n"
+    "st1 { v6.b }[0], [x19]\n"
+    "add x21, x21, x9\n"
+    "st1 { v5.b }[0], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x9\n"
+    "st1 { v4.b }[0], [x21]\n"
+    "add x23, x23, x9\n"
+    "st1 { v31.b }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x9\n"
+    "st1 { v30.b }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x9\n"
+    "st1 { v29.b }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x9\n"
+    "st1 { v28.b }[0], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x9\n"
+    "st1 { v27.b }[0], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x9\n"
+    "st1 { v26.b }[0], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x9\n"
+    "st1 { v25.b }[0], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x9\n"
+    "st1 { v24.b }[0], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x9\n"
+    "st1 { v23.b }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x9\n"
+    "st1 { v22.b }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x9\n"
+    "st1 { v21.b }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x9\n"
+    "st1 { v20.b }[0], [x25]\n"
+    "st1 { v19.b }[0], [x26]\n"
+    "25:"  // Output channel oddments: Done: Store: Bit 1: End
+
+    "26:"  // Done
+
+    : [weights] "+&r" (weights)
+    : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..1bacb5f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef int8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
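+  // A stride-1 depthfirst kernel reads an input tile of
+  // output + kernel - 1 points per dimension (2 + 3 - 1 = 4).
+  static_assert(input_rows == output_rows + kernel_rows - 1, "inconsistent input tile height");
+  static_assert(input_cols == output_cols + kernel_cols - 1, "inconsistent input tile width");
+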
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+  a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..8cbbfae
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1192 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
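+  // Everything the kernel needs is gathered into a single Params block so
+  // the inline assembly can reach each field through one base register and
+  // the offsetof() constants passed in the operand list below.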
+  struct Params
+  {
+    unsigned long n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[16];
+
+    Params(
+      unsigned long n_channels,
+      const uint8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
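+      // Rearrange the caller's row-major 4x4 patch of input pointers into
+      // the order the assembly consumes them: the window centre for the
+      // first output point (raw index 5) is needed first, against the
+      // centre weight.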
+      inptrs[0] = inptrs_raw[5];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[3];
+      inptrs[3] = inptrs_raw[6];
+      inptrs[4] = inptrs_raw[9];
+      inptrs[5] = inptrs_raw[12];
+      inptrs[6] = inptrs_raw[15];
+      inptrs[7] = inptrs_raw[1];
+      inptrs[8] = inptrs_raw[2];
+      inptrs[9] = inptrs_raw[10];
+      inptrs[10] = inptrs_raw[4];
+      inptrs[11] = inptrs_raw[7];
+      inptrs[12] = inptrs_raw[8];
+      inptrs[13] = inptrs_raw[11];
+      inptrs[14] = inptrs_raw[13];
+      inptrs[15] = inptrs_raw[14];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
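+  // The assembly below walks the channel dimension in blocks of 8
+  // (x12 = n_channels >> 3) and handles the remaining 0-7 channels in the
+  // "Oddments" section via tbz bit tests on n_channels. Each int32
+  // accumulator is requantized with a rounding-doubling multiply-high
+  // (sqrdmulh), a rounding right shift (srshl), the c_offset add and a
+  // clamp to [minval, maxval]. A rough per-lane scalar sketch, ignoring
+  // saturation and the exact tie-breaking fixup:
+  //
+  //   int32_t requantize(int32_t acc, int32_t mul, int rshift)
+  //   {
+  //     int32_t v = (int32_t) (((int64_t) acc * mul + (1LL << 30)) >> 31);
+  //     if (rshift > 0) v = (v + (1 << (rshift - 1))) >> rshift;
+  //     v += c_offset;
+  //     return std::max(minval, std::min(maxval, v));
+  //   }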
+  __asm__ __volatile__(
+    "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "mov x17, #0x0\n"
+    "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x15, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "add x14, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "lsr x12, x8, #0x3\n"
+    "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v21.16b }, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v17.16b }, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1r { v13.4s }, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v15.4s }, [x20]\n"
+    "ld1r { v14.4s }, [x19]\n"
+    "ldp x10, x9, [x21, #0x0]\n"
+    "ldp x28, x27, [x21, #0x10]\n"
+    "cbz x12, 3f\n"
+    "subs x12, x12, #0x1\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q11, [x19, #0x0]\n"
+    "mov v23.16b, v11.16b\n"
+    "ldr q26, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "mov v12.16b, v11.16b\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v24.16b, v11.16b\n"
+    "ldr d0, [x16, #0x0]\n"
+    "ldr d1, [x16, #0x8]\n"
+    "mov v9.16b, v26.16b\n"
+    "ldr d2, [x16, #0x10]\n"
+    "mov v22.16b, v26.16b\n"
+    "ldr d3, [x16, #0x18]\n"
+    "mov v10.16b, v26.16b\n"
+    "ldr d4, [x16, #0x20]\n"
+    "ssubl v0.8h, v0.8b, v17.8b\n"
+    "ldr d5, [x16, #0x28]\n"
+    "ssubl v1.8h, v1.8b, v17.8b\n"
+    "ldr d6, [x16, #0x30]\n"
+    "ssubl v2.8h, v2.8b, v17.8b\n"
+    "ldr d7, [x16, #0x38]\n"
+    "ssubl v3.8h, v3.8b, v17.8b\n"
+    "ldr d8, [x16, #0x40]\n"
+    "ssubl v4.8h, v4.8b, v17.8b\n"
+    "ldp x23, x22, [x14, #0x0]\n"
+    "ssubl v5.8h, v5.8b, v17.8b\n"
+    "ldp x21, x20, [x14, #0x10]\n"
+    "ssubl v6.8h, v6.8b, v17.8b\n"
+    "ssubl v7.8h, v7.8b, v17.8b\n"
+    "ldr x19, [x14, #0x20]\n"
+    "ssubl v8.8h, v8.8b, v17.8b\n"
+    "ldr d31, [x23, x17]\n"
+    "usubl v31.8h, v31.8b, v21.8b\n"
+    "ldr d30, [x22, x17]\n"
+    "ldr d29, [x21, x17]\n"
+    "usubl v30.8h, v30.8b, v21.8b\n"
+    "ldr d28, [x20, x17]\n"
+    "usubl v29.8h, v29.8b, v21.8b\n"
+    "ldr d27, [x19, x17]\n"
+    "usubl v28.8h, v28.8b, v21.8b\n"
+    "usubl v27.8h, v27.8b, v21.8b\n"
+    "beq 2f\n"
+    "1:"  // Loop
+    "smlal v11.4s, v31.4h, v4.4h\n"
+    "ldr x21, [x14, #0x28]\n"
+    "add x16, x16, #0x48\n"
+    "smlal2 v26.4s, v31.8h, v4.8h\n"
+    "ldr x20, [x14, #0x30]\n"
+    "subs x12, x12, #0x1\n"
+    "smlal v23.4s, v31.4h, v3.4h\n"
+    "ldr x26, [x14, #0x38]\n"
+    "smlal2 v9.4s, v31.8h, v3.8h\n"
+    "ldr x25, [x14, #0x40]\n"
+    "smlal v12.4s, v31.4h, v1.4h\n"
+    "ldr x19, [x14, #0x48]\n"
+    "smlal2 v22.4s, v31.8h, v1.8h\n"
+    "ldr x24, [x14, #0x50]\n"
+    "smlal v24.4s, v31.4h, v0.4h\n"
+    "ldr x23, [x14, #0x58]\n"
+    "smlal2 v10.4s, v31.8h, v0.8h\n"
+    "ldr d31, [x21, x17]\n"
+    "smlal v11.4s, v30.4h, v0.4h\n"
+    "ldr x22, [x14, #0x60]\n"
+    "smlal2 v26.4s, v30.8h, v0.8h\n"
+    "ldr d30, [x19, x17]\n"
+    "smlal v23.4s, v29.4h, v2.4h\n"
+    "ldr x21, [x14, #0x68]\n"
+    "smlal2 v9.4s, v29.8h, v2.8h\n"
+    "ldr d29, [x20, x17]\n"
+    "smlal v11.4s, v28.4h, v5.4h\n"
+    "ldr x20, [x14, #0x70]\n"
+    "smlal2 v26.4s, v28.8h, v5.8h\n"
+    "ldr x19, [x14, #0x78]\n"
+    "smlal v23.4s, v28.4h, v4.4h\n"
+    "ldr q25, [x13, #0x0]\n"
+    "smlal2 v9.4s, v28.8h, v4.8h\n"
+    "ldr q18, [x11, #0x0]\n"
+    "smlal v12.4s, v28.4h, v2.4h\n"
+    "ldr q16, [x13, #0x10]\n"
+    "add x13, x13, #0x20\n"
+    "smlal2 v22.4s, v28.8h, v2.8h\n"
+    "ldr q20, [x11, #0x10]\n"
+    "add x11, x11, #0x20\n"
+    "smlal v24.4s, v28.4h, v1.4h\n"
+    "smlal2 v10.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x26, x17]\n"
+    "usubl v31.8h, v31.8b, v21.8b\n"
+    "smlal v11.4s, v27.4h, v7.4h\n"
+    "smlal2 v26.4s, v27.8h, v7.8h\n"
+    "smlal v12.4s, v31.4h, v6.4h\n"
+    "smlal2 v22.4s, v31.8h, v6.8h\n"
+    "ldr d31, [x25, x17]\n"
+    "smlal v23.4s, v27.4h, v6.4h\n"
+    "smlal2 v9.4s, v27.8h, v6.8h\n"
+    "smlal v12.4s, v27.4h, v4.4h\n"
+    "smlal2 v22.4s, v27.8h, v4.8h\n"
+    "smlal v24.4s, v27.4h, v3.4h\n"
+    "smlal2 v10.4s, v27.8h, v3.8h\n"
+    "usubl v29.8h, v29.8b, v21.8b\n"
+    "usubl v28.8h, v28.8b, v21.8b\n"
+    "usubl v31.8h, v31.8b, v21.8b\n"
+    "smlal v24.4s, v29.4h, v8.4h\n"
+    "smlal2 v10.4s, v29.8h, v8.8h\n"
+    "ldr d29, [x24, x17]\n"
+    "smlal v11.4s, v28.4h, v1.4h\n"
+    "smlal2 v26.4s, v28.8h, v1.8h\n"
+    "smlal v23.4s, v28.4h, v0.4h\n"
+    "smlal2 v9.4s, v28.8h, v0.8h\n"
+    "ldr d28, [x23, x17]\n"
+    "smlal v11.4s, v31.4h, v2.4h\n"
+    "smlal2 v26.4s, v31.8h, v2.8h\n"
+    "smlal v23.4s, v31.4h, v1.4h\n"
+    "smlal2 v9.4s, v31.8h, v1.8h\n"
+    "ldr d31, [x22, x17]\n"
+    "usubl v30.8h, v30.8b, v21.8b\n"
+    "usubl v29.8h, v29.8b, v21.8b\n"
+    "usubl v28.8h, v28.8b, v21.8b\n"
+    "smlal v11.4s, v30.4h, v8.4h\n"
+    "smlal2 v26.4s, v30.8h, v8.8h\n"
+    "smlal v23.4s, v30.4h, v7.4h\n"
+    "smlal2 v9.4s, v30.8h, v7.8h\n"
+    "smlal v12.4s, v30.4h, v5.4h\n"
+    "smlal2 v22.4s, v30.8h, v5.8h\n"
+    "smlal v24.4s, v30.4h, v4.4h\n"
+    "smlal2 v10.4s, v30.8h, v4.8h\n"
+    "ldr d30, [x21, x17]\n"
+    "smlal v11.4s, v29.4h, v3.4h\n"
+    "smlal2 v26.4s, v29.8h, v3.8h\n"
+    "smlal v12.4s, v29.4h, v0.4h\n"
+    "smlal2 v22.4s, v29.8h, v0.8h\n"
+    "ldr d29, [x20, x17]\n"
+    "smlal v23.4s, v28.4h, v5.4h\n"
+    "smlal2 v9.4s, v28.8h, v5.8h\n"
+    "smlal v24.4s, v28.4h, v2.4h\n"
+    "smlal2 v10.4s, v28.8h, v2.8h\n"
+    "ldr d28, [x19, x17]\n"
+    "add x17, x17, #0x8\n"
+    "usubl v31.8h, v31.8b, v21.8b\n"
+    "usubl v30.8h, v30.8b, v21.8b\n"
+    "usubl v29.8h, v29.8b, v21.8b\n"
+    "smlal v11.4s, v31.4h, v6.4h\n"
+    "smlal2 v26.4s, v31.8h, v6.8h\n"
+    "smlal v12.4s, v31.4h, v3.4h\n"
+    "smlal2 v22.4s, v31.8h, v3.8h\n"
+    "smlal v23.4s, v30.4h, v8.4h\n"
+    "smlal2 v9.4s, v30.8h, v8.8h\n"
+    "smlal v24.4s, v30.4h, v5.4h\n"
+    "smlal2 v10.4s, v30.8h, v5.8h\n"
+    "smlal v12.4s, v29.4h, v7.4h\n"
+    "smlal2 v22.4s, v29.8h, v7.8h\n"
+    "smlal v24.4s, v29.4h, v6.4h\n"
+    "smlal2 v10.4s, v29.8h, v6.8h\n"
+    "usubl v28.8h, v28.8b, v21.8b\n"
+    "sqrdmulh v11.4s, v11.4s, v25.4s\n"
+    "sqrdmulh v26.4s, v26.4s, v16.4s\n"
+    "smlal v12.4s, v28.4h, v8.4h\n"
+    "smlal2 v22.4s, v28.8h, v8.8h\n"
+    "smlal v24.4s, v28.4h, v7.4h\n"
+    "smlal2 v10.4s, v28.8h, v7.8h\n"
+    "and v19.16b, v11.16b, v18.16b\n"
+    "and v5.16b, v26.16b, v20.16b\n"
+    "sqrdmulh v23.4s, v23.4s, v25.4s\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v5.4s, v5.4s, #0x1f\n"
+    "sqrdmulh v9.4s, v9.4s, v16.4s\n"
+    "sqadd v11.4s, v11.4s, v19.4s\n"
+    "sqadd v26.4s, v26.4s, v5.4s\n"
+    "and v28.16b, v23.16b, v18.16b\n"
+    "and v8.16b, v9.16b, v20.16b\n"
+    "srshl v11.4s, v11.4s, v18.4s\n"
+    "srshl v26.4s, v26.4s, v20.4s\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "sshr v8.4s, v8.4s, #0x1f\n"
+    "add v11.4s, v11.4s, v13.4s\n"
+    "add v26.4s, v26.4s, v13.4s\n"
+    "sqadd v23.4s, v23.4s, v28.4s\n"
+    "smin v11.4s, v11.4s, v14.4s\n"
+    "smin v26.4s, v26.4s, v14.4s\n"
+    "sqadd v9.4s, v9.4s, v8.4s\n"
+    "smax v11.4s, v11.4s, v15.4s\n"
+    "smax v26.4s, v26.4s, v15.4s\n"
+    "srshl v23.4s, v23.4s, v18.4s\n"
+    "srshl v9.4s, v9.4s, v20.4s\n"
+    "uzp1 v11.16b, v11.16b, v26.16b\n"
+    "sqrdmulh v12.4s, v12.4s, v25.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "str d11, [x10, x15]\n"
+    "add v23.4s, v23.4s, v13.4s\n"
+    "add v9.4s, v9.4s, v13.4s\n"
+    "and v1.16b, v12.16b, v18.16b\n"
+    "sqrdmulh v22.4s, v22.4s, v16.4s\n"
+    "smin v23.4s, v23.4s, v14.4s\n"
+    "smin v9.4s, v9.4s, v14.4s\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "smax v23.4s, v23.4s, v15.4s\n"
+    "smax v9.4s, v9.4s, v15.4s\n"
+    "sqadd v12.4s, v12.4s, v1.4s\n"
+    "and v0.16b, v22.16b, v20.16b\n"
+    "uzp1 v23.16b, v23.16b, v9.16b\n"
+    "sqrdmulh v24.4s, v24.4s, v25.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str d23, [x9, x15]\n"
+    "srshl v12.4s, v12.4s, v18.4s\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "and v26.16b, v24.16b, v18.16b\n"
+    "sqrdmulh v10.4s, v10.4s, v16.4s\n"
+    "sqadd v22.4s, v22.4s, v0.4s\n"
+    "add v12.4s, v12.4s, v13.4s\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "and v16.16b, v10.16b, v20.16b\n"
+    "smin v12.4s, v12.4s, v14.4s\n"
+    "srshl v22.4s, v22.4s, v20.4s\n"
+    "sqadd v24.4s, v24.4s, v26.4s\n"
+    "smax v12.4s, v12.4s, v15.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "srshl v24.4s, v24.4s, v18.4s\n"
+    "sqadd v10.4s, v10.4s, v16.4s\n"
+    "smin v22.4s, v22.4s, v14.4s\n"
+    "add v24.4s, v24.4s, v13.4s\n"
+    "smax v22.4s, v22.4s, v15.4s\n"
+    "srshl v10.4s, v10.4s, v20.4s\n"
+    "smin v24.4s, v24.4s, v14.4s\n"
+    "uzp1 v12.16b, v12.16b, v22.16b\n"
+    "add v10.4s, v10.4s, v13.4s\n"
+    "uzp1 v12.16b, v12.16b, v12.16b\n"
+    "str d12, [x28, x15]\n"
+    "smax v24.4s, v24.4s, v15.4s\n"
+    "smin v10.4s, v10.4s, v14.4s\n"
+    "smax v10.4s, v10.4s, v15.4s\n"
+    "uzp1 v24.16b, v24.16b, v10.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str d24, [x27, x15]\n"
+    "add x15, x15, #0x8\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q11, [x19, #0x0]\n"
+    "mov v23.16b, v11.16b\n"
+    "ldr q26, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "mov v12.16b, v11.16b\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v24.16b, v11.16b\n"
+    "ldr d0, [x16, #0x0]\n"
+    "ldr d1, [x16, #0x8]\n"
+    "mov v9.16b, v26.16b\n"
+    "ldr d2, [x16, #0x10]\n"
+    "mov v22.16b, v26.16b\n"
+    "ldr d3, [x16, #0x18]\n"
+    "mov v10.16b, v26.16b\n"
+    "ldr d4, [x16, #0x20]\n"
+    "ssubl v0.8h, v0.8b, v17.8b\n"
+    "ldr d5, [x16, #0x28]\n"
+    "ssubl v1.8h, v1.8b, v17.8b\n"
+    "ldr d6, [x16, #0x30]\n"
+    "ssubl v2.8h, v2.8b, v17.8b\n"
+    "ldr d7, [x16, #0x38]\n"
+    "ssubl v3.8h, v3.8b, v17.8b\n"
+    "ldr d8, [x16, #0x40]\n"
+    "ssubl v4.8h, v4.8b, v17.8b\n"
+    "ldp x23, x22, [x14, #0x0]\n"
+    "ssubl v5.8h, v5.8b, v17.8b\n"
+    "ldp x21, x20, [x14, #0x10]\n"
+    "ssubl v6.8h, v6.8b, v17.8b\n"
+    "ssubl v7.8h, v7.8b, v17.8b\n"
+    "ldr x19, [x14, #0x20]\n"
+    "ssubl v8.8h, v8.8b, v17.8b\n"
+    "ldr d31, [x23, x17]\n"
+    "usubl v31.8h, v31.8b, v21.8b\n"
+    "ldr d30, [x22, x17]\n"
+    "ldr d29, [x21, x17]\n"
+    "usubl v30.8h, v30.8b, v21.8b\n"
+    "ldr d28, [x20, x17]\n"
+    "usubl v29.8h, v29.8b, v21.8b\n"
+    "ldr d27, [x19, x17]\n"
+    "usubl v28.8h, v28.8b, v21.8b\n"
+    "usubl v27.8h, v27.8b, v21.8b\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    "smlal v11.4s, v31.4h, v4.4h\n"
+    "ldr x21, [x14, #0x28]\n"
+    "tst x8, #0x7\n"
+    "smlal2 v26.4s, v31.8h, v4.8h\n"
+    "ldr x20, [x14, #0x30]\n"
+    "smlal v23.4s, v31.4h, v3.4h\n"
+    "ldr x26, [x14, #0x38]\n"
+    "smlal2 v9.4s, v31.8h, v3.8h\n"
+    "ldr x25, [x14, #0x40]\n"
+    "smlal v12.4s, v31.4h, v1.4h\n"
+    "ldr x19, [x14, #0x48]\n"
+    "smlal2 v22.4s, v31.8h, v1.8h\n"
+    "ldr x24, [x14, #0x50]\n"
+    "smlal v24.4s, v31.4h, v0.4h\n"
+    "ldr x23, [x14, #0x58]\n"
+    "smlal2 v10.4s, v31.8h, v0.8h\n"
+    "ldr d31, [x21, x17]\n"
+    "smlal v11.4s, v30.4h, v0.4h\n"
+    "ldr x22, [x14, #0x60]\n"
+    "smlal2 v26.4s, v30.8h, v0.8h\n"
+    "ldr d30, [x19, x17]\n"
+    "smlal v23.4s, v29.4h, v2.4h\n"
+    "ldr x21, [x14, #0x68]\n"
+    "smlal2 v9.4s, v29.8h, v2.8h\n"
+    "ldr d29, [x20, x17]\n"
+    "smlal v11.4s, v28.4h, v5.4h\n"
+    "ldr x20, [x14, #0x70]\n"
+    "smlal2 v26.4s, v28.8h, v5.8h\n"
+    "ldr x19, [x14, #0x78]\n"
+    "smlal v23.4s, v28.4h, v4.4h\n"
+    "ldr q25, [x13, #0x0]\n"
+    "smlal2 v9.4s, v28.8h, v4.8h\n"
+    "ldr q18, [x11, #0x0]\n"
+    "smlal v12.4s, v28.4h, v2.4h\n"
+    "ldr q16, [x13, #0x10]\n"
+    "add x13, x13, #0x20\n"
+    "smlal2 v22.4s, v28.8h, v2.8h\n"
+    "ldr q20, [x11, #0x10]\n"
+    "add x11, x11, #0x20\n"
+    "smlal v24.4s, v28.4h, v1.4h\n"
+    "smlal2 v10.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x26, x17]\n"
+    "usubl v31.8h, v31.8b, v21.8b\n"
+    "smlal v11.4s, v27.4h, v7.4h\n"
+    "smlal2 v26.4s, v27.8h, v7.8h\n"
+    "smlal v12.4s, v31.4h, v6.4h\n"
+    "smlal2 v22.4s, v31.8h, v6.8h\n"
+    "ldr d31, [x25, x17]\n"
+    "smlal v23.4s, v27.4h, v6.4h\n"
+    "smlal2 v9.4s, v27.8h, v6.8h\n"
+    "smlal v12.4s, v27.4h, v4.4h\n"
+    "smlal2 v22.4s, v27.8h, v4.8h\n"
+    "smlal v24.4s, v27.4h, v3.4h\n"
+    "smlal2 v10.4s, v27.8h, v3.8h\n"
+    "usubl v29.8h, v29.8b, v21.8b\n"
+    "usubl v28.8h, v28.8b, v21.8b\n"
+    "usubl v31.8h, v31.8b, v21.8b\n"
+    "smlal v24.4s, v29.4h, v8.4h\n"
+    "smlal2 v10.4s, v29.8h, v8.8h\n"
+    "ldr d29, [x24, x17]\n"
+    "smlal v11.4s, v28.4h, v1.4h\n"
+    "smlal2 v26.4s, v28.8h, v1.8h\n"
+    "smlal v23.4s, v28.4h, v0.4h\n"
+    "smlal2 v9.4s, v28.8h, v0.8h\n"
+    "ldr d28, [x23, x17]\n"
+    "smlal v11.4s, v31.4h, v2.4h\n"
+    "smlal2 v26.4s, v31.8h, v2.8h\n"
+    "smlal v23.4s, v31.4h, v1.4h\n"
+    "smlal2 v9.4s, v31.8h, v1.8h\n"
+    "ldr d31, [x22, x17]\n"
+    "usubl v30.8h, v30.8b, v21.8b\n"
+    "usubl v29.8h, v29.8b, v21.8b\n"
+    "usubl v28.8h, v28.8b, v21.8b\n"
+    "smlal v11.4s, v30.4h, v8.4h\n"
+    "smlal2 v26.4s, v30.8h, v8.8h\n"
+    "smlal v23.4s, v30.4h, v7.4h\n"
+    "smlal2 v9.4s, v30.8h, v7.8h\n"
+    "smlal v12.4s, v30.4h, v5.4h\n"
+    "smlal2 v22.4s, v30.8h, v5.8h\n"
+    "smlal v24.4s, v30.4h, v4.4h\n"
+    "smlal2 v10.4s, v30.8h, v4.8h\n"
+    "ldr d30, [x21, x17]\n"
+    "smlal v11.4s, v29.4h, v3.4h\n"
+    "smlal2 v26.4s, v29.8h, v3.8h\n"
+    "smlal v12.4s, v29.4h, v0.4h\n"
+    "smlal2 v22.4s, v29.8h, v0.8h\n"
+    "ldr d29, [x20, x17]\n"
+    "smlal v23.4s, v28.4h, v5.4h\n"
+    "smlal2 v9.4s, v28.8h, v5.8h\n"
+    "smlal v24.4s, v28.4h, v2.4h\n"
+    "smlal2 v10.4s, v28.8h, v2.8h\n"
+    "ldr d28, [x19, x17]\n"
+    "add x17, x17, #0x8\n"
+    "usubl v31.8h, v31.8b, v21.8b\n"
+    "usubl v30.8h, v30.8b, v21.8b\n"
+    "usubl v29.8h, v29.8b, v21.8b\n"
+    "smlal v11.4s, v31.4h, v6.4h\n"
+    "smlal2 v26.4s, v31.8h, v6.8h\n"
+    "smlal v12.4s, v31.4h, v3.4h\n"
+    "smlal2 v22.4s, v31.8h, v3.8h\n"
+    "smlal v23.4s, v30.4h, v8.4h\n"
+    "smlal2 v9.4s, v30.8h, v8.8h\n"
+    "smlal v24.4s, v30.4h, v5.4h\n"
+    "smlal2 v10.4s, v30.8h, v5.8h\n"
+    "smlal v12.4s, v29.4h, v7.4h\n"
+    "smlal2 v22.4s, v29.8h, v7.8h\n"
+    "smlal v24.4s, v29.4h, v6.4h\n"
+    "smlal2 v10.4s, v29.8h, v6.8h\n"
+    "usubl v28.8h, v28.8b, v21.8b\n"
+    "sqrdmulh v11.4s, v11.4s, v25.4s\n"
+    "sqrdmulh v26.4s, v26.4s, v16.4s\n"
+    "smlal v12.4s, v28.4h, v8.4h\n"
+    "smlal2 v22.4s, v28.8h, v8.8h\n"
+    "smlal v24.4s, v28.4h, v7.4h\n"
+    "smlal2 v10.4s, v28.8h, v7.8h\n"
+    "and v19.16b, v11.16b, v18.16b\n"
+    "and v5.16b, v26.16b, v20.16b\n"
+    "sqrdmulh v23.4s, v23.4s, v25.4s\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v5.4s, v5.4s, #0x1f\n"
+    "sqrdmulh v9.4s, v9.4s, v16.4s\n"
+    "sqadd v11.4s, v11.4s, v19.4s\n"
+    "sqadd v26.4s, v26.4s, v5.4s\n"
+    "and v28.16b, v23.16b, v18.16b\n"
+    "and v8.16b, v9.16b, v20.16b\n"
+    "srshl v11.4s, v11.4s, v18.4s\n"
+    "srshl v26.4s, v26.4s, v20.4s\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "sshr v8.4s, v8.4s, #0x1f\n"
+    "add v11.4s, v11.4s, v13.4s\n"
+    "add v26.4s, v26.4s, v13.4s\n"
+    "sqadd v23.4s, v23.4s, v28.4s\n"
+    "smin v11.4s, v11.4s, v14.4s\n"
+    "smin v26.4s, v26.4s, v14.4s\n"
+    "sqadd v9.4s, v9.4s, v8.4s\n"
+    "smax v11.4s, v11.4s, v15.4s\n"
+    "smax v26.4s, v26.4s, v15.4s\n"
+    "srshl v23.4s, v23.4s, v18.4s\n"
+    "srshl v9.4s, v9.4s, v20.4s\n"
+    "uzp1 v11.16b, v11.16b, v26.16b\n"
+    "sqrdmulh v12.4s, v12.4s, v25.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "str d11, [x10, x15]\n"
+    "add v23.4s, v23.4s, v13.4s\n"
+    "add v9.4s, v9.4s, v13.4s\n"
+    "and v1.16b, v12.16b, v18.16b\n"
+    "sqrdmulh v22.4s, v22.4s, v16.4s\n"
+    "smin v23.4s, v23.4s, v14.4s\n"
+    "smin v9.4s, v9.4s, v14.4s\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "smax v23.4s, v23.4s, v15.4s\n"
+    "smax v9.4s, v9.4s, v15.4s\n"
+    "sqadd v12.4s, v12.4s, v1.4s\n"
+    "and v0.16b, v22.16b, v20.16b\n"
+    "uzp1 v23.16b, v23.16b, v9.16b\n"
+    "sqrdmulh v24.4s, v24.4s, v25.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str d23, [x9, x15]\n"
+    "srshl v12.4s, v12.4s, v18.4s\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "and v26.16b, v24.16b, v18.16b\n"
+    "sqrdmulh v10.4s, v10.4s, v16.4s\n"
+    "sqadd v22.4s, v22.4s, v0.4s\n"
+    "add v12.4s, v12.4s, v13.4s\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "and v16.16b, v10.16b, v20.16b\n"
+    "smin v12.4s, v12.4s, v14.4s\n"
+    "srshl v22.4s, v22.4s, v20.4s\n"
+    "sqadd v24.4s, v24.4s, v26.4s\n"
+    "smax v12.4s, v12.4s, v15.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "srshl v24.4s, v24.4s, v18.4s\n"
+    "sqadd v10.4s, v10.4s, v16.4s\n"
+    "smin v22.4s, v22.4s, v14.4s\n"
+    "add v24.4s, v24.4s, v13.4s\n"
+    "smax v22.4s, v22.4s, v15.4s\n"
+    "srshl v10.4s, v10.4s, v20.4s\n"
+    "smin v24.4s, v24.4s, v14.4s\n"
+    "uzp1 v12.16b, v12.16b, v22.16b\n"
+    "add v10.4s, v10.4s, v13.4s\n"
+    "uzp1 v12.16b, v12.16b, v12.16b\n"
+    "str d12, [x28, x15]\n"
+    "smax v24.4s, v24.4s, v15.4s\n"
+    "smin v10.4s, v10.4s, v14.4s\n"
+    "smax v10.4s, v10.4s, v15.4s\n"
+    "uzp1 v24.16b, v24.16b, v10.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str d24, [x27, x15]\n"
+    "add x15, x15, #0x8\n"
+    "beq 64f\n"
+    "add x16, x16, #0x48\n"
+    "3:"  // Oddments
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "tbz x8, #2, 5f\n"
+    "ld1 { v11.4s }, [x19], #0x10\n"
+    "tbz x8, #1, 4f\n"
+    "ld1 { v26.d }[0], [x19], #0x8\n"
+    "tbz x8, #0, 7f\n"
+    "ld1 { v26.s }[2], [x19]\n"
+    "b 7f\n"
+    "4:"  // Oddments: Load bias: Bit 2: Bit 1: Unset
+    "tbz x8, #0, 7f\n"
+    "ld1 { v26.s }[0], [x19]\n"
+    "b 7f\n"
+    "5:"  // Oddments: Load bias: Bit 2: Unset
+    "tbz x8, #1, 6f\n"
+    "ld1 { v11.d }[0], [x19], #0x8\n"
+    "tbz x8, #0, 7f\n"
+    "ld1 { v11.s }[2], [x19]\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 7f\n"
+    "ld1 { v11.s }[0], [x19]\n"
+    "7:"  // Oddments: Load bias: Bit 2: End
+    "mov v23.16b, v11.16b\n"
+    "ldr d0, [x16, #0x0]\n"
+    "mov v9.16b, v26.16b\n"
+    "ldr d1, [x16, #0x8]\n"
+    "mov v12.16b, v11.16b\n"
+    "ldr d2, [x16, #0x10]\n"
+    "mov v22.16b, v26.16b\n"
+    "ldr d3, [x16, #0x18]\n"
+    "mov v24.16b, v11.16b\n"
+    "ldr d4, [x16, #0x20]\n"
+    "mov v10.16b, v26.16b\n"
+    "ldr d5, [x16, #0x28]\n"
+    "ssubl v0.8h, v0.8b, v17.8b\n"
+    "ldr d6, [x16, #0x30]\n"
+    "ssubl v1.8h, v1.8b, v17.8b\n"
+    "ldr d7, [x16, #0x38]\n"
+    "ssubl v2.8h, v2.8b, v17.8b\n"
+    "ldr d8, [x16, #0x40]\n"
+    "ssubl v3.8h, v3.8b, v17.8b\n"
+    "ldp x23, x22, [x14, #0x0]\n"
+    "add x23, x23, x17\n"
+    "ssubl v4.8h, v4.8b, v17.8b\n"
+    "ldp x21, x20, [x14, #0x10]\n"
+    "ssubl v5.8h, v5.8b, v17.8b\n"
+    "ldr x19, [x14, #0x20]\n"
+    "ssubl v6.8h, v6.8b, v17.8b\n"
+    "add x22, x22, x17\n"
+    "ssubl v7.8h, v7.8b, v17.8b\n"
+    "add x21, x21, x17\n"
+    "ssubl v8.8h, v8.8b, v17.8b\n"
+    "add x20, x20, x17\n"
+    "add x19, x19, x17\n"
+    "tbz x8, #2, 9f\n"
+    "ld1 { v31.s }[0], [x23], #0x4\n"
+    "ld1 { v30.s }[0], [x22], #0x4\n"
+    "ld1 { v29.s }[0], [x21], #0x4\n"
+    "ld1 { v28.s }[0], [x20], #0x4\n"
+    "ld1 { v27.s }[0], [x19], #0x4\n"
+    "tbz x8, #1, 8f\n"
+    "ld1 { v31.h }[2], [x23], #0x2\n"
+    "ld1 { v30.h }[2], [x22], #0x2\n"
+    "ld1 { v29.h }[2], [x21], #0x2\n"
+    "ld1 { v28.h }[2], [x20], #0x2\n"
+    "ld1 { v27.h }[2], [x19], #0x2\n"
+    "tbz x8, #0, 11f\n"
+    "ld1 { v31.b }[6], [x23]\n"
+    "ld1 { v30.b }[6], [x22]\n"
+    "ld1 { v29.b }[6], [x21]\n"
+    "ld1 { v28.b }[6], [x20]\n"
+    "ld1 { v27.b }[6], [x19]\n"
+    "b 11f\n"
+    "8:"  // Oddments: Initial loads: Bit 2: Bit 1: Unset
+    "tbz x8, #0, 11f\n"
+    "ld1 { v31.b }[4], [x23]\n"
+    "ld1 { v30.b }[4], [x22]\n"
+    "ld1 { v29.b }[4], [x21]\n"
+    "ld1 { v28.b }[4], [x20]\n"
+    "ld1 { v27.b }[4], [x19]\n"
+    "b 11f\n"
+    "9:"  // Oddments: Initial loads: Bit 2: Unset
+    "tbz x8, #1, 10f\n"
+    "ld1 { v31.h }[0], [x23], #0x2\n"
+    "ld1 { v30.h }[0], [x22], #0x2\n"
+    "ld1 { v29.h }[0], [x21], #0x2\n"
+    "ld1 { v28.h }[0], [x20], #0x2\n"
+    "ld1 { v27.h }[0], [x19], #0x2\n"
+    "tbz x8, #0, 11f\n"
+    "ld1 { v31.b }[2], [x23]\n"
+    "ld1 { v30.b }[2], [x22]\n"
+    "ld1 { v29.b }[2], [x21]\n"
+    "ld1 { v28.b }[2], [x20]\n"
+    "ld1 { v27.b }[2], [x19]\n"
+    "b 11f\n"
+    "10:"  // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 11f\n"
+    "ld1 { v31.b }[0], [x23]\n"
+    "ld1 { v30.b }[0], [x22]\n"
+    "ld1 { v29.b }[0], [x21]\n"
+    "ld1 { v28.b }[0], [x20]\n"
+    "ld1 { v27.b }[0], [x19]\n"
+    "11:"  // Oddments: Initial loads: Bit 2: End
+    "usubl v31.8h, v31.8b, v21.8b\n"
+    "ldr x21, [x14, #0x28]\n"
+    "add x21, x21, x17\n"
+    "usubl v30.8h, v30.8b, v21.8b\n"
+    "usubl v29.8h, v29.8b, v21.8b\n"
+    "usubl v28.8h, v28.8b, v21.8b\n"
+    "usubl v27.8h, v27.8b, v21.8b\n"
+    "smlal v11.4s, v31.4h, v4.4h\n"
+    "smlal2 v26.4s, v31.8h, v4.8h\n"
+    "smlal v23.4s, v31.4h, v3.4h\n"
+    "smlal2 v9.4s, v31.8h, v3.8h\n"
+    "smlal v12.4s, v31.4h, v1.4h\n"
+    "smlal2 v22.4s, v31.8h, v1.8h\n"
+    "smlal v24.4s, v31.4h, v0.4h\n"
+    "smlal2 v10.4s, v31.8h, v0.8h\n"
+    "smlal v11.4s, v30.4h, v0.4h\n"
+    "smlal2 v26.4s, v30.8h, v0.8h\n"
+    "smlal v23.4s, v29.4h, v2.4h\n"
+    "smlal2 v9.4s, v29.8h, v2.8h\n"
+    "smlal v11.4s, v28.4h, v5.4h\n"
+    "smlal2 v26.4s, v28.8h, v5.8h\n"
+    "smlal v23.4s, v28.4h, v4.4h\n"
+    "smlal2 v9.4s, v28.8h, v4.8h\n"
+    "smlal v12.4s, v28.4h, v2.4h\n"
+    "smlal2 v22.4s, v28.8h, v2.8h\n"
+    "smlal v24.4s, v28.4h, v1.4h\n"
+    "smlal2 v10.4s, v28.8h, v1.8h\n"
+    "tbz x8, #2, 13f\n"
+    "ld1 { v31.s }[0], [x21], #0x4\n"
+    "tbz x8, #1, 12f\n"
+    "ld1 { v31.h }[2], [x21], #0x2\n"
+    "tbz x8, #0, 15f\n"
+    "ld1 { v31.b }[6], [x21]\n"
+    "b 15f\n"
+    "12:"  // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 15f\n"
+    "ld1 { v31.b }[4], [x21]\n"
+    "b 15f\n"
+    "13:"  // Oddments: Load (3, 0): Bit 2: Unset
+    "tbz x8, #1, 14f\n"
+    "ld1 { v31.h }[0], [x21], #0x2\n"
+    "tbz x8, #0, 15f\n"
+    "ld1 { v31.b }[2], [x21]\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 15f\n"
+    "ld1 { v31.b }[0], [x21]\n"
+    "15:"  // Oddments: Load (3, 0): Bit 2: End
+    "usubl v31.8h, v31.8b, v21.8b\n"
+    "ldr x20, [x14, #0x30]\n"
+    "smlal v11.4s, v27.4h, v7.4h\n"
+    "add x20, x20, x17\n"
+    "smlal v12.4s, v31.4h, v6.4h\n"
+    "smlal2 v22.4s, v31.8h, v6.8h\n"
+    "smlal2 v26.4s, v27.8h, v7.8h\n"
+    "smlal v23.4s, v27.4h, v6.4h\n"
+    "smlal2 v9.4s, v27.8h, v6.8h\n"
+    "smlal v12.4s, v27.4h, v4.4h\n"
+    "smlal2 v22.4s, v27.8h, v4.8h\n"
+    "smlal v24.4s, v27.4h, v3.4h\n"
+    "smlal2 v10.4s, v27.8h, v3.8h\n"
+    "tbz x8, #2, 17f\n"
+    "ld1 { v29.s }[0], [x20], #0x4\n"
+    "tbz x8, #1, 16f\n"
+    "ld1 { v29.h }[2], [x20], #0x2\n"
+    "tbz x8, #0, 19f\n"
+    "ld1 { v29.b }[6], [x20]\n"
+    "b 19f\n"
+    "16:"  // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 19f\n"
+    "ld1 { v29.b }[4], [x20]\n"
+    "b 19f\n"
+    "17:"  // Oddments: Load (3, 3): Bit 2: Unset
+    "tbz x8, #1, 18f\n"
+    "ld1 { v29.h }[0], [x20], #0x2\n"
+    "tbz x8, #0, 19f\n"
+    "ld1 { v29.b }[2], [x20]\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 19f\n"
+    "ld1 { v29.b }[0], [x20]\n"
+    "19:"  // Oddments: Load (3, 3): Bit 2: End
+    "usubl v29.8h, v29.8b, v21.8b\n"
+    "ldr x26, [x14, #0x38]\n"
+    "smlal v24.4s, v29.4h, v8.4h\n"
+    "add x26, x26, x17\n"
+    "smlal2 v10.4s, v29.8h, v8.8h\n"
+    "tbz x8, #2, 21f\n"
+    "ld1 { v28.s }[0], [x26], #0x4\n"
+    "tbz x8, #1, 20f\n"
+    "ld1 { v28.h }[2], [x26], #0x2\n"
+    "tbz x8, #0, 23f\n"
+    "ld1 { v28.b }[6], [x26]\n"
+    "b 23f\n"
+    "20:"  // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 23f\n"
+    "ld1 { v28.b }[4], [x26]\n"
+    "b 23f\n"
+    "21:"  // Oddments: Load (0, 1): Bit 2: Unset
+    "tbz x8, #1, 22f\n"
+    "ld1 { v28.h }[0], [x26], #0x2\n"
+    "tbz x8, #0, 23f\n"
+    "ld1 { v28.b }[2], [x26]\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 23f\n"
+    "ld1 { v28.b }[0], [x26]\n"
+    "23:"  // Oddments: Load (0, 1): Bit 2: End
+    "usubl v28.8h, v28.8b, v21.8b\n"
+    "ldr x25, [x14, #0x40]\n"
+    "smlal v11.4s, v28.4h, v1.4h\n"
+    "add x25, x25, x17\n"
+    "smlal2 v26.4s, v28.8h, v1.8h\n"
+    "smlal v23.4s, v28.4h, v0.4h\n"
+    "smlal2 v9.4s, v28.8h, v0.8h\n"
+    "tbz x8, #2, 25f\n"
+    "ld1 { v31.s }[0], [x25], #0x4\n"
+    "tbz x8, #1, 24f\n"
+    "ld1 { v31.h }[2], [x25], #0x2\n"
+    "tbz x8, #0, 27f\n"
+    "ld1 { v31.b }[6], [x25]\n"
+    "b 27f\n"
+    "24:"  // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 27f\n"
+    "ld1 { v31.b }[4], [x25]\n"
+    "b 27f\n"
+    "25:"  // Oddments: Load (0, 2): Bit 2: Unset
+    "tbz x8, #1, 26f\n"
+    "ld1 { v31.h }[0], [x25], #0x2\n"
+    "tbz x8, #0, 27f\n"
+    "ld1 { v31.b }[2], [x25]\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 27f\n"
+    "ld1 { v31.b }[0], [x25]\n"
+    "27:"  // Oddments: Load (0, 2): Bit 2: End
+    "usubl v31.8h, v31.8b, v21.8b\n"
+    "ldr x19, [x14, #0x48]\n"
+    "smlal v11.4s, v31.4h, v2.4h\n"
+    "add x19, x19, x17\n"
+    "smlal2 v26.4s, v31.8h, v2.8h\n"
+    "smlal v23.4s, v31.4h, v1.4h\n"
+    "smlal2 v9.4s, v31.8h, v1.8h\n"
+    "tbz x8, #2, 29f\n"
+    "ld1 { v30.s }[0], [x19], #0x4\n"
+    "tbz x8, #1, 28f\n"
+    "ld1 { v30.h }[2], [x19], #0x2\n"
+    "tbz x8, #0, 31f\n"
+    "ld1 { v30.b }[6], [x19]\n"
+    "b 31f\n"
+    "28:"  // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 31f\n"
+    "ld1 { v30.b }[4], [x19]\n"
+    "b 31f\n"
+    "29:"  // Oddments: Load (2, 2): Bit 2: Unset
+    "tbz x8, #1, 30f\n"
+    "ld1 { v30.h }[0], [x19], #0x2\n"
+    "tbz x8, #0, 31f\n"
+    "ld1 { v30.b }[2], [x19]\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 31f\n"
+    "ld1 { v30.b }[0], [x19]\n"
+    "31:"  // Oddments: Load (2, 2): Bit 2: End
+    "usubl v30.8h, v30.8b, v21.8b\n"
+    "ldr x24, [x14, #0x50]\n"
+    "smlal v11.4s, v30.4h, v8.4h\n"
+    "add x24, x24, x17\n"
+    "smlal2 v26.4s, v30.8h, v8.8h\n"
+    "smlal v23.4s, v30.4h, v7.4h\n"
+    "smlal2 v9.4s, v30.8h, v7.8h\n"
+    "smlal v12.4s, v30.4h, v5.4h\n"
+    "smlal2 v22.4s, v30.8h, v5.8h\n"
+    "smlal v24.4s, v30.4h, v4.4h\n"
+    "smlal2 v10.4s, v30.8h, v4.8h\n"
+    "tbz x8, #2, 33f\n"
+    "ld1 { v29.s }[0], [x24], #0x4\n"
+    "tbz x8, #1, 32f\n"
+    "ld1 { v29.h }[2], [x24], #0x2\n"
+    "tbz x8, #0, 35f\n"
+    "ld1 { v29.b }[6], [x24]\n"
+    "b 35f\n"
+    "32:"  // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 35f\n"
+    "ld1 { v29.b }[4], [x24]\n"
+    "b 35f\n"
+    "33:"  // Oddments: Load (1, 0): Bit 2: Unset
+    "tbz x8, #1, 34f\n"
+    "ld1 { v29.h }[0], [x24], #0x2\n"
+    "tbz x8, #0, 35f\n"
+    "ld1 { v29.b }[2], [x24]\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 35f\n"
+    "ld1 { v29.b }[0], [x24]\n"
+    "35:"  // Oddments: Load (1, 0): Bit 2: End
+    "usubl v29.8h, v29.8b, v21.8b\n"
+    "ldr x23, [x14, #0x58]\n"
+    "smlal v11.4s, v29.4h, v3.4h\n"
+    "add x23, x23, x17\n"
+    "smlal2 v26.4s, v29.8h, v3.8h\n"
+    "smlal v12.4s, v29.4h, v0.4h\n"
+    "smlal2 v22.4s, v29.8h, v0.8h\n"
+    "tbz x8, #2, 37f\n"
+    "ld1 { v28.s }[0], [x23], #0x4\n"
+    "tbz x8, #1, 36f\n"
+    "ld1 { v28.h }[2], [x23], #0x2\n"
+    "tbz x8, #0, 39f\n"
+    "ld1 { v28.b }[6], [x23]\n"
+    "b 39f\n"
+    "36:"  // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 39f\n"
+    "ld1 { v28.b }[4], [x23]\n"
+    "b 39f\n"
+    "37:"  // Oddments: Load (1, 3): Bit 2: Unset
+    "tbz x8, #1, 38f\n"
+    "ld1 { v28.h }[0], [x23], #0x2\n"
+    "tbz x8, #0, 39f\n"
+    "ld1 { v28.b }[2], [x23]\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 39f\n"
+    "ld1 { v28.b }[0], [x23]\n"
+    "39:"  // Oddments: Load (1, 3): Bit 2: End
+    "usubl v28.8h, v28.8b, v21.8b\n"
+    "ldr x22, [x14, #0x60]\n"
+    "smlal v23.4s, v28.4h, v5.4h\n"
+    "add x22, x22, x17\n"
+    "smlal2 v9.4s, v28.8h, v5.8h\n"
+    "smlal v24.4s, v28.4h, v2.4h\n"
+    "smlal2 v10.4s, v28.8h, v2.8h\n"
+    "tbz x8, #2, 41f\n"
+    "ld1 { v31.s }[0], [x22], #0x4\n"
+    "tbz x8, #1, 40f\n"
+    "ld1 { v31.h }[2], [x22], #0x2\n"
+    "tbz x8, #0, 43f\n"
+    "ld1 { v31.b }[6], [x22]\n"
+    "b 43f\n"
+    "40:"  // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 43f\n"
+    "ld1 { v31.b }[4], [x22]\n"
+    "b 43f\n"
+    "41:"  // Oddments: Load (2, 0): Bit 2: Unset
+    "tbz x8, #1, 42f\n"
+    "ld1 { v31.h }[0], [x22], #0x2\n"
+    "tbz x8, #0, 43f\n"
+    "ld1 { v31.b }[2], [x22]\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 43f\n"
+    "ld1 { v31.b }[0], [x22]\n"
+    "43:"  // Oddments: Load (2, 0): Bit 2: End
+    "usubl v31.8h, v31.8b, v21.8b\n"
+    "ldr x21, [x14, #0x68]\n"
+    "smlal v11.4s, v31.4h, v6.4h\n"
+    "add x21, x21, x17\n"
+    "smlal2 v26.4s, v31.8h, v6.8h\n"
+    "smlal v12.4s, v31.4h, v3.4h\n"
+    "smlal2 v22.4s, v31.8h, v3.8h\n"
+    "tbz x8, #2, 45f\n"
+    "ld1 { v30.s }[0], [x21], #0x4\n"
+    "tbz x8, #1, 44f\n"
+    "ld1 { v30.h }[2], [x21], #0x2\n"
+    "tbz x8, #0, 47f\n"
+    "ld1 { v30.b }[6], [x21]\n"
+    "b 47f\n"
+    "44:"  // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 47f\n"
+    "ld1 { v30.b }[4], [x21]\n"
+    "b 47f\n"
+    "45:"  // Oddments: Load (2, 3): Bit 2: Unset
+    "tbz x8, #1, 46f\n"
+    "ld1 { v30.h }[0], [x21], #0x2\n"
+    "tbz x8, #0, 47f\n"
+    "ld1 { v30.b }[2], [x21]\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 47f\n"
+    "ld1 { v30.b }[0], [x21]\n"
+    "47:"  // Oddments: Load (2, 3): Bit 2: End
+    "usubl v30.8h, v30.8b, v21.8b\n"
+    "ldr x20, [x14, #0x70]\n"
+    "smlal v23.4s, v30.4h, v8.4h\n"
+    "add x20, x20, x17\n"
+    "smlal2 v9.4s, v30.8h, v8.8h\n"
+    "smlal v24.4s, v30.4h, v5.4h\n"
+    "smlal2 v10.4s, v30.8h, v5.8h\n"
+    "tbz x8, #2, 49f\n"
+    "ld1 { v29.s }[0], [x20], #0x4\n"
+    "tbz x8, #1, 48f\n"
+    "ld1 { v29.h }[2], [x20], #0x2\n"
+    "tbz x8, #0, 51f\n"
+    "ld1 { v29.b }[6], [x20]\n"
+    "b 51f\n"
+    "48:"  // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 51f\n"
+    "ld1 { v29.b }[4], [x20]\n"
+    "b 51f\n"
+    "49:"  // Oddments: Load (3, 1): Bit 2: Unset
+    "tbz x8, #1, 50f\n"
+    "ld1 { v29.h }[0], [x20], #0x2\n"
+    "tbz x8, #0, 51f\n"
+    "ld1 { v29.b }[2], [x20]\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 51f\n"
+    "ld1 { v29.b }[0], [x20]\n"
+    "51:"  // Oddments: Load (3, 1): Bit 2: End
+    "usubl v29.8h, v29.8b, v21.8b\n"
+    "ldr x19, [x14, #0x78]\n"
+    "smlal v12.4s, v29.4h, v7.4h\n"
+    "add x19, x19, x17\n"
+    "smlal2 v22.4s, v29.8h, v7.8h\n"
+    "smlal v24.4s, v29.4h, v6.4h\n"
+    "smlal2 v10.4s, v29.8h, v6.8h\n"
+    "tbz x8, #2, 53f\n"
+    "ld1 { v28.s }[0], [x19], #0x4\n"
+    "tbz x8, #1, 52f\n"
+    "ld1 { v28.h }[2], [x19], #0x2\n"
+    "tbz x8, #0, 55f\n"
+    "ld1 { v28.b }[6], [x19]\n"
+    "b 55f\n"
+    "52:"  // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+    "tbz x8, #0, 55f\n"
+    "ld1 { v28.b }[4], [x19]\n"
+    "b 55f\n"
+    "53:"  // Oddments: Load (3, 2): Bit 2: Unset
+    "tbz x8, #1, 54f\n"
+    "ld1 { v28.h }[0], [x19], #0x2\n"
+    "tbz x8, #0, 55f\n"
+    "ld1 { v28.b }[2], [x19]\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 55f\n"
+    "ld1 { v28.b }[0], [x19]\n"
+    "55:"  // Oddments: Load (3, 2): Bit 2: End
+    "usubl v28.8h, v28.8b, v21.8b\n"
+    "smlal v12.4s, v28.4h, v8.4h\n"
+    "smlal2 v22.4s, v28.8h, v8.8h\n"
+    "smlal v24.4s, v28.4h, v7.4h\n"
+    "smlal2 v10.4s, v28.8h, v7.8h\n"
+    "tbz x8, #2, 57f\n"
+    "ld1 { v25.4s }, [x13], #0x10\n"
+    "ld1 { v18.4s }, [x11], #0x10\n"
+    "tbz x8, #1, 56f\n"
+    "ld1 { v16.d }[0], [x13], #0x8\n"
+    "ld1 { v20.d }[0], [x11], #0x8\n"
+    "tbz x8, #0, 59f\n"
+    "ld1 { v16.s }[2], [x13]\n"
+    "ld1 { v20.s }[2], [x11]\n"
+    "b 59f\n"
+    "56:"  // Oddments: Load requant params: Bit 2: Bit 1: Unset
+    "tbz x8, #0, 59f\n"
+    "ld1 { v16.s }[0], [x13]\n"
+    "ld1 { v20.s }[0], [x11]\n"
+    "b 59f\n"
+    "57:"  // Oddments: Load requant params: Bit 2: Unset
+    "tbz x8, #1, 58f\n"
+    "ld1 { v25.d }[0], [x13], #0x8\n"
+    "ld1 { v18.d }[0], [x11], #0x8\n"
+    "tbz x8, #0, 59f\n"
+    "ld1 { v25.s }[2], [x13]\n"
+    "ld1 { v18.s }[2], [x11]\n"
+    "b 59f\n"
+    "58:"  // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 59f\n"
+    "ld1 { v25.s }[0], [x13]\n"
+    "ld1 { v18.s }[0], [x11]\n"
+    "59:"  // Oddments: Load requant params: Bit 2: End
+    "sqrdmulh v11.4s, v11.4s, v25.4s\n"
+    "add x10, x10, x15\n"
+    "sqrdmulh v26.4s, v26.4s, v16.4s\n"
+    "add x9, x9, x15\n"
+    "sqrdmulh v23.4s, v23.4s, v25.4s\n"
+    "add x28, x28, x15\n"
+    "sqrdmulh v9.4s, v9.4s, v16.4s\n"
+    "add x27, x27, x15\n"
+    "sqrdmulh v12.4s, v12.4s, v25.4s\n"
+    "and v19.16b, v11.16b, v18.16b\n"
+    "and v5.16b, v26.16b, v20.16b\n"
+    "and v28.16b, v23.16b, v18.16b\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v5.4s, v5.4s, #0x1f\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "sqadd v11.4s, v11.4s, v19.4s\n"
+    "sqadd v26.4s, v26.4s, v5.4s\n"
+    "sqadd v23.4s, v23.4s, v28.4s\n"
+    "and v8.16b, v9.16b, v20.16b\n"
+    "srshl v11.4s, v11.4s, v18.4s\n"
+    "srshl v26.4s, v26.4s, v20.4s\n"
+    "srshl v23.4s, v23.4s, v18.4s\n"
+    "sshr v8.4s, v8.4s, #0x1f\n"
+    "add v11.4s, v11.4s, v13.4s\n"
+    "add v26.4s, v26.4s, v13.4s\n"
+    "add v23.4s, v23.4s, v13.4s\n"
+    "smin v11.4s, v11.4s, v14.4s\n"
+    "smin v26.4s, v26.4s, v14.4s\n"
+    "smin v23.4s, v23.4s, v14.4s\n"
+    "smax v11.4s, v11.4s, v15.4s\n"
+    "smax v26.4s, v26.4s, v15.4s\n"
+    "smax v23.4s, v23.4s, v15.4s\n"
+    "sqadd v9.4s, v9.4s, v8.4s\n"
+    "uzp1 v11.16b, v11.16b, v26.16b\n"
+    "and v1.16b, v12.16b, v18.16b\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "srshl v9.4s, v9.4s, v20.4s\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "sqrdmulh v22.4s, v22.4s, v16.4s\n"
+    "sqrdmulh v24.4s, v24.4s, v25.4s\n"
+    "add v9.4s, v9.4s, v13.4s\n"
+    "sqadd v12.4s, v12.4s, v1.4s\n"
+    "and v0.16b, v22.16b, v20.16b\n"
+    "smin v9.4s, v9.4s, v14.4s\n"
+    "and v26.16b, v24.16b, v18.16b\n"
+    "srshl v12.4s, v12.4s, v18.4s\n"
+    "smax v9.4s, v9.4s, v15.4s\n"
+    "sshr v0.4s, v0.4s, #0x1f\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "uzp1 v23.16b, v23.16b, v9.16b\n"
+    "add v12.4s, v12.4s, v13.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v22.4s, v22.4s, v0.4s\n"
+    "smin v12.4s, v12.4s, v14.4s\n"
+    "sqadd v24.4s, v24.4s, v26.4s\n"
+    "sqrdmulh v10.4s, v10.4s, v16.4s\n"
+    "smax v12.4s, v12.4s, v15.4s\n"
+    "srshl v22.4s, v22.4s, v20.4s\n"
+    "srshl v24.4s, v24.4s, v18.4s\n"
+    "and v16.16b, v10.16b, v20.16b\n"
+    "add v22.4s, v22.4s, v13.4s\n"
+    "add v24.4s, v24.4s, v13.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v22.4s, v22.4s, v14.4s\n"
+    "smin v24.4s, v24.4s, v14.4s\n"
+    "sqadd v10.4s, v10.4s, v16.4s\n"
+    "smax v22.4s, v22.4s, v15.4s\n"
+    "smax v24.4s, v24.4s, v15.4s\n"
+    "srshl v10.4s, v10.4s, v20.4s\n"
+    "uzp1 v12.16b, v12.16b, v22.16b\n"
+    "uzp1 v12.16b, v12.16b, v12.16b\n"
+    "add v10.4s, v10.4s, v13.4s\n"
+    "smin v10.4s, v10.4s, v14.4s\n"
+    "smax v10.4s, v10.4s, v15.4s\n"
+    "uzp1 v24.16b, v24.16b, v10.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "tbz x8, #2, 61f\n"
+    "st1 { v11.s }[0], [x10], #0x4\n"
+    "st1 { v23.s }[0], [x9], #0x4\n"
+    "st1 { v12.s }[0], [x28], #0x4\n"
+    "st1 { v24.s }[0], [x27], #0x4\n"
+    "tbz x8, #1, 60f\n"
+    "st1 { v11.h }[2], [x10], #0x2\n"
+    "st1 { v23.h }[2], [x9], #0x2\n"
+    "st1 { v12.h }[2], [x28], #0x2\n"
+    "st1 { v24.h }[2], [x27], #0x2\n"
+    "tbz x8, #0, 63f\n"
+    "st1 { v11.b }[6], [x10], #0x1\n"
+    "st1 { v23.b }[6], [x9], #0x1\n"
+    "st1 { v12.b }[6], [x28], #0x1\n"
+    "st1 { v24.b }[6], [x27], #0x1\n"
+    "b 63f\n"
+    "60:"  // Oddments: Bit 2: Bit 1: Unset
+    "tbz x8, #0, 63f\n"
+    "st1 { v11.b }[4], [x10], #0x1\n"
+    "st1 { v23.b }[4], [x9], #0x1\n"
+    "st1 { v12.b }[4], [x28], #0x1\n"
+    "st1 { v24.b }[4], [x27], #0x1\n"
+    "b 63f\n"
+    "61:"  // Oddments: Bit 2: Unset
+    "tbz x8, #1, 62f\n"
+    "st1 { v11.h }[0], [x10], #0x2\n"
+    "st1 { v23.h }[0], [x9], #0x2\n"
+    "st1 { v12.h }[0], [x28], #0x2\n"
+    "st1 { v24.h }[0], [x27], #0x2\n"
+    "tbz x8, #0, 63f\n"
+    "st1 { v11.b }[2], [x10], #0x1\n"
+    "st1 { v23.b }[2], [x9], #0x1\n"
+    "st1 { v12.b }[2], [x28], #0x1\n"
+    "st1 { v24.b }[2], [x27], #0x1\n"
+    "b 63f\n"
+    "62:"  // Oddments: Bit 2: Unset: Bit 1: Unset
+    "tbz x8, #0, 63f\n"
+    "st1 { v11.b }[0], [x10], #0x1\n"
+    "st1 { v23.b }[0], [x9], #0x1\n"
+    "st1 { v12.b }[0], [x28], #0x1\n"
+    "st1 { v24.b }[0], [x27], #0x1\n"
+    "63:"  // Oddments: Bit 2: End
+
+    "64:"  // End
+
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..77861e9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef int8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
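+  // For stride 2 the input tile spans (output - 1) * stride + kernel
+  // points per dimension ((2 - 1) * 2 + 3 = 5).
+  static_assert(input_rows == (output_rows - 1) * stride_rows + kernel_rows, "inconsistent input tile height");
+  static_assert(input_cols == (output_cols - 1) * stride_cols + kernel_cols, "inconsistent input tile width");
+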
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+  a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..4e1586b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1423 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    unsigned long n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[25];
+
+    Params(
+      unsigned long n_channels,
+      const uint8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
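+      // Rearrange the caller's row-major 5x5 patch of input pointers into
+      // the order the assembly consumes them: the patch centre (raw index
+      // 12) is needed first, against the (2, 2) weight.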
+      inptrs[0] = inptrs_raw[12];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[1];
+      inptrs[3] = inptrs_raw[3];
+      inptrs[4] = inptrs_raw[4];
+      inptrs[5] = inptrs_raw[5];
+      inptrs[6] = inptrs_raw[6];
+      inptrs[7] = inptrs_raw[2];
+      inptrs[8] = inptrs_raw[8];
+      inptrs[9] = inptrs_raw[9];
+      inptrs[10] = inptrs_raw[7];
+      inptrs[11] = inptrs_raw[15];
+      inptrs[12] = inptrs_raw[10];
+      inptrs[13] = inptrs_raw[16];
+      inptrs[14] = inptrs_raw[11];
+      inptrs[15] = inptrs_raw[18];
+      inptrs[16] = inptrs_raw[13];
+      inptrs[17] = inptrs_raw[19];
+      inptrs[18] = inptrs_raw[20];
+      inptrs[19] = inptrs_raw[14];
+      inptrs[20] = inptrs_raw[21];
+      inptrs[21] = inptrs_raw[17];
+      inptrs[22] = inptrs_raw[23];
+      inptrs[23] = inptrs_raw[22];
+      inptrs[24] = inptrs_raw[24];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
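+  // As in the stride-1 variant, channels are processed 8 at a time
+  // (x17 = n_channels >> 3) with an "Oddments" tail driven by tbz bit
+  // tests on n_channels; requantization follows the same
+  // sqrdmulh/srshl/offset/clamp sequence sketched there.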
+  __asm__ __volatile__(
+    "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "mov x4, #0x0\n"
+    "ldr x5, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x6, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "add x7, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "lsr x17, x3, #0x3\n"
+    "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v22.16b }, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v12.16b }, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1r { v14.4s }, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v16.4s }, [x20]\n"
+    "ld1r { v15.4s }, [x19]\n"
+    "ldp x15, x14, [x21, #0x0]\n"
+    "ldp x13, x12, [x21, #0x10]\n"
+    "cbz x17, 3f\n"
+    "subs x17, x17, #0x1\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q13, [x19, #0x0]\n"
+    "mov v19.16b, v13.16b\n"
+    "ldr q10, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "mov v11.16b, v13.16b\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v18.16b, v13.16b\n"
+    "ldr d0, [x5, #0x0]\n"
+    "ldr d1, [x5, #0x8]\n"
+    "mov v20.16b, v10.16b\n"
+    "ldr d2, [x5, #0x10]\n"
+    "mov v17.16b, v10.16b\n"
+    "ldr d3, [x5, #0x18]\n"
+    "mov v21.16b, v10.16b\n"
+    "ldr d4, [x5, #0x20]\n"
+    "ssubl v0.8h, v0.8b, v12.8b\n"
+    "ldr d5, [x5, #0x28]\n"
+    "ssubl v1.8h, v1.8b, v12.8b\n"
+    "ldr d6, [x5, #0x30]\n"
+    "ssubl v2.8h, v2.8b, v12.8b\n"
+    "ldr d7, [x5, #0x38]\n"
+    "ssubl v3.8h, v3.8b, v12.8b\n"
+    "ldr d8, [x5, #0x40]\n"
+    "ssubl v4.8h, v4.8b, v12.8b\n"
+    "ldp x26, x25, [x7, #0x0]\n"
+    "ssubl v5.8h, v5.8b, v12.8b\n"
+    "ldp x24, x23, [x7, #0x10]\n"
+    "ssubl v6.8h, v6.8b, v12.8b\n"
+    "ssubl v7.8h, v7.8b, v12.8b\n"
+    "ldp x22, x21, [x7, #0x20]\n"
+    "ssubl v8.8h, v8.8b, v12.8b\n"
+    "ldp x20, x19, [x7, #0x30]\n"
+    "ldr d31, [x26, x4]\n"
+    "usubl v31.8h, v31.8b, v22.8b\n"
+    "ldr d30, [x25, x4]\n"
+    "ldr d29, [x24, x4]\n"
+    "usubl v30.8h, v30.8b, v22.8b\n"
+    "ldr d28, [x23, x4]\n"
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "ldr d27, [x22, x4]\n"
+    "ldr d26, [x21, x4]\n"
+    "usubl v28.8h, v28.8b, v22.8b\n"
+    "ldr d25, [x20, x4]\n"
+    "ldr d24, [x19, x4]\n"
+    "usubl v27.8h, v27.8b, v22.8b\n"
+    "usubl v26.8h, v26.8b, v22.8b\n"
+    "usubl v25.8h, v25.8b, v22.8b\n"
+    "usubl v24.8h, v24.8b, v22.8b\n"
+    "beq 2f\n"
+    "1:"  // Loop
+    "smlal v13.4s, v31.4h, v8.4h\n"
+    "ldr x22, [x7, #0x40]\n"
+    "add x5, x5, #0x48\n"
+    "smlal2 v10.4s, v31.8h, v8.8h\n"
+    "ldr x21, [x7, #0x48]\n"
+    "subs x17, x17, #0x1\n"
+    "smlal v19.4s, v31.4h, v6.4h\n"
+    "ldr x20, [x7, #0x50]\n"
+    "smlal2 v20.4s, v31.8h, v6.8h\n"
+    "ldr x19, [x7, #0x58]\n"
+    "smlal v11.4s, v31.4h, v2.4h\n"
+    "ldr x11, [x7, #0x60]\n"
+    "smlal2 v17.4s, v31.8h, v2.8h\n"
+    "ldr x10, [x7, #0x68]\n"
+    "smlal v18.4s, v31.4h, v0.4h\n"
+    "ldr x9, [x7, #0x70]\n"
+    "smlal2 v21.4s, v31.8h, v0.8h\n"
+    "ldr x28, [x7, #0x78]\n"
+    "smlal v13.4s, v30.4h, v0.4h\n"
+    "ldr x27, [x7, #0x80]\n"
+    "smlal2 v10.4s, v30.8h, v0.8h\n"
+    "ldr x26, [x7, #0x88]\n"
+    "smlal v19.4s, v28.4h, v1.4h\n"
+    "ldr x25, [x7, #0x90]\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x21, x4]\n"
+    "smlal v13.4s, v29.4h, v1.4h\n"
+    "ldr x24, [x7, #0x98]\n"
+    "smlal2 v10.4s, v29.8h, v1.8h\n"
+    "ldr d29, [x22, x4]\n"
+    "smlal v19.4s, v27.4h, v2.4h\n"
+    "ldr x23, [x7, #0xa0]\n"
+    "smlal2 v20.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x20, x4]\n"
+    "smlal v13.4s, v26.4h, v3.4h\n"
+    "ldr x22, [x7, #0xa8]\n"
+    "smlal2 v10.4s, v26.8h, v3.8h\n"
+    "ldr d26, [x19, x4]\n"
+    "smlal v19.4s, v24.4h, v0.4h\n"
+    "ldr x21, [x7, #0xb0]\n"
+    "smlal2 v20.4s, v24.8h, v0.8h\n"
+    "ldr x20, [x7, #0xb8]\n"
+    "smlal v13.4s, v25.4h, v4.4h\n"
+    "ldr x19, [x7, #0xc0]\n"
+    "smlal2 v10.4s, v25.8h, v4.8h\n"
+    "ldr d25, [x11, x4]\n"
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "ldr q31, [x8, #0x0]\n"
+    "usubl v28.8h, v28.8b, v22.8b\n"
+    "ldr q30, [x16, #0x0]\n"
+    "smlal v13.4s, v24.4h, v2.4h\n"
+    "ldr q23, [x8, #0x10]\n"
+    "add x8, x8, #0x20\n"
+    "smlal2 v10.4s, v24.8h, v2.8h\n"
+    "ldr d24, [x9, x4]\n"
+    "smlal v19.4s, v29.4h, v4.4h\n"
+    "ldr q9, [x16, #0x10]\n"
+    "add x16, x16, #0x20\n"
+    "smlal2 v20.4s, v29.8h, v4.8h\n"
+    "ldr d29, [x10, x4]\n"
+    "usubl v27.8h, v27.8b, v22.8b\n"
+    "usubl v26.8h, v26.8b, v22.8b\n"
+    "smlal v19.4s, v28.4h, v5.4h\n"
+    "smlal v13.4s, v27.4h, v5.4h\n"
+    "smlal2 v20.4s, v28.8h, v5.8h\n"
+    "ldr d28, [x27, x4]\n"
+    "smlal2 v10.4s, v27.8h, v5.8h\n"
+    "smlal v19.4s, v27.4h, v3.4h\n"
+    "smlal v11.4s, v26.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "ldr d27, [x28, x4]\n"
+    "smlal2 v17.4s, v26.8h, v3.8h\n"
+    "ldr d26, [x26, x4]\n"
+    "usubl v25.8h, v25.8b, v22.8b\n"
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "usubl v24.8h, v24.8b, v22.8b\n"
+    "smlal v13.4s, v25.4h, v6.4h\n"
+    "smlal2 v10.4s, v25.8h, v6.8h\n"
+    "smlal v11.4s, v25.4h, v0.4h\n"
+    "smlal2 v17.4s, v25.8h, v0.8h\n"
+    "ldr d25, [x25, x4]\n"
+    "smlal v13.4s, v24.4h, v7.4h\n"
+    "smlal2 v10.4s, v24.8h, v7.8h\n"
+    "smlal v11.4s, v29.4h, v4.4h\n"
+    "smlal2 v17.4s, v29.8h, v4.8h\n"
+    "ldr d29, [x24, x4]\n"
+    "usubl v27.8h, v27.8b, v22.8b\n"
+    "usubl v28.8h, v28.8b, v22.8b\n"
+    "smlal v11.4s, v24.4h, v1.4h\n"
+    "smlal2 v17.4s, v24.8h, v1.8h\n"
+    "ldr d24, [x22, x4]\n"
+    "smlal v18.4s, v27.4h, v4.4h\n"
+    "smlal2 v21.4s, v27.8h, v4.8h\n"
+    "ldr d27, [x23, x4]\n"
+    "smlal v19.4s, v28.4h, v7.4h\n"
+    "smlal2 v20.4s, v28.8h, v7.8h\n"
+    "smlal v18.4s, v28.4h, v1.4h\n"
+    "smlal2 v21.4s, v28.8h, v1.8h\n"
+    "usubl v26.8h, v26.8b, v22.8b\n"
+    "usubl v25.8h, v25.8b, v22.8b\n"
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "smlal v18.4s, v26.4h, v5.4h\n"
+    "smlal2 v21.4s, v26.8h, v5.8h\n"
+    "ldr d26, [x21, x4]\n"
+    "smlal v11.4s, v25.4h, v6.4h\n"
+    "smlal2 v17.4s, v25.8h, v6.8h\n"
+    "ldr d25, [x20, x4]\n"
+    "smlal v19.4s, v29.4h, v8.4h\n"
+    "smlal2 v20.4s, v29.8h, v8.8h\n"
+    "smlal v18.4s, v29.4h, v2.4h\n"
+    "smlal2 v21.4s, v29.8h, v2.8h\n"
+    "ldr d29, [x19, x4]\n"
+    "add x4, x4, #0x8\n"
+    "usubl v27.8h, v27.8b, v22.8b\n"
+    "usubl v24.8h, v24.8b, v22.8b\n"
+    "usubl v26.8h, v26.8b, v22.8b\n"
+    "usubl v25.8h, v25.8b, v22.8b\n"
+    "smlal v11.4s, v27.4h, v7.4h\n"
+    "smlal2 v17.4s, v27.8h, v7.8h\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "smlal2 v21.4s, v24.8h, v3.8h\n"
+    "smlal v11.4s, v24.4h, v5.4h\n"
+    "smlal2 v17.4s, v24.8h, v5.8h\n"
+    "smlal v18.4s, v26.4h, v7.4h\n"
+    "smlal2 v21.4s, v26.8h, v7.8h\n"
+    "smlal v11.4s, v25.4h, v8.4h\n"
+    "smlal2 v17.4s, v25.8h, v8.8h\n"
+    "smlal v18.4s, v25.4h, v6.4h\n"
+    "smlal2 v21.4s, v25.8h, v6.8h\n"
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+    "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+    "smlal v18.4s, v29.4h, v8.4h\n"
+    "smlal2 v21.4s, v29.8h, v8.8h\n"
+    "and v27.16b, v13.16b, v30.16b\n"
+    "and v7.16b, v10.16b, v9.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v31.4s\n"
+    "sshr v27.4s, v27.4s, #0x1f\n"
+    "sshr v7.4s, v7.4s, #0x1f\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "sqadd v13.4s, v13.4s, v27.4s\n"
+    "sqadd v10.4s, v10.4s, v7.4s\n"
+    "and v6.16b, v19.16b, v30.16b\n"
+    "and v3.16b, v20.16b, v9.16b\n"
+    "srshl v13.4s, v13.4s, v30.4s\n"
+    "srshl v10.4s, v10.4s, v9.4s\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "add v13.4s, v13.4s, v14.4s\n"
+    "add v10.4s, v10.4s, v14.4s\n"
+    "sqadd v19.4s, v19.4s, v6.4s\n"
+    "smin v13.4s, v13.4s, v15.4s\n"
+    "smin v10.4s, v10.4s, v15.4s\n"
+    "sqadd v20.4s, v20.4s, v3.4s\n"
+    "smax v13.4s, v13.4s, v16.4s\n"
+    "smax v10.4s, v10.4s, v16.4s\n"
+    "srshl v19.4s, v19.4s, v30.4s\n"
+    "srshl v20.4s, v20.4s, v9.4s\n"
+    "uzp1 v13.16b, v13.16b, v10.16b\n"
+    "sqrdmulh v11.4s, v11.4s, v31.4s\n"
+    "uzp1 v13.16b, v13.16b, v13.16b\n"
+    "str d13, [x15, x6]\n"
+    "add v19.4s, v19.4s, v14.4s\n"
+    "add v20.4s, v20.4s, v14.4s\n"
+    "and v28.16b, v11.16b, v30.16b\n"
+    "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+    "smin v19.4s, v19.4s, v15.4s\n"
+    "smin v20.4s, v20.4s, v15.4s\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "smax v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v16.4s\n"
+    "sqadd v11.4s, v11.4s, v28.4s\n"
+    "and v26.16b, v17.16b, v9.16b\n"
+    "uzp1 v19.16b, v19.16b, v20.16b\n"
+    "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str d19, [x14, x6]\n"
+    "srshl v11.4s, v11.4s, v30.4s\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "and v8.16b, v18.16b, v30.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+    "sqadd v17.4s, v17.4s, v26.4s\n"
+    "add v11.4s, v11.4s, v14.4s\n"
+    "sshr v8.4s, v8.4s, #0x1f\n"
+    "and v27.16b, v21.16b, v9.16b\n"
+    "smin v11.4s, v11.4s, v15.4s\n"
+    "srshl v17.4s, v17.4s, v9.4s\n"
+    "sqadd v18.4s, v18.4s, v8.4s\n"
+    "smax v11.4s, v11.4s, v16.4s\n"
+    "sshr v27.4s, v27.4s, #0x1f\n"
+    "add v17.4s, v17.4s, v14.4s\n"
+    "srshl v18.4s, v18.4s, v30.4s\n"
+    "sqadd v21.4s, v21.4s, v27.4s\n"
+    "smin v17.4s, v17.4s, v15.4s\n"
+    "add v18.4s, v18.4s, v14.4s\n"
+    "smax v17.4s, v17.4s, v16.4s\n"
+    "srshl v21.4s, v21.4s, v9.4s\n"
+    "smin v18.4s, v18.4s, v15.4s\n"
+    "uzp1 v11.16b, v11.16b, v17.16b\n"
+    "add v21.4s, v21.4s, v14.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "str d11, [x13, x6]\n"
+    "smax v18.4s, v18.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v15.4s\n"
+    "smax v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v18.16b, v18.16b, v21.16b\n"
+    "uzp1 v18.16b, v18.16b, v18.16b\n"
+    "str d18, [x12, x6]\n"
+    "add x6, x6, #0x8\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q13, [x19, #0x0]\n"
+    "mov v19.16b, v13.16b\n"
+    "ldr q10, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "mov v11.16b, v13.16b\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v18.16b, v13.16b\n"
+    "ldr d0, [x5, #0x0]\n"
+    "ldr d1, [x5, #0x8]\n"
+    "mov v20.16b, v10.16b\n"
+    "ldr d2, [x5, #0x10]\n"
+    "mov v17.16b, v10.16b\n"
+    "ldr d3, [x5, #0x18]\n"
+    "mov v21.16b, v10.16b\n"
+    "ldr d4, [x5, #0x20]\n"
+    "ssubl v0.8h, v0.8b, v12.8b\n"
+    "ldr d5, [x5, #0x28]\n"
+    "ssubl v1.8h, v1.8b, v12.8b\n"
+    "ldr d6, [x5, #0x30]\n"
+    "ssubl v2.8h, v2.8b, v12.8b\n"
+    "ldr d7, [x5, #0x38]\n"
+    "ssubl v3.8h, v3.8b, v12.8b\n"
+    "ldr d8, [x5, #0x40]\n"
+    "ssubl v4.8h, v4.8b, v12.8b\n"
+    "ldp x26, x25, [x7, #0x0]\n"
+    "ssubl v5.8h, v5.8b, v12.8b\n"
+    "ldp x24, x23, [x7, #0x10]\n"
+    "ssubl v6.8h, v6.8b, v12.8b\n"
+    "ssubl v7.8h, v7.8b, v12.8b\n"
+    "ldp x22, x21, [x7, #0x20]\n"
+    "ssubl v8.8h, v8.8b, v12.8b\n"
+    "ldp x20, x19, [x7, #0x30]\n"
+    "ldr d31, [x26, x4]\n"
+    "usubl v31.8h, v31.8b, v22.8b\n"
+    "ldr d30, [x25, x4]\n"
+    "ldr d29, [x24, x4]\n"
+    "usubl v30.8h, v30.8b, v22.8b\n"
+    "ldr d28, [x23, x4]\n"
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "ldr d27, [x22, x4]\n"
+    "ldr d26, [x21, x4]\n"
+    "usubl v28.8h, v28.8b, v22.8b\n"
+    "ldr d25, [x20, x4]\n"
+    "ldr d24, [x19, x4]\n"
+    "usubl v27.8h, v27.8b, v22.8b\n"
+    "usubl v26.8h, v26.8b, v22.8b\n"
+    "usubl v25.8h, v25.8b, v22.8b\n"
+    "usubl v24.8h, v24.8b, v22.8b\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    "smlal v13.4s, v31.4h, v8.4h\n"
+    "ldr x22, [x7, #0x40]\n"
+    "tst x3, #0x7\n"
+    "smlal2 v10.4s, v31.8h, v8.8h\n"
+    "ldr x21, [x7, #0x48]\n"
+    "smlal v19.4s, v31.4h, v6.4h\n"
+    "ldr x20, [x7, #0x50]\n"
+    "smlal2 v20.4s, v31.8h, v6.8h\n"
+    "ldr x19, [x7, #0x58]\n"
+    "smlal v11.4s, v31.4h, v2.4h\n"
+    "ldr x11, [x7, #0x60]\n"
+    "smlal2 v17.4s, v31.8h, v2.8h\n"
+    "ldr x10, [x7, #0x68]\n"
+    "smlal v18.4s, v31.4h, v0.4h\n"
+    "ldr x9, [x7, #0x70]\n"
+    "smlal2 v21.4s, v31.8h, v0.8h\n"
+    "ldr x28, [x7, #0x78]\n"
+    "smlal v13.4s, v30.4h, v0.4h\n"
+    "ldr x27, [x7, #0x80]\n"
+    "smlal2 v10.4s, v30.8h, v0.8h\n"
+    "ldr x26, [x7, #0x88]\n"
+    "smlal v19.4s, v28.4h, v1.4h\n"
+    "ldr x25, [x7, #0x90]\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x21, x4]\n"
+    "smlal v13.4s, v29.4h, v1.4h\n"
+    "ldr x24, [x7, #0x98]\n"
+    "smlal2 v10.4s, v29.8h, v1.8h\n"
+    "ldr d29, [x22, x4]\n"
+    "smlal v19.4s, v27.4h, v2.4h\n"
+    "ldr x23, [x7, #0xa0]\n"
+    "smlal2 v20.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x20, x4]\n"
+    "smlal v13.4s, v26.4h, v3.4h\n"
+    "ldr x22, [x7, #0xa8]\n"
+    "smlal2 v10.4s, v26.8h, v3.8h\n"
+    "ldr d26, [x19, x4]\n"
+    "smlal v19.4s, v24.4h, v0.4h\n"
+    "ldr x21, [x7, #0xb0]\n"
+    "smlal2 v20.4s, v24.8h, v0.8h\n"
+    "ldr x20, [x7, #0xb8]\n"
+    "smlal v13.4s, v25.4h, v4.4h\n"
+    "ldr x19, [x7, #0xc0]\n"
+    "smlal2 v10.4s, v25.8h, v4.8h\n"
+    "ldr d25, [x11, x4]\n"
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "ldr q31, [x8, #0x0]\n"
+    "usubl v28.8h, v28.8b, v22.8b\n"
+    "ldr q30, [x16, #0x0]\n"
+    "smlal v13.4s, v24.4h, v2.4h\n"
+    "ldr q23, [x8, #0x10]\n"
+    "add x8, x8, #0x20\n"
+    "smlal2 v10.4s, v24.8h, v2.8h\n"
+    "ldr d24, [x9, x4]\n"
+    "smlal v19.4s, v29.4h, v4.4h\n"
+    "ldr q9, [x16, #0x10]\n"
+    "add x16, x16, #0x20\n"
+    "smlal2 v20.4s, v29.8h, v4.8h\n"
+    "ldr d29, [x10, x4]\n"
+    "usubl v27.8h, v27.8b, v22.8b\n"
+    "usubl v26.8h, v26.8b, v22.8b\n"
+    "smlal v19.4s, v28.4h, v5.4h\n"
+    "smlal v13.4s, v27.4h, v5.4h\n"
+    "smlal2 v20.4s, v28.8h, v5.8h\n"
+    "ldr d28, [x27, x4]\n"
+    "smlal2 v10.4s, v27.8h, v5.8h\n"
+    "smlal v19.4s, v27.4h, v3.4h\n"
+    "smlal v11.4s, v26.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "ldr d27, [x28, x4]\n"
+    "smlal2 v17.4s, v26.8h, v3.8h\n"
+    "ldr d26, [x26, x4]\n"
+    "usubl v25.8h, v25.8b, v22.8b\n"
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "usubl v24.8h, v24.8b, v22.8b\n"
+    "smlal v13.4s, v25.4h, v6.4h\n"
+    "smlal2 v10.4s, v25.8h, v6.8h\n"
+    "smlal v11.4s, v25.4h, v0.4h\n"
+    "smlal2 v17.4s, v25.8h, v0.8h\n"
+    "ldr d25, [x25, x4]\n"
+    "smlal v13.4s, v24.4h, v7.4h\n"
+    "smlal2 v10.4s, v24.8h, v7.8h\n"
+    "smlal v11.4s, v29.4h, v4.4h\n"
+    "smlal2 v17.4s, v29.8h, v4.8h\n"
+    "ldr d29, [x24, x4]\n"
+    "usubl v27.8h, v27.8b, v22.8b\n"
+    "usubl v28.8h, v28.8b, v22.8b\n"
+    "smlal v11.4s, v24.4h, v1.4h\n"
+    "smlal2 v17.4s, v24.8h, v1.8h\n"
+    "ldr d24, [x22, x4]\n"
+    "smlal v18.4s, v27.4h, v4.4h\n"
+    "smlal2 v21.4s, v27.8h, v4.8h\n"
+    "ldr d27, [x23, x4]\n"
+    "smlal v19.4s, v28.4h, v7.4h\n"
+    "smlal2 v20.4s, v28.8h, v7.8h\n"
+    "smlal v18.4s, v28.4h, v1.4h\n"
+    "smlal2 v21.4s, v28.8h, v1.8h\n"
+    "usubl v26.8h, v26.8b, v22.8b\n"
+    "usubl v25.8h, v25.8b, v22.8b\n"
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "smlal v18.4s, v26.4h, v5.4h\n"
+    "smlal2 v21.4s, v26.8h, v5.8h\n"
+    "ldr d26, [x21, x4]\n"
+    "smlal v11.4s, v25.4h, v6.4h\n"
+    "smlal2 v17.4s, v25.8h, v6.8h\n"
+    "ldr d25, [x20, x4]\n"
+    "smlal v19.4s, v29.4h, v8.4h\n"
+    "smlal2 v20.4s, v29.8h, v8.8h\n"
+    "smlal v18.4s, v29.4h, v2.4h\n"
+    "smlal2 v21.4s, v29.8h, v2.8h\n"
+    "ldr d29, [x19, x4]\n"
+    "add x4, x4, #0x8\n"
+    "usubl v27.8h, v27.8b, v22.8b\n"
+    "usubl v24.8h, v24.8b, v22.8b\n"
+    "usubl v26.8h, v26.8b, v22.8b\n"
+    "usubl v25.8h, v25.8b, v22.8b\n"
+    "smlal v11.4s, v27.4h, v7.4h\n"
+    "smlal2 v17.4s, v27.8h, v7.8h\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "smlal2 v21.4s, v24.8h, v3.8h\n"
+    "smlal v11.4s, v24.4h, v5.4h\n"
+    "smlal2 v17.4s, v24.8h, v5.8h\n"
+    "smlal v18.4s, v26.4h, v7.4h\n"
+    "smlal2 v21.4s, v26.8h, v7.8h\n"
+    "smlal v11.4s, v25.4h, v8.4h\n"
+    "smlal2 v17.4s, v25.8h, v8.8h\n"
+    "smlal v18.4s, v25.4h, v6.4h\n"
+    "smlal2 v21.4s, v25.8h, v6.8h\n"
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+    "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+    "smlal v18.4s, v29.4h, v8.4h\n"
+    "smlal2 v21.4s, v29.8h, v8.8h\n"
+    "and v27.16b, v13.16b, v30.16b\n"
+    "and v7.16b, v10.16b, v9.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v31.4s\n"
+    "sshr v27.4s, v27.4s, #0x1f\n"
+    "sshr v7.4s, v7.4s, #0x1f\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "sqadd v13.4s, v13.4s, v27.4s\n"
+    "sqadd v10.4s, v10.4s, v7.4s\n"
+    "and v6.16b, v19.16b, v30.16b\n"
+    "and v3.16b, v20.16b, v9.16b\n"
+    "srshl v13.4s, v13.4s, v30.4s\n"
+    "srshl v10.4s, v10.4s, v9.4s\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "add v13.4s, v13.4s, v14.4s\n"
+    "add v10.4s, v10.4s, v14.4s\n"
+    "sqadd v19.4s, v19.4s, v6.4s\n"
+    "smin v13.4s, v13.4s, v15.4s\n"
+    "smin v10.4s, v10.4s, v15.4s\n"
+    "sqadd v20.4s, v20.4s, v3.4s\n"
+    "smax v13.4s, v13.4s, v16.4s\n"
+    "smax v10.4s, v10.4s, v16.4s\n"
+    "srshl v19.4s, v19.4s, v30.4s\n"
+    "srshl v20.4s, v20.4s, v9.4s\n"
+    "uzp1 v13.16b, v13.16b, v10.16b\n"
+    "sqrdmulh v11.4s, v11.4s, v31.4s\n"
+    "uzp1 v13.16b, v13.16b, v13.16b\n"
+    "str d13, [x15, x6]\n"
+    "add v19.4s, v19.4s, v14.4s\n"
+    "add v20.4s, v20.4s, v14.4s\n"
+    "and v28.16b, v11.16b, v30.16b\n"
+    "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+    "smin v19.4s, v19.4s, v15.4s\n"
+    "smin v20.4s, v20.4s, v15.4s\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "smax v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v16.4s\n"
+    "sqadd v11.4s, v11.4s, v28.4s\n"
+    "and v26.16b, v17.16b, v9.16b\n"
+    "uzp1 v19.16b, v19.16b, v20.16b\n"
+    "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str d19, [x14, x6]\n"
+    "srshl v11.4s, v11.4s, v30.4s\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "and v8.16b, v18.16b, v30.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+    "sqadd v17.4s, v17.4s, v26.4s\n"
+    "add v11.4s, v11.4s, v14.4s\n"
+    "sshr v8.4s, v8.4s, #0x1f\n"
+    "and v27.16b, v21.16b, v9.16b\n"
+    "smin v11.4s, v11.4s, v15.4s\n"
+    "srshl v17.4s, v17.4s, v9.4s\n"
+    "sqadd v18.4s, v18.4s, v8.4s\n"
+    "smax v11.4s, v11.4s, v16.4s\n"
+    "sshr v27.4s, v27.4s, #0x1f\n"
+    "add v17.4s, v17.4s, v14.4s\n"
+    "srshl v18.4s, v18.4s, v30.4s\n"
+    "sqadd v21.4s, v21.4s, v27.4s\n"
+    "smin v17.4s, v17.4s, v15.4s\n"
+    "add v18.4s, v18.4s, v14.4s\n"
+    "smax v17.4s, v17.4s, v16.4s\n"
+    "srshl v21.4s, v21.4s, v9.4s\n"
+    "smin v18.4s, v18.4s, v15.4s\n"
+    "uzp1 v11.16b, v11.16b, v17.16b\n"
+    "add v21.4s, v21.4s, v14.4s\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "str d11, [x13, x6]\n"
+    "smax v18.4s, v18.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v15.4s\n"
+    "smax v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v18.16b, v18.16b, v21.16b\n"
+    "uzp1 v18.16b, v18.16b, v18.16b\n"
+    "str d18, [x12, x6]\n"
+    "add x6, x6, #0x8\n"
+    "beq 88f\n"
+    "add x5, x5, #0x48\n"
+    "3:"  // Oddments
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "tbz x3, #2, 5f\n"
+    "ld1 { v13.4s }, [x19], #0x10\n"
+    "tbz x3, #1, 4f\n"
+    "ld1 { v10.d }[0], [x19], #0x8\n"
+    "tbz x3, #0, 7f\n"
+    "ld1 { v10.s }[2], [x19]\n"
+    "b 7f\n"
+    "4:"  // Oddments: Load bias: Bit 2: Bit 1: Unset
+    "tbz x3, #0, 7f\n"
+    "ld1 { v10.s }[0], [x19]\n"
+    "b 7f\n"
+    "5:"  // Oddments: Load bias: Bit 2: Unset
+    "tbz x3, #1, 6f\n"
+    "ld1 { v13.d }[0], [x19], #0x8\n"
+    "tbz x3, #0, 7f\n"
+    "ld1 { v13.s }[2], [x19]\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 7f\n"
+    "ld1 { v13.s }[0], [x19]\n"
+    "7:"  // Oddments: Load bias: Bit 2: End
+    "mov v19.16b, v13.16b\n"
+    "ldr d0, [x5, #0x0]\n"
+    "mov v20.16b, v10.16b\n"
+    "ldr d1, [x5, #0x8]\n"
+    "mov v11.16b, v13.16b\n"
+    "ldr d2, [x5, #0x10]\n"
+    "mov v17.16b, v10.16b\n"
+    "ldr d3, [x5, #0x18]\n"
+    "mov v18.16b, v13.16b\n"
+    "ldr d4, [x5, #0x20]\n"
+    "mov v21.16b, v10.16b\n"
+    "ldr d5, [x5, #0x28]\n"
+    "ssubl v0.8h, v0.8b, v12.8b\n"
+    "ldr d6, [x5, #0x30]\n"
+    "ssubl v1.8h, v1.8b, v12.8b\n"
+    "ldr d7, [x5, #0x38]\n"
+    "ssubl v2.8h, v2.8b, v12.8b\n"
+    "ldr d8, [x5, #0x40]\n"
+    "ssubl v3.8h, v3.8b, v12.8b\n"
+    "ldp x26, x25, [x7, #0x0]\n"
+    "add x26, x26, x4\n"
+    "ssubl v4.8h, v4.8b, v12.8b\n"
+    "ldp x24, x23, [x7, #0x10]\n"
+    "ssubl v5.8h, v5.8b, v12.8b\n"
+    "ldp x22, x21, [x7, #0x20]\n"
+    "ssubl v6.8h, v6.8b, v12.8b\n"
+    "add x25, x25, x4\n"
+    "ssubl v7.8h, v7.8b, v12.8b\n"
+    "ldp x20, x19, [x7, #0x30]\n"
+    "ssubl v8.8h, v8.8b, v12.8b\n"
+    "add x24, x24, x4\n"
+    "add x23, x23, x4\n"
+    "add x22, x22, x4\n"
+    "add x21, x21, x4\n"
+    "add x20, x20, x4\n"
+    "add x19, x19, x4\n"
+    "tbz x3, #2, 9f\n"
+    "ld1 { v31.s }[0], [x26], #0x4\n"
+    "ld1 { v30.s }[0], [x25], #0x4\n"
+    "ld1 { v29.s }[0], [x24], #0x4\n"
+    "ld1 { v28.s }[0], [x23], #0x4\n"
+    "ld1 { v27.s }[0], [x22], #0x4\n"
+    "ld1 { v26.s }[0], [x21], #0x4\n"
+    "ld1 { v25.s }[0], [x20], #0x4\n"
+    "ld1 { v24.s }[0], [x19], #0x4\n"
+    "tbz x3, #1, 8f\n"
+    "ld1 { v31.h }[2], [x26], #0x2\n"
+    "ld1 { v30.h }[2], [x25], #0x2\n"
+    "ld1 { v29.h }[2], [x24], #0x2\n"
+    "ld1 { v28.h }[2], [x23], #0x2\n"
+    "ld1 { v27.h }[2], [x22], #0x2\n"
+    "ld1 { v26.h }[2], [x21], #0x2\n"
+    "ld1 { v25.h }[2], [x20], #0x2\n"
+    "ld1 { v24.h }[2], [x19], #0x2\n"
+    "tbz x3, #0, 11f\n"
+    "ld1 { v31.b }[6], [x26]\n"
+    "ld1 { v30.b }[6], [x25]\n"
+    "ld1 { v29.b }[6], [x24]\n"
+    "ld1 { v28.b }[6], [x23]\n"
+    "ld1 { v27.b }[6], [x22]\n"
+    "ld1 { v26.b }[6], [x21]\n"
+    "ld1 { v25.b }[6], [x20]\n"
+    "ld1 { v24.b }[6], [x19]\n"
+    "b 11f\n"
+    "8:"  // Oddments: Initial loads: Bit 2: Bit 1: Unset
+    "tbz x3, #0, 11f\n"
+    "ld1 { v31.b }[4], [x26]\n"
+    "ld1 { v30.b }[4], [x25]\n"
+    "ld1 { v29.b }[4], [x24]\n"
+    "ld1 { v28.b }[4], [x23]\n"
+    "ld1 { v27.b }[4], [x22]\n"
+    "ld1 { v26.b }[4], [x21]\n"
+    "ld1 { v25.b }[4], [x20]\n"
+    "ld1 { v24.b }[4], [x19]\n"
+    "b 11f\n"
+    "9:"  // Oddments: Initial loads: Bit 2: Unset
+    "tbz x3, #1, 10f\n"
+    "ld1 { v31.h }[0], [x26], #0x2\n"
+    "ld1 { v30.h }[0], [x25], #0x2\n"
+    "ld1 { v29.h }[0], [x24], #0x2\n"
+    "ld1 { v28.h }[0], [x23], #0x2\n"
+    "ld1 { v27.h }[0], [x22], #0x2\n"
+    "ld1 { v26.h }[0], [x21], #0x2\n"
+    "ld1 { v25.h }[0], [x20], #0x2\n"
+    "ld1 { v24.h }[0], [x19], #0x2\n"
+    "tbz x3, #0, 11f\n"
+    "ld1 { v31.b }[2], [x26]\n"
+    "ld1 { v30.b }[2], [x25]\n"
+    "ld1 { v29.b }[2], [x24]\n"
+    "ld1 { v28.b }[2], [x23]\n"
+    "ld1 { v27.b }[2], [x22]\n"
+    "ld1 { v26.b }[2], [x21]\n"
+    "ld1 { v25.b }[2], [x20]\n"
+    "ld1 { v24.b }[2], [x19]\n"
+    "b 11f\n"
+    "10:"  // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 11f\n"
+    "ld1 { v31.b }[0], [x26]\n"
+    "ld1 { v30.b }[0], [x25]\n"
+    "ld1 { v29.b }[0], [x24]\n"
+    "ld1 { v28.b }[0], [x23]\n"
+    "ld1 { v27.b }[0], [x22]\n"
+    "ld1 { v26.b }[0], [x21]\n"
+    "ld1 { v25.b }[0], [x20]\n"
+    "ld1 { v24.b }[0], [x19]\n"
+    "11:"  // Oddments: Initial loads: Bit 2: End
+    "usubl v31.8h, v31.8b, v22.8b\n"
+    "ldr x22, [x7, #0x40]\n"
+    "add x22, x22, x4\n"
+    "usubl v30.8h, v30.8b, v22.8b\n"
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "usubl v28.8h, v28.8b, v22.8b\n"
+    "usubl v27.8h, v27.8b, v22.8b\n"
+    "usubl v26.8h, v26.8b, v22.8b\n"
+    "usubl v25.8h, v25.8b, v22.8b\n"
+    "usubl v24.8h, v24.8b, v22.8b\n"
+    "smlal v13.4s, v31.4h, v8.4h\n"
+    "smlal2 v10.4s, v31.8h, v8.8h\n"
+    "smlal v19.4s, v31.4h, v6.4h\n"
+    "smlal2 v20.4s, v31.8h, v6.8h\n"
+    "smlal v11.4s, v31.4h, v2.4h\n"
+    "smlal2 v17.4s, v31.8h, v2.8h\n"
+    "smlal v18.4s, v31.4h, v0.4h\n"
+    "smlal2 v21.4s, v31.8h, v0.8h\n"
+    "smlal v13.4s, v30.4h, v0.4h\n"
+    "smlal2 v10.4s, v30.8h, v0.8h\n"
+    "smlal v19.4s, v28.4h, v1.4h\n"
+    "smlal2 v20.4s, v28.8h, v1.8h\n"
+    "smlal v13.4s, v29.4h, v1.4h\n"
+    "smlal2 v10.4s, v29.8h, v1.8h\n"
+    "smlal v19.4s, v27.4h, v2.4h\n"
+    "smlal2 v20.4s, v27.8h, v2.8h\n"
+    "smlal v13.4s, v26.4h, v3.4h\n"
+    "smlal2 v10.4s, v26.8h, v3.8h\n"
+    "smlal v19.4s, v24.4h, v0.4h\n"
+    "smlal2 v20.4s, v24.8h, v0.8h\n"
+    "smlal v13.4s, v25.4h, v4.4h\n"
+    "smlal2 v10.4s, v25.8h, v4.8h\n"
+    "smlal v13.4s, v24.4h, v2.4h\n"
+    "smlal2 v10.4s, v24.8h, v2.8h\n"
+    "tbz x3, #2, 13f\n"
+    "ld1 { v29.s }[0], [x22], #0x4\n"
+    "tbz x3, #1, 12f\n"
+    "ld1 { v29.h }[2], [x22], #0x2\n"
+    "tbz x3, #0, 15f\n"
+    "ld1 { v29.b }[6], [x22]\n"
+    "b 15f\n"
+    "12:"  // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 15f\n"
+    "ld1 { v29.b }[4], [x22]\n"
+    "b 15f\n"
+    "13:"  // Oddments: Load (1, 3): Bit 2: Unset
+    "tbz x3, #1, 14f\n"
+    "ld1 { v29.h }[0], [x22], #0x2\n"
+    "tbz x3, #0, 15f\n"
+    "ld1 { v29.b }[2], [x22]\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 15f\n"
+    "ld1 { v29.b }[0], [x22]\n"
+    "15:"  // Oddments: Load (1, 3): Bit 2: End
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "ldr x21, [x7, #0x48]\n"
+    "smlal v19.4s, v29.4h, v4.4h\n"
+    "add x21, x21, x4\n"
+    "smlal2 v20.4s, v29.8h, v4.8h\n"
+    "tbz x3, #2, 17f\n"
+    "ld1 { v28.s }[0], [x21], #0x4\n"
+    "tbz x3, #1, 16f\n"
+    "ld1 { v28.h }[2], [x21], #0x2\n"
+    "tbz x3, #0, 19f\n"
+    "ld1 { v28.b }[6], [x21]\n"
+    "b 19f\n"
+    "16:"  // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 19f\n"
+    "ld1 { v28.b }[4], [x21]\n"
+    "b 19f\n"
+    "17:"  // Oddments: Load (1, 4): Bit 2: Unset
+    "tbz x3, #1, 18f\n"
+    "ld1 { v28.h }[0], [x21], #0x2\n"
+    "tbz x3, #0, 19f\n"
+    "ld1 { v28.b }[2], [x21]\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 19f\n"
+    "ld1 { v28.b }[0], [x21]\n"
+    "19:"  // Oddments: Load (1, 4): Bit 2: End
+    "usubl v28.8h, v28.8b, v22.8b\n"
+    "ldr x20, [x7, #0x50]\n"
+    "smlal v19.4s, v28.4h, v5.4h\n"
+    "add x20, x20, x4\n"
+    "smlal2 v20.4s, v28.8h, v5.8h\n"
+    "tbz x3, #2, 21f\n"
+    "ld1 { v27.s }[0], [x20], #0x4\n"
+    "tbz x3, #1, 20f\n"
+    "ld1 { v27.h }[2], [x20], #0x2\n"
+    "tbz x3, #0, 23f\n"
+    "ld1 { v27.b }[6], [x20]\n"
+    "b 23f\n"
+    "20:"  // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 23f\n"
+    "ld1 { v27.b }[4], [x20]\n"
+    "b 23f\n"
+    "21:"  // Oddments: Load (1, 2): Bit 2: Unset
+    "tbz x3, #1, 22f\n"
+    "ld1 { v27.h }[0], [x20], #0x2\n"
+    "tbz x3, #0, 23f\n"
+    "ld1 { v27.b }[2], [x20]\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 23f\n"
+    "ld1 { v27.b }[0], [x20]\n"
+    "23:"  // Oddments: Load (1, 2): Bit 2: End
+    "usubl v27.8h, v27.8b, v22.8b\n"
+    "ldr x19, [x7, #0x58]\n"
+    "smlal v13.4s, v27.4h, v5.4h\n"
+    "add x19, x19, x4\n"
+    "smlal2 v10.4s, v27.8h, v5.8h\n"
+    "smlal v19.4s, v27.4h, v3.4h\n"
+    "smlal2 v20.4s, v27.8h, v3.8h\n"
+    "tbz x3, #2, 25f\n"
+    "ld1 { v26.s }[0], [x19], #0x4\n"
+    "tbz x3, #1, 24f\n"
+    "ld1 { v26.h }[2], [x19], #0x2\n"
+    "tbz x3, #0, 27f\n"
+    "ld1 { v26.b }[6], [x19]\n"
+    "b 27f\n"
+    "24:"  // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 27f\n"
+    "ld1 { v26.b }[4], [x19]\n"
+    "b 27f\n"
+    "25:"  // Oddments: Load (3, 0): Bit 2: Unset
+    "tbz x3, #1, 26f\n"
+    "ld1 { v26.h }[0], [x19], #0x2\n"
+    "tbz x3, #0, 27f\n"
+    "ld1 { v26.b }[2], [x19]\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 27f\n"
+    "ld1 { v26.b }[0], [x19]\n"
+    "27:"  // Oddments: Load (3, 0): Bit 2: End
+    "usubl v26.8h, v26.8b, v22.8b\n"
+    "ldr x11, [x7, #0x60]\n"
+    "smlal v11.4s, v26.4h, v3.4h\n"
+    "add x11, x11, x4\n"
+    "smlal2 v17.4s, v26.8h, v3.8h\n"
+    "tbz x3, #2, 29f\n"
+    "ld1 { v25.s }[0], [x11], #0x4\n"
+    "tbz x3, #1, 28f\n"
+    "ld1 { v25.h }[2], [x11], #0x2\n"
+    "tbz x3, #0, 31f\n"
+    "ld1 { v25.b }[6], [x11]\n"
+    "b 31f\n"
+    "28:"  // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 31f\n"
+    "ld1 { v25.b }[4], [x11]\n"
+    "b 31f\n"
+    "29:"  // Oddments: Load (2, 0): Bit 2: Unset
+    "tbz x3, #1, 30f\n"
+    "ld1 { v25.h }[0], [x11], #0x2\n"
+    "tbz x3, #0, 31f\n"
+    "ld1 { v25.b }[2], [x11]\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 31f\n"
+    "ld1 { v25.b }[0], [x11]\n"
+    "31:"  // Oddments: Load (2, 0): Bit 2: End
+    "usubl v25.8h, v25.8b, v22.8b\n"
+    "ldr x10, [x7, #0x68]\n"
+    "smlal v13.4s, v25.4h, v6.4h\n"
+    "add x10, x10, x4\n"
+    "smlal2 v10.4s, v25.8h, v6.8h\n"
+    "smlal v11.4s, v25.4h, v0.4h\n"
+    "smlal2 v17.4s, v25.8h, v0.8h\n"
+    "tbz x3, #2, 33f\n"
+    "ld1 { v29.s }[0], [x10], #0x4\n"
+    "tbz x3, #1, 32f\n"
+    "ld1 { v29.h }[2], [x10], #0x2\n"
+    "tbz x3, #0, 35f\n"
+    "ld1 { v29.b }[6], [x10]\n"
+    "b 35f\n"
+    "32:"  // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 35f\n"
+    "ld1 { v29.b }[4], [x10]\n"
+    "b 35f\n"
+    "33:"  // Oddments: Load (3, 1): Bit 2: Unset
+    "tbz x3, #1, 34f\n"
+    "ld1 { v29.h }[0], [x10], #0x2\n"
+    "tbz x3, #0, 35f\n"
+    "ld1 { v29.b }[2], [x10]\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 35f\n"
+    "ld1 { v29.b }[0], [x10]\n"
+    "35:"  // Oddments: Load (3, 1): Bit 2: End
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "ldr x9, [x7, #0x70]\n"
+    "smlal v11.4s, v29.4h, v4.4h\n"
+    "add x9, x9, x4\n"
+    "smlal2 v17.4s, v29.8h, v4.8h\n"
+    "tbz x3, #2, 37f\n"
+    "ld1 { v24.s }[0], [x9], #0x4\n"
+    "tbz x3, #1, 36f\n"
+    "ld1 { v24.h }[2], [x9], #0x2\n"
+    "tbz x3, #0, 39f\n"
+    "ld1 { v24.b }[6], [x9]\n"
+    "b 39f\n"
+    "36:"  // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 39f\n"
+    "ld1 { v24.b }[4], [x9]\n"
+    "b 39f\n"
+    "37:"  // Oddments: Load (2, 1): Bit 2: Unset
+    "tbz x3, #1, 38f\n"
+    "ld1 { v24.h }[0], [x9], #0x2\n"
+    "tbz x3, #0, 39f\n"
+    "ld1 { v24.b }[2], [x9]\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 39f\n"
+    "ld1 { v24.b }[0], [x9]\n"
+    "39:"  // Oddments: Load (2, 1): Bit 2: End
+    "usubl v24.8h, v24.8b, v22.8b\n"
+    "ldr x28, [x7, #0x78]\n"
+    "smlal v13.4s, v24.4h, v7.4h\n"
+    "add x28, x28, x4\n"
+    "smlal2 v10.4s, v24.8h, v7.8h\n"
+    "smlal v11.4s, v24.4h, v1.4h\n"
+    "smlal2 v17.4s, v24.8h, v1.8h\n"
+    "tbz x3, #2, 41f\n"
+    "ld1 { v27.s }[0], [x28], #0x4\n"
+    "tbz x3, #1, 40f\n"
+    "ld1 { v27.h }[2], [x28], #0x2\n"
+    "tbz x3, #0, 43f\n"
+    "ld1 { v27.b }[6], [x28]\n"
+    "b 43f\n"
+    "40:"  // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 43f\n"
+    "ld1 { v27.b }[4], [x28]\n"
+    "b 43f\n"
+    "41:"  // Oddments: Load (3, 3): Bit 2: Unset
+    "tbz x3, #1, 42f\n"
+    "ld1 { v27.h }[0], [x28], #0x2\n"
+    "tbz x3, #0, 43f\n"
+    "ld1 { v27.b }[2], [x28]\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 43f\n"
+    "ld1 { v27.b }[0], [x28]\n"
+    "43:"  // Oddments: Load (3, 3): Bit 2: End
+    "usubl v27.8h, v27.8b, v22.8b\n"
+    "ldr x27, [x7, #0x80]\n"
+    "smlal v18.4s, v27.4h, v4.4h\n"
+    "add x27, x27, x4\n"
+    "smlal2 v21.4s, v27.8h, v4.8h\n"
+    "tbz x3, #2, 45f\n"
+    "ld1 { v28.s }[0], [x27], #0x4\n"
+    "tbz x3, #1, 44f\n"
+    "ld1 { v28.h }[2], [x27], #0x2\n"
+    "tbz x3, #0, 47f\n"
+    "ld1 { v28.b }[6], [x27]\n"
+    "b 47f\n"
+    "44:"  // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 47f\n"
+    "ld1 { v28.b }[4], [x27]\n"
+    "b 47f\n"
+    "45:"  // Oddments: Load (2, 3): Bit 2: Unset
+    "tbz x3, #1, 46f\n"
+    "ld1 { v28.h }[0], [x27], #0x2\n"
+    "tbz x3, #0, 47f\n"
+    "ld1 { v28.b }[2], [x27]\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 47f\n"
+    "ld1 { v28.b }[0], [x27]\n"
+    "47:"  // Oddments: Load (2, 3): Bit 2: End
+    "usubl v28.8h, v28.8b, v22.8b\n"
+    "ldr x26, [x7, #0x88]\n"
+    "smlal v19.4s, v28.4h, v7.4h\n"
+    "add x26, x26, x4\n"
+    "smlal2 v20.4s, v28.8h, v7.8h\n"
+    "smlal v18.4s, v28.4h, v1.4h\n"
+    "smlal2 v21.4s, v28.8h, v1.8h\n"
+    "tbz x3, #2, 49f\n"
+    "ld1 { v26.s }[0], [x26], #0x4\n"
+    "tbz x3, #1, 48f\n"
+    "ld1 { v26.h }[2], [x26], #0x2\n"
+    "tbz x3, #0, 51f\n"
+    "ld1 { v26.b }[6], [x26]\n"
+    "b 51f\n"
+    "48:"  // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 51f\n"
+    "ld1 { v26.b }[4], [x26]\n"
+    "b 51f\n"
+    "49:"  // Oddments: Load (3, 4): Bit 2: Unset
+    "tbz x3, #1, 50f\n"
+    "ld1 { v26.h }[0], [x26], #0x2\n"
+    "tbz x3, #0, 51f\n"
+    "ld1 { v26.b }[2], [x26]\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 51f\n"
+    "ld1 { v26.b }[0], [x26]\n"
+    "51:"  // Oddments: Load (3, 4): Bit 2: End
+    "usubl v26.8h, v26.8b, v22.8b\n"
+    "ldr x25, [x7, #0x90]\n"
+    "smlal v18.4s, v26.4h, v5.4h\n"
+    "add x25, x25, x4\n"
+    "smlal2 v21.4s, v26.8h, v5.8h\n"
+    "tbz x3, #2, 53f\n"
+    "ld1 { v25.s }[0], [x25], #0x4\n"
+    "tbz x3, #1, 52f\n"
+    "ld1 { v25.h }[2], [x25], #0x2\n"
+    "tbz x3, #0, 55f\n"
+    "ld1 { v25.b }[6], [x25]\n"
+    "b 55f\n"
+    "52:"  // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 55f\n"
+    "ld1 { v25.b }[4], [x25]\n"
+    "b 55f\n"
+    "53:"  // Oddments: Load (4, 0): Bit 2: Unset
+    "tbz x3, #1, 54f\n"
+    "ld1 { v25.h }[0], [x25], #0x2\n"
+    "tbz x3, #0, 55f\n"
+    "ld1 { v25.b }[2], [x25]\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 55f\n"
+    "ld1 { v25.b }[0], [x25]\n"
+    "55:"  // Oddments: Load (4, 0): Bit 2: End
+    "usubl v25.8h, v25.8b, v22.8b\n"
+    "ldr x24, [x7, #0x98]\n"
+    "smlal v11.4s, v25.4h, v6.4h\n"
+    "add x24, x24, x4\n"
+    "smlal2 v17.4s, v25.8h, v6.8h\n"
+    "tbz x3, #2, 57f\n"
+    "ld1 { v29.s }[0], [x24], #0x4\n"
+    "tbz x3, #1, 56f\n"
+    "ld1 { v29.h }[2], [x24], #0x2\n"
+    "tbz x3, #0, 59f\n"
+    "ld1 { v29.b }[6], [x24]\n"
+    "b 59f\n"
+    "56:"  // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 59f\n"
+    "ld1 { v29.b }[4], [x24]\n"
+    "b 59f\n"
+    "57:"  // Oddments: Load (2, 4): Bit 2: Unset
+    "tbz x3, #1, 58f\n"
+    "ld1 { v29.h }[0], [x24], #0x2\n"
+    "tbz x3, #0, 59f\n"
+    "ld1 { v29.b }[2], [x24]\n"
+    "b 59f\n"
+    "58:"  // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 59f\n"
+    "ld1 { v29.b }[0], [x24]\n"
+    "59:"  // Oddments: Load (2, 4): Bit 2: End
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "ldr x23, [x7, #0xa0]\n"
+    "smlal v19.4s, v29.4h, v8.4h\n"
+    "add x23, x23, x4\n"
+    "smlal2 v20.4s, v29.8h, v8.8h\n"
+    "smlal v18.4s, v29.4h, v2.4h\n"
+    "smlal2 v21.4s, v29.8h, v2.8h\n"
+    "tbz x3, #2, 61f\n"
+    "ld1 { v27.s }[0], [x23], #0x4\n"
+    "tbz x3, #1, 60f\n"
+    "ld1 { v27.h }[2], [x23], #0x2\n"
+    "tbz x3, #0, 63f\n"
+    "ld1 { v27.b }[6], [x23]\n"
+    "b 63f\n"
+    "60:"  // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 63f\n"
+    "ld1 { v27.b }[4], [x23]\n"
+    "b 63f\n"
+    "61:"  // Oddments: Load (4, 1): Bit 2: Unset
+    "tbz x3, #1, 62f\n"
+    "ld1 { v27.h }[0], [x23], #0x2\n"
+    "tbz x3, #0, 63f\n"
+    "ld1 { v27.b }[2], [x23]\n"
+    "b 63f\n"
+    "62:"  // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 63f\n"
+    "ld1 { v27.b }[0], [x23]\n"
+    "63:"  // Oddments: Load (4, 1): Bit 2: End
+    "usubl v27.8h, v27.8b, v22.8b\n"
+    "ldr x22, [x7, #0xa8]\n"
+    "smlal v11.4s, v27.4h, v7.4h\n"
+    "add x22, x22, x4\n"
+    "smlal2 v17.4s, v27.8h, v7.8h\n"
+    "tbz x3, #2, 65f\n"
+    "ld1 { v24.s }[0], [x22], #0x4\n"
+    "tbz x3, #1, 64f\n"
+    "ld1 { v24.h }[2], [x22], #0x2\n"
+    "tbz x3, #0, 67f\n"
+    "ld1 { v24.b }[6], [x22]\n"
+    "b 67f\n"
+    "64:"  // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 67f\n"
+    "ld1 { v24.b }[4], [x22]\n"
+    "b 67f\n"
+    "65:"  // Oddments: Load (3, 2): Bit 2: Unset
+    "tbz x3, #1, 66f\n"
+    "ld1 { v24.h }[0], [x22], #0x2\n"
+    "tbz x3, #0, 67f\n"
+    "ld1 { v24.b }[2], [x22]\n"
+    "b 67f\n"
+    "66:"  // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 67f\n"
+    "ld1 { v24.b }[0], [x22]\n"
+    "67:"  // Oddments: Load (3, 2): Bit 2: End
+    "usubl v24.8h, v24.8b, v22.8b\n"
+    "ldr x21, [x7, #0xb0]\n"
+    "smlal v11.4s, v24.4h, v5.4h\n"
+    "add x21, x21, x4\n"
+    "smlal2 v17.4s, v24.8h, v5.8h\n"
+    "smlal v18.4s, v24.4h, v3.4h\n"
+    "smlal2 v21.4s, v24.8h, v3.8h\n"
+    "tbz x3, #2, 69f\n"
+    "ld1 { v26.s }[0], [x21], #0x4\n"
+    "tbz x3, #1, 68f\n"
+    "ld1 { v26.h }[2], [x21], #0x2\n"
+    "tbz x3, #0, 71f\n"
+    "ld1 { v26.b }[6], [x21]\n"
+    "b 71f\n"
+    "68:"  // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 71f\n"
+    "ld1 { v26.b }[4], [x21]\n"
+    "b 71f\n"
+    "69:"  // Oddments: Load (4, 3): Bit 2: Unset
+    "tbz x3, #1, 70f\n"
+    "ld1 { v26.h }[0], [x21], #0x2\n"
+    "tbz x3, #0, 71f\n"
+    "ld1 { v26.b }[2], [x21]\n"
+    "b 71f\n"
+    "70:"  // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 71f\n"
+    "ld1 { v26.b }[0], [x21]\n"
+    "71:"  // Oddments: Load (4, 3): Bit 2: End
+    "usubl v26.8h, v26.8b, v22.8b\n"
+    "ldr x20, [x7, #0xb8]\n"
+    "smlal v18.4s, v26.4h, v7.4h\n"
+    "add x20, x20, x4\n"
+    "smlal2 v21.4s, v26.8h, v7.8h\n"
+    "tbz x3, #2, 73f\n"
+    "ld1 { v25.s }[0], [x20], #0x4\n"
+    "tbz x3, #1, 72f\n"
+    "ld1 { v25.h }[2], [x20], #0x2\n"
+    "tbz x3, #0, 75f\n"
+    "ld1 { v25.b }[6], [x20]\n"
+    "b 75f\n"
+    "72:"  // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 75f\n"
+    "ld1 { v25.b }[4], [x20]\n"
+    "b 75f\n"
+    "73:"  // Oddments: Load (4, 2): Bit 2: Unset
+    "tbz x3, #1, 74f\n"
+    "ld1 { v25.h }[0], [x20], #0x2\n"
+    "tbz x3, #0, 75f\n"
+    "ld1 { v25.b }[2], [x20]\n"
+    "b 75f\n"
+    "74:"  // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 75f\n"
+    "ld1 { v25.b }[0], [x20]\n"
+    "75:"  // Oddments: Load (4, 2): Bit 2: End
+    "usubl v25.8h, v25.8b, v22.8b\n"
+    "ldr x19, [x7, #0xc0]\n"
+    "smlal v11.4s, v25.4h, v8.4h\n"
+    "add x19, x19, x4\n"
+    "smlal2 v17.4s, v25.8h, v8.8h\n"
+    "smlal v18.4s, v25.4h, v6.4h\n"
+    "smlal2 v21.4s, v25.8h, v6.8h\n"
+    "tbz x3, #2, 77f\n"
+    "ld1 { v29.s }[0], [x19], #0x4\n"
+    "tbz x3, #1, 76f\n"
+    "ld1 { v29.h }[2], [x19], #0x2\n"
+    "tbz x3, #0, 79f\n"
+    "ld1 { v29.b }[6], [x19]\n"
+    "b 79f\n"
+    "76:"  // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+    "tbz x3, #0, 79f\n"
+    "ld1 { v29.b }[4], [x19]\n"
+    "b 79f\n"
+    "77:"  // Oddments: Load (4, 4): Bit 2: Unset
+    "tbz x3, #1, 78f\n"
+    "ld1 { v29.h }[0], [x19], #0x2\n"
+    "tbz x3, #0, 79f\n"
+    "ld1 { v29.b }[2], [x19]\n"
+    "b 79f\n"
+    "78:"  // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 79f\n"
+    "ld1 { v29.b }[0], [x19]\n"
+    "79:"  // Oddments: Load (4, 4): Bit 2: End
+    "usubl v29.8h, v29.8b, v22.8b\n"
+    "smlal v18.4s, v29.4h, v8.4h\n"
+    "smlal2 v21.4s, v29.8h, v8.8h\n"
+    "tbz x3, #2, 81f\n"
+    "ld1 { v31.4s }, [x8], #0x10\n"
+    "ld1 { v30.4s }, [x16], #0x10\n"
+    "tbz x3, #1, 80f\n"
+    "ld1 { v23.d }[0], [x8], #0x8\n"
+    "ld1 { v9.d }[0], [x16], #0x8\n"
+    "tbz x3, #0, 83f\n"
+    "ld1 { v23.s }[2], [x8]\n"
+    "ld1 { v9.s }[2], [x16]\n"
+    "b 83f\n"
+    "80:"  // Oddments: Load requant params: Bit 2: Bit 1: Unset
+    "tbz x3, #0, 83f\n"
+    "ld1 { v23.s }[0], [x8]\n"
+    "ld1 { v9.s }[0], [x16]\n"
+    "b 83f\n"
+    "81:"  // Oddments: Load requant params: Bit 2: Unset
+    "tbz x3, #1, 82f\n"
+    "ld1 { v31.d }[0], [x8], #0x8\n"
+    "ld1 { v30.d }[0], [x16], #0x8\n"
+    "tbz x3, #0, 83f\n"
+    "ld1 { v31.s }[2], [x8]\n"
+    "ld1 { v30.s }[2], [x16]\n"
+    "b 83f\n"
+    "82:"  // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 83f\n"
+    "ld1 { v31.s }[0], [x8]\n"
+    "ld1 { v30.s }[0], [x16]\n"
+    "83:"  // Oddments: Load requant params: Bit 2: End
+    "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+    "add x15, x15, x6\n"
+    "sqrdmulh v10.4s, v10.4s, v23.4s\n"
+    "add x14, x14, x6\n"
+    "sqrdmulh v19.4s, v19.4s, v31.4s\n"
+    "add x13, x13, x6\n"
+    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+    "add x12, x12, x6\n"
+    "sqrdmulh v11.4s, v11.4s, v31.4s\n"
+    "and v27.16b, v13.16b, v30.16b\n"
+    "and v7.16b, v10.16b, v9.16b\n"
+    "and v6.16b, v19.16b, v30.16b\n"
+    "sshr v27.4s, v27.4s, #0x1f\n"
+    "sshr v7.4s, v7.4s, #0x1f\n"
+    "sshr v6.4s, v6.4s, #0x1f\n"
+    "sqadd v13.4s, v13.4s, v27.4s\n"
+    "sqadd v10.4s, v10.4s, v7.4s\n"
+    "sqadd v19.4s, v19.4s, v6.4s\n"
+    "and v3.16b, v20.16b, v9.16b\n"
+    "srshl v13.4s, v13.4s, v30.4s\n"
+    "srshl v10.4s, v10.4s, v9.4s\n"
+    "srshl v19.4s, v19.4s, v30.4s\n"
+    "sshr v3.4s, v3.4s, #0x1f\n"
+    "add v13.4s, v13.4s, v14.4s\n"
+    "add v10.4s, v10.4s, v14.4s\n"
+    "add v19.4s, v19.4s, v14.4s\n"
+    "smin v13.4s, v13.4s, v15.4s\n"
+    "smin v10.4s, v10.4s, v15.4s\n"
+    "smin v19.4s, v19.4s, v15.4s\n"
+    "smax v13.4s, v13.4s, v16.4s\n"
+    "smax v10.4s, v10.4s, v16.4s\n"
+    "smax v19.4s, v19.4s, v16.4s\n"
+    "sqadd v20.4s, v20.4s, v3.4s\n"
+    "uzp1 v13.16b, v13.16b, v10.16b\n"
+    "and v28.16b, v11.16b, v30.16b\n"
+    "uzp1 v13.16b, v13.16b, v13.16b\n"
+    "srshl v20.4s, v20.4s, v9.4s\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+    "sqrdmulh v18.4s, v18.4s, v31.4s\n"
+    "add v20.4s, v20.4s, v14.4s\n"
+    "sqadd v11.4s, v11.4s, v28.4s\n"
+    "and v26.16b, v17.16b, v9.16b\n"
+    "smin v20.4s, v20.4s, v15.4s\n"
+    "and v8.16b, v18.16b, v30.16b\n"
+    "srshl v11.4s, v11.4s, v30.4s\n"
+    "smax v20.4s, v20.4s, v16.4s\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "sshr v8.4s, v8.4s, #0x1f\n"
+    "uzp1 v19.16b, v19.16b, v20.16b\n"
+    "add v11.4s, v11.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "sqadd v17.4s, v17.4s, v26.4s\n"
+    "smin v11.4s, v11.4s, v15.4s\n"
+    "sqadd v18.4s, v18.4s, v8.4s\n"
+    "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+    "smax v11.4s, v11.4s, v16.4s\n"
+    "srshl v17.4s, v17.4s, v9.4s\n"
+    "srshl v18.4s, v18.4s, v30.4s\n"
+    "and v27.16b, v21.16b, v9.16b\n"
+    "add v17.4s, v17.4s, v14.4s\n"
+    "add v18.4s, v18.4s, v14.4s\n"
+    "sshr v27.4s, v27.4s, #0x1f\n"
+    "smin v17.4s, v17.4s, v15.4s\n"
+    "smin v18.4s, v18.4s, v15.4s\n"
+    "sqadd v21.4s, v21.4s, v27.4s\n"
+    "smax v17.4s, v17.4s, v16.4s\n"
+    "smax v18.4s, v18.4s, v16.4s\n"
+    "srshl v21.4s, v21.4s, v9.4s\n"
+    "uzp1 v11.16b, v11.16b, v17.16b\n"
+    "uzp1 v11.16b, v11.16b, v11.16b\n"
+    "add v21.4s, v21.4s, v14.4s\n"
+    "smin v21.4s, v21.4s, v15.4s\n"
+    "smax v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v18.16b, v18.16b, v21.16b\n"
+    "uzp1 v18.16b, v18.16b, v18.16b\n"
+    "tbz x3, #2, 85f\n"
+    "st1 { v13.s }[0], [x15], #0x4\n"
+    "st1 { v19.s }[0], [x14], #0x4\n"
+    "st1 { v11.s }[0], [x13], #0x4\n"
+    "st1 { v18.s }[0], [x12], #0x4\n"
+    "tbz x3, #1, 84f\n"
+    "st1 { v13.h }[2], [x15], #0x2\n"
+    "st1 { v19.h }[2], [x14], #0x2\n"
+    "st1 { v11.h }[2], [x13], #0x2\n"
+    "st1 { v18.h }[2], [x12], #0x2\n"
+    "tbz x3, #0, 87f\n"
+    "st1 { v13.b }[6], [x15], #0x1\n"
+    "st1 { v19.b }[6], [x14], #0x1\n"
+    "st1 { v11.b }[6], [x13], #0x1\n"
+    "st1 { v18.b }[6], [x12], #0x1\n"
+    "b 87f\n"
+    "84:"  // Oddments: Bit 2: Bit 1: Unset
+    "tbz x3, #0, 87f\n"
+    "st1 { v13.b }[4], [x15], #0x1\n"
+    "st1 { v19.b }[4], [x14], #0x1\n"
+    "st1 { v11.b }[4], [x13], #0x1\n"
+    "st1 { v18.b }[4], [x12], #0x1\n"
+    "b 87f\n"
+    "85:"  // Oddments: Bit 2: Unset
+    "tbz x3, #1, 86f\n"
+    "st1 { v13.h }[0], [x15], #0x2\n"
+    "st1 { v19.h }[0], [x14], #0x2\n"
+    "st1 { v11.h }[0], [x13], #0x2\n"
+    "st1 { v18.h }[0], [x12], #0x2\n"
+    "tbz x3, #0, 87f\n"
+    "st1 { v13.b }[2], [x15], #0x1\n"
+    "st1 { v19.b }[2], [x14], #0x1\n"
+    "st1 { v11.b }[2], [x13], #0x1\n"
+    "st1 { v18.b }[2], [x12], #0x1\n"
+    "b 87f\n"
+    "86:"  // Oddments: Bit 2: Unset: Bit 1: Unset
+    "tbz x3, #0, 87f\n"
+    "st1 { v13.b }[0], [x15], #0x1\n"
+    "st1 { v19.b }[0], [x14], #0x1\n"
+    "st1 { v11.b }[0], [x13], #0x1\n"
+    "st1 { v18.b }[0], [x12], #0x1\n"
+    "87:"  // Oddments: Bit 2: End
+
+    "88:"  // End
+
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
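Annotation for reviewers: the output stage of these quantized kernels (the
sqrdmulh / sshr / sqadd / srshl / add / smin / smax / uzp1 sequence above) is
the usual per-channel requantization; the and/sshr/sqadd triple before each
srshl is a fixup so that rounding of negative values matches the reference. A
scalar model, with names of our choosing and ordinary arithmetic standing in
for the saturating NEON instructions, is roughly:

    #include <algorithm>
    #include <cstdint>

    // Illustrative sketch only; not part of the patch.
    inline uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                              int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // sqrdmulh: doubling multiply, keeping the rounded high 32 bits.
        const int64_t prod = static_cast<int64_t>(acc) * mul;
        int32_t x = static_cast<int32_t>((prod + (INT64_C(1) << 30)) >> 31);
        // srshl with a negative shift: rounding arithmetic shift right.
        const int rs = -shift;
        if (rs > 0) x = (x + (1 << (rs - 1))) >> rs;
        x += c_offset;                                // output zero point
        x = std::max(minval, std::min(maxval, x));    // clamp to the u8 range
        return static_cast<uint8_t>(x);
    }

The double uzp1 at the end of the assembly then narrows the four clamped
32-bit lanes down to bytes before the store.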
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..d3d5000
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef int8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_5x5_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_5x5_mla::get_packed_size;
+
+  kern_type kernel = a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+  a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
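Annotation for reviewers: the tile geometry in these strategy structs is
internally consistent — the input tile follows from the output tile, the
stride and the kernel size. A quick check (ours, not part of the patch):

    // input extent = (output extent - 1) * stride + kernel extent
    constexpr unsigned int input_extent(unsigned int out, unsigned int stride,
                                        unsigned int kern)
    {
        return (out - 1) * stride + kern;
    }
    static_assert(input_extent(2, 1, 5) == 6, "5x5/s1 kernel reads a 6x6 tile");
    static_assert(input_extent(2, 2, 3) == 5, "3x3/s2 kernel reads a 5x5 tile");

The second assertion also explains the 25-entry inptrs array in the 3x3/s2
kernel above and the 36-entry array in the 5x5 kernel below.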
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..9715613
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2213 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[36];
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
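+      // As above: reorder the early rows of the 6x6 input patch to match
+      // the kernel's initial loads; the later entries pass through as-is.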
+      inptrs[0] = inptrs_raw[0];
+      inptrs[1] = inptrs_raw[1];
+      inptrs[2] = inptrs_raw[6];
+      inptrs[3] = inptrs_raw[7];
+      inptrs[4] = inptrs_raw[2];
+      inptrs[5] = inptrs_raw[8];
+      inptrs[6] = inptrs_raw[3];
+      inptrs[7] = inptrs_raw[4];
+      inptrs[8] = inptrs_raw[11];
+      inptrs[9] = inptrs_raw[12];
+      inptrs[10] = inptrs_raw[9];
+      inptrs[11] = inptrs_raw[10];
+      inptrs[12] = inptrs_raw[5];
+      inptrs[13] = inptrs_raw[13];
+      inptrs[14] = inptrs_raw[14];
+      inptrs[15] = inptrs_raw[15];
+      inptrs[16] = inptrs_raw[16];
+      inptrs[17] = inptrs_raw[17];
+      inptrs[18] = inptrs_raw[18];
+      inptrs[19] = inptrs_raw[19];
+      inptrs[20] = inptrs_raw[20];
+      inptrs[21] = inptrs_raw[21];
+      inptrs[22] = inptrs_raw[22];
+      inptrs[23] = inptrs_raw[23];
+      inptrs[24] = inptrs_raw[24];
+      inptrs[25] = inptrs_raw[25];
+      inptrs[26] = inptrs_raw[26];
+      inptrs[27] = inptrs_raw[27];
+      inptrs[28] = inptrs_raw[28];
+      inptrs[29] = inptrs_raw[29];
+      inptrs[30] = inptrs_raw[30];
+      inptrs[31] = inptrs_raw[31];
+      inptrs[32] = inptrs_raw[32];
+      inptrs[33] = inptrs_raw[33];
+      inptrs[34] = inptrs_raw[34];
+      inptrs[35] = inptrs_raw[35];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
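+  // Same structure as the 3x3/s2 kernel earlier in this patch: blocks of
+  // eight channels in the main loop, then an "Oddments" tail for the rest.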
+  __asm__ __volatile__(
+    "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "mov x10, #0x0\n"
+    "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x1, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "add x25, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "lsr x19, x4, #0x3\n"
+    "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x13, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v9.16b }, [x13]\n"
+    "add x8, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v14.16b }, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1r { v10.4s }, [x8]\n"
+    "add x8, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v11.4s }, [x20]\n"
+    "ld1r { v13.4s }, [x8]\n"
+    "ldp x17, x16, [x21, #0x0]\n"
+    "ldp x6, x8, [x21, #0x10]\n"
+    "cbz x19, 3f\n"
+    "subs x19, x19, #0x1\n"
+    "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q15, [x12, #0x0]\n"
+    "mov v16.16b, v15.16b\n"
+    "ldr q18, [x12, #0x10]\n"
+    "add x12, x12, #0x20\n"
+    "mov v7.16b, v15.16b\n"
+    "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v8.16b, v15.16b\n"
+    "ldr d0, [x3, #0x0]\n"
+    "ldr d1, [x3, #0x8]\n"
+    "mov v21.16b, v18.16b\n"
+    "ldr d2, [x3, #0x10]\n"
+    "mov v17.16b, v18.16b\n"
+    "ldr d3, [x3, #0x18]\n"
+    "mov v5.16b, v18.16b\n"
+    "ldr d4, [x3, #0x20]\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "ldp x28, x27, [x25, #0x0]\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "ldp x26, x13, [x25, #0x10]\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "ldp x24, x23, [x25, #0x20]\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "ldp x22, x21, [x25, #0x30]\n"
+    "ldp x20, x0, [x25, #0x40]\n"
+    "ldr d31, [x28, x10]\n"
+    "usubl v31.8h, v31.8b, v9.8b\n"
+    "ldr d30, [x27, x10]\n"
+    "ldr d29, [x26, x10]\n"
+    "usubl v30.8h, v30.8b, v9.8b\n"
+    "ldr d28, [x13, x10]\n"
+    "usubl v29.8h, v29.8b, v9.8b\n"
+    "ldr d27, [x24, x10]\n"
+    "ldr d23, [x23, x10]\n"
+    "usubl v28.8h, v28.8b, v9.8b\n"
+    "ldr d25, [x22, x10]\n"
+    "ldr d24, [x21, x10]\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "usubl v23.8h, v23.8b, v9.8b\n"
+    "ldr d26, [x20, x10]\n"
+    "ldr d22, [x0, x10]\n"
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "usubl v26.8h, v26.8b, v9.8b\n"
+    "usubl v22.8h, v22.8b, v9.8b\n"
+    "beq 2f\n"
+    "1:"  // Loop
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "ldr x20, [x25, #0x50]\n"
+    "subs x19, x19, #0x1\n"
+    "smlal2 v18.4s, v31.8h, v0.8h\n"
+    "ldr x28, [x25, #0x58]\n"
+    "smlal v16.4s, v30.4h, v0.4h\n"
+    "ldr x0, [x25, #0x60]\n"
+    "smlal2 v21.4s, v30.8h, v0.8h\n"
+    "ldr d31, [x20, x10]\n"
+    "smlal v7.4s, v29.4h, v0.4h\n"
+    "ldr x7, [x25, #0x68]\n"
+    "smlal2 v17.4s, v29.8h, v0.8h\n"
+    "ldr x26, [x25, #0x70]\n"
+    "smlal v8.4s, v28.4h, v0.4h\n"
+    "ldr x23, [x25, #0x78]\n"
+    "smlal2 v5.4s, v28.8h, v0.8h\n"
+    "ldr d0, [x3, #0x28]\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "ldr x20, [x25, #0x80]\n"
+    "smlal2 v18.4s, v30.8h, v1.8h\n"
+    "ldr d30, [x28, x10]\n"
+    "smlal v16.4s, v27.4h, v1.4h\n"
+    "ldr x22, [x25, #0x88]\n"
+    "smlal2 v21.4s, v27.8h, v1.8h\n"
+    "ldr x13, [x25, #0x90]\n"
+    "smlal v7.4s, v28.4h, v1.4h\n"
+    "ldr x21, [x25, #0x98]\n"
+    "smlal2 v17.4s, v28.8h, v1.8h\n"
+    "ldr x14, [x25, #0xa0]\n"
+    "smlal v8.4s, v23.4h, v1.4h\n"
+    "ldr x11, [x25, #0xa8]\n"
+    "smlal2 v5.4s, v23.8h, v1.8h\n"
+    "ldr d1, [x3, #0x30]\n"
+    "smlal v15.4s, v27.4h, v2.4h\n"
+    "ldr x24, [x25, #0xb0]\n"
+    "smlal2 v18.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x0, x10]\n"
+    "smlal v16.4s, v25.4h, v2.4h\n"
+    "ldr x0, [x25, #0xb8]\n"
+    "smlal2 v21.4s, v25.8h, v2.8h\n"
+    "ldr x15, [x25, #0xc0]\n"
+    "smlal v7.4s, v23.4h, v2.4h\n"
+    "ldr x9, [x25, #0xc8]\n"
+    "smlal2 v17.4s, v23.8h, v2.8h\n"
+    "ldr x27, [x25, #0xd0]\n"
+    "usubl v31.8h, v31.8b, v9.8b\n"
+    "ldr x28, [x25, #0xd8]\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "ldr q6, [x2, #0x0]\n"
+    "smlal2 v18.4s, v25.8h, v3.8h\n"
+    "ldr d25, [x7, x10]\n"
+    "smlal v8.4s, v31.4h, v2.4h\n"
+    "ldr x12, [x25, #0xe0]\n"
+    "smlal2 v5.4s, v31.8h, v2.8h\n"
+    "ldr d2, [x3, #0x38]\n"
+    "smlal v16.4s, v24.4h, v3.4h\n"
+    "ldr q19, [x5, #0x0]\n"
+    "smlal2 v21.4s, v24.8h, v3.8h\n"
+    "ldr q20, [x2, #0x10]\n"
+    "add x2, x2, #0x20\n"
+    "smlal v7.4s, v31.4h, v3.4h\n"
+    "ldr q12, [x5, #0x10]\n"
+    "add x5, x5, #0x20\n"
+    "smlal2 v17.4s, v31.8h, v3.8h\n"
+    "usubl v30.8h, v30.8b, v9.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v18.4s, v24.8h, v4.8h\n"
+    "ldr d24, [x26, x10]\n"
+    "smlal v8.4s, v30.4h, v3.4h\n"
+    "ldr x7, [x25, #0xe8]\n"
+    "smlal2 v5.4s, v30.8h, v3.8h\n"
+    "ldr d3, [x3, #0x40]\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "smlal v7.4s, v30.4h, v4.4h\n"
+    "smlal2 v17.4s, v30.8h, v4.8h\n"
+    "smlal v16.4s, v27.4h, v4.4h\n"
+    "smlal2 v21.4s, v27.8h, v4.8h\n"
+    "ldr d27, [x23, x10]\n"
+    "smlal v8.4s, v26.4h, v4.4h\n"
+    "ldr x26, [x25, #0xf0]\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "ldr d4, [x3, #0x48]\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "smlal v15.4s, v29.4h, v0.4h\n"
+    "smlal2 v18.4s, v29.8h, v0.8h\n"
+    "smlal v16.4s, v28.4h, v0.4h\n"
+    "smlal2 v21.4s, v28.8h, v0.8h\n"
+    "smlal v7.4s, v22.4h, v0.4h\n"
+    "smlal2 v17.4s, v22.8h, v0.8h\n"
+    "smlal v8.4s, v25.4h, v0.4h\n"
+    "smlal2 v5.4s, v25.8h, v0.8h\n"
+    "ldr d0, [x3, #0x50]\n"
+    "smlal v15.4s, v28.4h, v1.4h\n"
+    "smlal2 v18.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x22, x10]\n"
+    "smlal v16.4s, v23.4h, v1.4h\n"
+    "ldr x23, [x25, #0xf8]\n"
+    "smlal2 v21.4s, v23.8h, v1.8h\n"
+    "smlal v7.4s, v25.4h, v1.4h\n"
+    "smlal2 v17.4s, v25.8h, v1.8h\n"
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "smlal v8.4s, v24.4h, v1.4h\n"
+    "smlal2 v5.4s, v24.8h, v1.8h\n"
+    "ldr d1, [x3, #0x58]\n"
+    "smlal v15.4s, v23.4h, v2.4h\n"
+    "smlal2 v18.4s, v23.8h, v2.8h\n"
+    "ldr d23, [x20, x10]\n"
+    "smlal v16.4s, v31.4h, v2.4h\n"
+    "ldr x22, [x25, #0x100]\n"
+    "smlal2 v21.4s, v31.8h, v2.8h\n"
+    "smlal v7.4s, v24.4h, v2.4h\n"
+    "smlal2 v17.4s, v24.8h, v2.8h\n"
+    "smlal v8.4s, v27.4h, v2.4h\n"
+    "smlal2 v5.4s, v27.8h, v2.8h\n"
+    "ldr d2, [x3, #0x60]\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "usubl v23.8h, v23.8b, v9.8b\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "smlal v15.4s, v31.4h, v3.4h\n"
+    "smlal2 v18.4s, v31.8h, v3.8h\n"
+    "ldr d31, [x13, x10]\n"
+    "smlal v16.4s, v30.4h, v3.4h\n"
+    "ldr x20, [x25, #0x108]\n"
+    "smlal2 v21.4s, v30.8h, v3.8h\n"
+    "smlal v7.4s, v27.4h, v3.4h\n"
+    "smlal2 v17.4s, v27.8h, v3.8h\n"
+    "smlal v8.4s, v23.4h, v3.4h\n"
+    "smlal2 v5.4s, v23.8h, v3.8h\n"
+    "ldr d3, [x3, #0x68]\n"
+    "smlal v15.4s, v30.4h, v4.4h\n"
+    "smlal2 v18.4s, v30.8h, v4.8h\n"
+    "ldr d30, [x21, x10]\n"
+    "smlal v16.4s, v26.4h, v4.4h\n"
+    "ldr x13, [x25, #0x110]\n"
+    "smlal2 v21.4s, v26.8h, v4.8h\n"
+    "ldr d26, [x14, x10]\n"
+    "smlal v7.4s, v23.4h, v4.4h\n"
+    "ldr x21, [x25, #0x118]\n"
+    "smlal2 v17.4s, v23.8h, v4.8h\n"
+    "usubl v28.8h, v28.8b, v9.8b\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "usubl v31.8h, v31.8b, v9.8b\n"
+    "smlal v8.4s, v28.4h, v4.4h\n"
+    "smlal2 v5.4s, v28.8h, v4.8h\n"
+    "ldr d4, [x3, #0x70]\n"
+    "smlal v15.4s, v22.4h, v0.4h\n"
+    "smlal2 v18.4s, v22.8h, v0.8h\n"
+    "ldr d22, [x0, x10]\n"
+    "smlal v16.4s, v25.4h, v0.4h\n"
+    "smlal2 v21.4s, v25.8h, v0.8h\n"
+    "smlal v7.4s, v31.4h, v0.4h\n"
+    "smlal2 v17.4s, v31.8h, v0.8h\n"
+    "usubl v30.8h, v30.8b, v9.8b\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "usubl v26.8h, v26.8b, v9.8b\n"
+    "smlal v8.4s, v30.4h, v0.4h\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "ldr d0, [x3, #0x78]\n"
+    "smlal v15.4s, v25.4h, v1.4h\n"
+    "smlal2 v18.4s, v25.8h, v1.8h\n"
+    "ldr d25, [x11, x10]\n"
+    "smlal v16.4s, v24.4h, v1.4h\n"
+    "smlal2 v21.4s, v24.8h, v1.8h\n"
+    "smlal v7.4s, v30.4h, v1.4h\n"
+    "smlal2 v17.4s, v30.8h, v1.8h\n"
+    "smlal v8.4s, v26.4h, v1.4h\n"
+    "smlal2 v5.4s, v26.8h, v1.8h\n"
+    "ldr d1, [x3, #0x80]\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "smlal2 v18.4s, v24.8h, v2.8h\n"
+    "ldr d24, [x24, x10]\n"
+    "smlal v16.4s, v27.4h, v2.4h\n"
+    "smlal2 v21.4s, v27.8h, v2.8h\n"
+    "smlal v7.4s, v26.4h, v2.4h\n"
+    "smlal2 v17.4s, v26.8h, v2.8h\n"
+    "smlal v8.4s, v25.4h, v2.4h\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "ldr d2, [x3, #0x88]\n"
+    "smlal v15.4s, v27.4h, v3.4h\n"
+    "smlal2 v18.4s, v27.8h, v3.8h\n"
+    "ldr d27, [x15, x10]\n"
+    "smlal v16.4s, v23.4h, v3.4h\n"
+    "smlal2 v21.4s, v23.8h, v3.8h\n"
+    "smlal v7.4s, v25.4h, v3.4h\n"
+    "smlal2 v17.4s, v25.8h, v3.8h\n"
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "usubl v22.8h, v22.8b, v9.8b\n"
+    "smlal v8.4s, v24.4h, v3.4h\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "ldr d3, [x3, #0x90]\n"
+    "smlal v15.4s, v23.4h, v4.4h\n"
+    "smlal2 v18.4s, v23.8h, v4.8h\n"
+    "ldr d23, [x9, x10]\n"
+    "smlal v16.4s, v28.4h, v4.4h\n"
+    "smlal2 v21.4s, v28.8h, v4.8h\n"
+    "ldr d28, [x12, x10]\n"
+    "smlal v7.4s, v24.4h, v4.4h\n"
+    "smlal2 v17.4s, v24.8h, v4.8h\n"
+    "smlal v8.4s, v22.4h, v4.4h\n"
+    "smlal2 v5.4s, v22.8h, v4.8h\n"
+    "ldr d4, [x3, #0x98]\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "usubl v23.8h, v23.8b, v9.8b\n"
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "smlal2 v18.4s, v31.8h, v0.8h\n"
+    "ldr d31, [x27, x10]\n"
+    "smlal v16.4s, v30.4h, v0.4h\n"
+    "smlal2 v21.4s, v30.8h, v0.8h\n"
+    "smlal v7.4s, v27.4h, v0.4h\n"
+    "smlal2 v17.4s, v27.8h, v0.8h\n"
+    "smlal v8.4s, v23.4h, v0.4h\n"
+    "smlal2 v5.4s, v23.8h, v0.8h\n"
+    "ldr d0, [x3, #0xa0]\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "usubl v31.8h, v31.8b, v9.8b\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "smlal2 v18.4s, v30.8h, v1.8h\n"
+    "ldr d30, [x28, x10]\n"
+    "smlal v16.4s, v26.4h, v1.4h\n"
+    "smlal2 v21.4s, v26.8h, v1.8h\n"
+    "smlal v7.4s, v23.4h, v1.4h\n"
+    "smlal2 v17.4s, v23.8h, v1.8h\n"
+    "smlal v8.4s, v31.4h, v1.4h\n"
+    "smlal2 v5.4s, v31.8h, v1.8h\n"
+    "ldr d1, [x3, #0xa8]\n"
+    "smlal v15.4s, v26.4h, v2.4h\n"
+    "smlal2 v18.4s, v26.8h, v2.8h\n"
+    "ldr d26, [x7, x10]\n"
+    "smlal v16.4s, v25.4h, v2.4h\n"
+    "smlal2 v21.4s, v25.8h, v2.8h\n"
+    "smlal v7.4s, v31.4h, v2.4h\n"
+    "smlal2 v17.4s, v31.8h, v2.8h\n"
+    "usubl v30.8h, v30.8b, v9.8b\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "usubl v28.8h, v28.8b, v9.8b\n"
+    "smlal v8.4s, v30.4h, v2.4h\n"
+    "smlal2 v5.4s, v30.8h, v2.8h\n"
+    "ldr d2, [x3, #0xb0]\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "smlal2 v18.4s, v25.8h, v3.8h\n"
+    "ldr d25, [x26, x10]\n"
+    "smlal v16.4s, v24.4h, v3.4h\n"
+    "smlal2 v21.4s, v24.8h, v3.8h\n"
+    "smlal v7.4s, v30.4h, v3.4h\n"
+    "smlal2 v17.4s, v30.8h, v3.8h\n"
+    "smlal v8.4s, v28.4h, v3.4h\n"
+    "smlal2 v5.4s, v28.8h, v3.8h\n"
+    "ldr d3, [x3, #0xb8]\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "usubl v26.8h, v26.8b, v9.8b\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v18.4s, v24.8h, v4.8h\n"
+    "ldr d24, [x23, x10]\n"
+    "smlal v16.4s, v22.4h, v4.4h\n"
+    "smlal2 v21.4s, v22.8h, v4.8h\n"
+    "smlal v7.4s, v28.4h, v4.4h\n"
+    "smlal2 v17.4s, v28.8h, v4.8h\n"
+    "smlal v8.4s, v26.4h, v4.4h\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "ldr d4, [x3, #0xc0]\n"
+    "add x3, x3, #0xc8\n"
+    "smlal v15.4s, v27.4h, v0.4h\n"
+    "smlal2 v18.4s, v27.8h, v0.8h\n"
+    "ldr d27, [x22, x10]\n"
+    "smlal v16.4s, v23.4h, v0.4h\n"
+    "smlal2 v21.4s, v23.8h, v0.8h\n"
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "smlal v7.4s, v25.4h, v0.4h\n"
+    "smlal2 v17.4s, v25.8h, v0.8h\n"
+    "ldr d25, [x20, x10]\n"
+    "smlal v8.4s, v24.4h, v0.4h\n"
+    "smlal2 v5.4s, v24.8h, v0.8h\n"
+    "smlal v15.4s, v23.4h, v1.4h\n"
+    "smlal2 v18.4s, v23.8h, v1.8h\n"
+    "smlal v16.4s, v31.4h, v1.4h\n"
+    "smlal2 v21.4s, v31.8h, v1.8h\n"
+    "smlal v7.4s, v24.4h, v1.4h\n"
+    "smlal2 v17.4s, v24.8h, v1.8h\n"
+    "ldr d24, [x13, x10]\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "smlal v8.4s, v27.4h, v1.4h\n"
+    "smlal2 v5.4s, v27.8h, v1.8h\n"
+    "smlal v15.4s, v31.4h, v2.4h\n"
+    "smlal2 v18.4s, v31.8h, v2.8h\n"
+    "smlal v16.4s, v30.4h, v2.4h\n"
+    "smlal2 v21.4s, v30.8h, v2.8h\n"
+    "smlal v7.4s, v27.4h, v2.4h\n"
+    "smlal2 v17.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x21, x10]\n"
+    "add x10, x10, #0x8\n"
+    "smlal v8.4s, v25.4h, v2.4h\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "smlal v15.4s, v30.4h, v3.4h\n"
+    "smlal2 v18.4s, v30.8h, v3.8h\n"
+    "smlal v16.4s, v28.4h, v3.4h\n"
+    "smlal2 v21.4s, v28.8h, v3.8h\n"
+    "smlal v7.4s, v25.4h, v3.4h\n"
+    "smlal2 v17.4s, v25.8h, v3.8h\n"
+    "smlal v8.4s, v24.4h, v3.4h\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "smlal v15.4s, v28.4h, v4.4h\n"
+    "smlal2 v18.4s, v28.8h, v4.8h\n"
+    "smlal v16.4s, v26.4h, v4.4h\n"
+    "smlal2 v21.4s, v26.8h, v4.8h\n"
+    "smlal v7.4s, v24.4h, v4.4h\n"
+    "smlal2 v17.4s, v24.8h, v4.8h\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+    "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+    "smlal v8.4s, v27.4h, v4.4h\n"
+    "smlal2 v5.4s, v27.8h, v4.8h\n"
+    "and v28.16b, v15.16b, v19.16b\n"
+    "and v26.16b, v18.16b, v12.16b\n"
+    "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+    "sqadd v15.4s, v15.4s, v28.4s\n"
+    "sqadd v18.4s, v18.4s, v26.4s\n"
+    "and v29.16b, v16.16b, v19.16b\n"
+    "and v4.16b, v21.16b, v12.16b\n"
+    "srshl v15.4s, v15.4s, v19.4s\n"
+    "srshl v18.4s, v18.4s, v12.4s\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "sshr v4.4s, v4.4s, #0x1f\n"
+    "add v15.4s, v15.4s, v10.4s\n"
+    "add v18.4s, v18.4s, v10.4s\n"
+    "sqadd v16.4s, v16.4s, v29.4s\n"
+    "smin v15.4s, v15.4s, v13.4s\n"
+    "smin v18.4s, v18.4s, v13.4s\n"
+    "sqadd v21.4s, v21.4s, v4.4s\n"
+    "smax v15.4s, v15.4s, v11.4s\n"
+    "smax v18.4s, v18.4s, v11.4s\n"
+    "srshl v16.4s, v16.4s, v19.4s\n"
+    "srshl v21.4s, v21.4s, v12.4s\n"
+    "uzp1 v15.16b, v15.16b, v18.16b\n"
+    "sqrdmulh v7.4s, v7.4s, v6.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "str d15, [x17, x1]\n"
+    "add v16.4s, v16.4s, v10.4s\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "and v25.16b, v7.16b, v19.16b\n"
+    "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+    "smin v16.4s, v16.4s, v13.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "sshr v25.4s, v25.4s, #0x1f\n"
+    "smax v16.4s, v16.4s, v11.4s\n"
+    "smax v21.4s, v21.4s, v11.4s\n"
+    "sqadd v7.4s, v7.4s, v25.4s\n"
+    "and v31.16b, v17.16b, v12.16b\n"
+    "uzp1 v16.16b, v16.16b, v21.16b\n"
+    "sqrdmulh v8.4s, v8.4s, v6.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "str d16, [x16, x1]\n"
+    "srshl v7.4s, v7.4s, v19.4s\n"
+    "sshr v31.4s, v31.4s, #0x1f\n"
+    "and v24.16b, v8.16b, v19.16b\n"
+    "sqrdmulh v5.4s, v5.4s, v20.4s\n"
+    "sqadd v17.4s, v17.4s, v31.4s\n"
+    "add v7.4s, v7.4s, v10.4s\n"
+    "sshr v24.4s, v24.4s, #0x1f\n"
+    "and v1.16b, v5.16b, v12.16b\n"
+    "smin v7.4s, v7.4s, v13.4s\n"
+    "srshl v17.4s, v17.4s, v12.4s\n"
+    "sqadd v8.4s, v8.4s, v24.4s\n"
+    "smax v7.4s, v7.4s, v11.4s\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "add v17.4s, v17.4s, v10.4s\n"
+    "srshl v8.4s, v8.4s, v19.4s\n"
+    "sqadd v5.4s, v5.4s, v1.4s\n"
+    "smin v17.4s, v17.4s, v13.4s\n"
+    "add v8.4s, v8.4s, v10.4s\n"
+    "smax v17.4s, v17.4s, v11.4s\n"
+    "srshl v5.4s, v5.4s, v12.4s\n"
+    "smin v8.4s, v8.4s, v13.4s\n"
+    "uzp1 v7.16b, v7.16b, v17.16b\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "uzp1 v7.16b, v7.16b, v7.16b\n"
+    "str d7, [x6, x1]\n"
+    "smax v8.4s, v8.4s, v11.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "smax v5.4s, v5.4s, v11.4s\n"
+    "uzp1 v8.16b, v8.16b, v5.16b\n"
+    "uzp1 v8.16b, v8.16b, v8.16b\n"
+    "str d8, [x8, x1]\n"
+    "add x1, x1, #0x8\n"
+    "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q15, [x12, #0x0]\n"
+    "mov v16.16b, v15.16b\n"
+    "ldr q18, [x12, #0x10]\n"
+    "add x12, x12, #0x20\n"
+    "mov v7.16b, v15.16b\n"
+    "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "mov v8.16b, v15.16b\n"
+    "ldr d0, [x3, #0x0]\n"
+    "ldr d1, [x3, #0x8]\n"
+    "mov v21.16b, v18.16b\n"
+    "ldr d2, [x3, #0x10]\n"
+    "mov v17.16b, v18.16b\n"
+    "ldr d3, [x3, #0x18]\n"
+    "mov v5.16b, v18.16b\n"
+    "ldr d4, [x3, #0x20]\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "ldp x28, x27, [x25, #0x0]\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "ldp x26, x13, [x25, #0x10]\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "ldp x24, x23, [x25, #0x20]\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "ldp x22, x21, [x25, #0x30]\n"
+    "ldp x20, x0, [x25, #0x40]\n"
+    "ldr d31, [x28, x10]\n"
+    "usubl v31.8h, v31.8b, v9.8b\n"
+    "ldr d30, [x27, x10]\n"
+    "ldr d29, [x26, x10]\n"
+    "usubl v30.8h, v30.8b, v9.8b\n"
+    "ldr d28, [x13, x10]\n"
+    "usubl v29.8h, v29.8b, v9.8b\n"
+    "ldr d27, [x24, x10]\n"
+    "ldr d23, [x23, x10]\n"
+    "usubl v28.8h, v28.8b, v9.8b\n"
+    "ldr d25, [x22, x10]\n"
+    "ldr d24, [x21, x10]\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "usubl v23.8h, v23.8b, v9.8b\n"
+    "ldr d26, [x20, x10]\n"
+    "ldr d22, [x0, x10]\n"
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "usubl v26.8h, v26.8b, v9.8b\n"
+    "usubl v22.8h, v22.8b, v9.8b\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "ldr x20, [x25, #0x50]\n"
+    "tst x4, #0x7\n"
+    "smlal2 v18.4s, v31.8h, v0.8h\n"
+    "ldr x28, [x25, #0x58]\n"
+    "smlal v16.4s, v30.4h, v0.4h\n"
+    "ldr x0, [x25, #0x60]\n"
+    "smlal2 v21.4s, v30.8h, v0.8h\n"
+    "ldr d31, [x20, x10]\n"
+    "smlal v7.4s, v29.4h, v0.4h\n"
+    "ldr x7, [x25, #0x68]\n"
+    "smlal2 v17.4s, v29.8h, v0.8h\n"
+    "ldr x26, [x25, #0x70]\n"
+    "smlal v8.4s, v28.4h, v0.4h\n"
+    "ldr x23, [x25, #0x78]\n"
+    "smlal2 v5.4s, v28.8h, v0.8h\n"
+    "ldr d0, [x3, #0x28]\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "ldr x20, [x25, #0x80]\n"
+    "smlal2 v18.4s, v30.8h, v1.8h\n"
+    "ldr d30, [x28, x10]\n"
+    "smlal v16.4s, v27.4h, v1.4h\n"
+    "ldr x22, [x25, #0x88]\n"
+    "smlal2 v21.4s, v27.8h, v1.8h\n"
+    "ldr x13, [x25, #0x90]\n"
+    "smlal v7.4s, v28.4h, v1.4h\n"
+    "ldr x21, [x25, #0x98]\n"
+    "smlal2 v17.4s, v28.8h, v1.8h\n"
+    "ldr x14, [x25, #0xa0]\n"
+    "smlal v8.4s, v23.4h, v1.4h\n"
+    "ldr x11, [x25, #0xa8]\n"
+    "smlal2 v5.4s, v23.8h, v1.8h\n"
+    "ldr d1, [x3, #0x30]\n"
+    "smlal v15.4s, v27.4h, v2.4h\n"
+    "ldr x24, [x25, #0xb0]\n"
+    "smlal2 v18.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x0, x10]\n"
+    "smlal v16.4s, v25.4h, v2.4h\n"
+    "ldr x0, [x25, #0xb8]\n"
+    "smlal2 v21.4s, v25.8h, v2.8h\n"
+    "ldr x15, [x25, #0xc0]\n"
+    "smlal v7.4s, v23.4h, v2.4h\n"
+    "ldr x9, [x25, #0xc8]\n"
+    "smlal2 v17.4s, v23.8h, v2.8h\n"
+    "ldr x27, [x25, #0xd0]\n"
+    "usubl v31.8h, v31.8b, v9.8b\n"
+    "ldr x28, [x25, #0xd8]\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "ldr x12, [x25, #0xe0]\n"
+    "smlal2 v18.4s, v25.8h, v3.8h\n"
+    "ldr d25, [x7, x10]\n"
+    "smlal v8.4s, v31.4h, v2.4h\n"
+    "ldr x7, [x25, #0xe8]\n"
+    "smlal2 v5.4s, v31.8h, v2.8h\n"
+    "ldr d2, [x3, #0x38]\n"
+    "smlal v16.4s, v24.4h, v3.4h\n"
+    "ldr q6, [x2, #0x0]\n"
+    "smlal2 v21.4s, v24.8h, v3.8h\n"
+    "ldr q19, [x5, #0x0]\n"
+    "smlal v7.4s, v31.4h, v3.4h\n"
+    "ldr q20, [x2, #0x10]\n"
+    "add x2, x2, #0x20\n"
+    "smlal2 v17.4s, v31.8h, v3.8h\n"
+    "ldr q12, [x5, #0x10]\n"
+    "add x5, x5, #0x20\n"
+    "usubl v30.8h, v30.8b, v9.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v18.4s, v24.8h, v4.8h\n"
+    "ldr d24, [x26, x10]\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "ldr x26, [x25, #0xf0]\n"
+    "smlal v8.4s, v30.4h, v3.4h\n"
+    "smlal2 v5.4s, v30.8h, v3.8h\n"
+    "ldr d3, [x3, #0x40]\n"
+    "smlal v16.4s, v27.4h, v4.4h\n"
+    "smlal2 v21.4s, v27.8h, v4.8h\n"
+    "ldr d27, [x23, x10]\n"
+    "smlal v7.4s, v30.4h, v4.4h\n"
+    "ldr x23, [x25, #0xf8]\n"
+    "smlal2 v17.4s, v30.8h, v4.8h\n"
+    "smlal v8.4s, v26.4h, v4.4h\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "ldr d4, [x3, #0x48]\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "smlal v15.4s, v29.4h, v0.4h\n"
+    "smlal2 v18.4s, v29.8h, v0.8h\n"
+    "smlal v16.4s, v28.4h, v0.4h\n"
+    "smlal2 v21.4s, v28.8h, v0.8h\n"
+    "smlal v7.4s, v22.4h, v0.4h\n"
+    "smlal2 v17.4s, v22.8h, v0.8h\n"
+    "smlal v8.4s, v25.4h, v0.4h\n"
+    "smlal2 v5.4s, v25.8h, v0.8h\n"
+    "ldr d0, [x3, #0x50]\n"
+    "smlal v15.4s, v28.4h, v1.4h\n"
+    "smlal2 v18.4s, v28.8h, v1.8h\n"
+    "ldr d28, [x22, x10]\n"
+    "smlal v16.4s, v23.4h, v1.4h\n"
+    "ldr x22, [x25, #0x100]\n"
+    "smlal2 v21.4s, v23.8h, v1.8h\n"
+    "smlal v7.4s, v25.4h, v1.4h\n"
+    "smlal2 v17.4s, v25.8h, v1.8h\n"
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "smlal v8.4s, v24.4h, v1.4h\n"
+    "smlal2 v5.4s, v24.8h, v1.8h\n"
+    "ldr d1, [x3, #0x58]\n"
+    "smlal v15.4s, v23.4h, v2.4h\n"
+    "smlal2 v18.4s, v23.8h, v2.8h\n"
+    "ldr d23, [x20, x10]\n"
+    "smlal v16.4s, v31.4h, v2.4h\n"
+    "ldr x20, [x25, #0x108]\n"
+    "smlal2 v21.4s, v31.8h, v2.8h\n"
+    "smlal v7.4s, v24.4h, v2.4h\n"
+    "smlal2 v17.4s, v24.8h, v2.8h\n"
+    "smlal v8.4s, v27.4h, v2.4h\n"
+    "smlal2 v5.4s, v27.8h, v2.8h\n"
+    "ldr d2, [x3, #0x60]\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "usubl v23.8h, v23.8b, v9.8b\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "smlal v15.4s, v31.4h, v3.4h\n"
+    "smlal2 v18.4s, v31.8h, v3.8h\n"
+    "ldr d31, [x13, x10]\n"
+    "smlal v16.4s, v30.4h, v3.4h\n"
+    "ldr x13, [x25, #0x110]\n"
+    "smlal2 v21.4s, v30.8h, v3.8h\n"
+    "smlal v7.4s, v27.4h, v3.4h\n"
+    "smlal2 v17.4s, v27.8h, v3.8h\n"
+    "smlal v8.4s, v23.4h, v3.4h\n"
+    "smlal2 v5.4s, v23.8h, v3.8h\n"
+    "ldr d3, [x3, #0x68]\n"
+    "smlal v15.4s, v30.4h, v4.4h\n"
+    "smlal2 v18.4s, v30.8h, v4.8h\n"
+    "ldr d30, [x21, x10]\n"
+    "smlal v16.4s, v26.4h, v4.4h\n"
+    "ldr x21, [x25, #0x118]\n"
+    "smlal2 v21.4s, v26.8h, v4.8h\n"
+    "ldr d26, [x14, x10]\n"
+    "smlal v7.4s, v23.4h, v4.4h\n"
+    "smlal2 v17.4s, v23.8h, v4.8h\n"
+    "usubl v28.8h, v28.8b, v9.8b\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "usubl v31.8h, v31.8b, v9.8b\n"
+    "smlal v8.4s, v28.4h, v4.4h\n"
+    "smlal2 v5.4s, v28.8h, v4.8h\n"
+    "ldr d4, [x3, #0x70]\n"
+    "smlal v15.4s, v22.4h, v0.4h\n"
+    "smlal2 v18.4s, v22.8h, v0.8h\n"
+    "ldr d22, [x0, x10]\n"
+    "smlal v16.4s, v25.4h, v0.4h\n"
+    "smlal2 v21.4s, v25.8h, v0.8h\n"
+    "smlal v7.4s, v31.4h, v0.4h\n"
+    "smlal2 v17.4s, v31.8h, v0.8h\n"
+    "usubl v30.8h, v30.8b, v9.8b\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "usubl v26.8h, v26.8b, v9.8b\n"
+    "smlal v8.4s, v30.4h, v0.4h\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "ldr d0, [x3, #0x78]\n"
+    "smlal v15.4s, v25.4h, v1.4h\n"
+    "smlal2 v18.4s, v25.8h, v1.8h\n"
+    "ldr d25, [x11, x10]\n"
+    "smlal v16.4s, v24.4h, v1.4h\n"
+    "smlal2 v21.4s, v24.8h, v1.8h\n"
+    "smlal v7.4s, v30.4h, v1.4h\n"
+    "smlal2 v17.4s, v30.8h, v1.8h\n"
+    "smlal v8.4s, v26.4h, v1.4h\n"
+    "smlal2 v5.4s, v26.8h, v1.8h\n"
+    "ldr d1, [x3, #0x80]\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "smlal2 v18.4s, v24.8h, v2.8h\n"
+    "ldr d24, [x24, x10]\n"
+    "smlal v16.4s, v27.4h, v2.4h\n"
+    "smlal2 v21.4s, v27.8h, v2.8h\n"
+    "smlal v7.4s, v26.4h, v2.4h\n"
+    "smlal2 v17.4s, v26.8h, v2.8h\n"
+    "smlal v8.4s, v25.4h, v2.4h\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "ldr d2, [x3, #0x88]\n"
+    "smlal v15.4s, v27.4h, v3.4h\n"
+    "smlal2 v18.4s, v27.8h, v3.8h\n"
+    "ldr d27, [x15, x10]\n"
+    "smlal v16.4s, v23.4h, v3.4h\n"
+    "smlal2 v21.4s, v23.8h, v3.8h\n"
+    "smlal v7.4s, v25.4h, v3.4h\n"
+    "smlal2 v17.4s, v25.8h, v3.8h\n"
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "usubl v22.8h, v22.8b, v9.8b\n"
+    "smlal v8.4s, v24.4h, v3.4h\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "ldr d3, [x3, #0x90]\n"
+    "smlal v15.4s, v23.4h, v4.4h\n"
+    "smlal2 v18.4s, v23.8h, v4.8h\n"
+    "ldr d23, [x9, x10]\n"
+    "smlal v16.4s, v28.4h, v4.4h\n"
+    "smlal2 v21.4s, v28.8h, v4.8h\n"
+    "ldr d28, [x12, x10]\n"
+    "smlal v7.4s, v24.4h, v4.4h\n"
+    "smlal2 v17.4s, v24.8h, v4.8h\n"
+    "smlal v8.4s, v22.4h, v4.4h\n"
+    "smlal2 v5.4s, v22.8h, v4.8h\n"
+    "ldr d4, [x3, #0x98]\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "usubl v23.8h, v23.8b, v9.8b\n"
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "smlal2 v18.4s, v31.8h, v0.8h\n"
+    "ldr d31, [x27, x10]\n"
+    "smlal v16.4s, v30.4h, v0.4h\n"
+    "smlal2 v21.4s, v30.8h, v0.8h\n"
+    "smlal v7.4s, v27.4h, v0.4h\n"
+    "smlal2 v17.4s, v27.8h, v0.8h\n"
+    "smlal v8.4s, v23.4h, v0.4h\n"
+    "smlal2 v5.4s, v23.8h, v0.8h\n"
+    "ldr d0, [x3, #0xa0]\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "usubl v31.8h, v31.8b, v9.8b\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "smlal2 v18.4s, v30.8h, v1.8h\n"
+    "ldr d30, [x28, x10]\n"
+    "smlal v16.4s, v26.4h, v1.4h\n"
+    "smlal2 v21.4s, v26.8h, v1.8h\n"
+    "smlal v7.4s, v23.4h, v1.4h\n"
+    "smlal2 v17.4s, v23.8h, v1.8h\n"
+    "smlal v8.4s, v31.4h, v1.4h\n"
+    "smlal2 v5.4s, v31.8h, v1.8h\n"
+    "ldr d1, [x3, #0xa8]\n"
+    "smlal v15.4s, v26.4h, v2.4h\n"
+    "smlal2 v18.4s, v26.8h, v2.8h\n"
+    "ldr d26, [x7, x10]\n"
+    "smlal v16.4s, v25.4h, v2.4h\n"
+    "smlal2 v21.4s, v25.8h, v2.8h\n"
+    "smlal v7.4s, v31.4h, v2.4h\n"
+    "smlal2 v17.4s, v31.8h, v2.8h\n"
+    "usubl v30.8h, v30.8b, v9.8b\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "usubl v28.8h, v28.8b, v9.8b\n"
+    "smlal v8.4s, v30.4h, v2.4h\n"
+    "smlal2 v5.4s, v30.8h, v2.8h\n"
+    "ldr d2, [x3, #0xb0]\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "smlal2 v18.4s, v25.8h, v3.8h\n"
+    "ldr d25, [x26, x10]\n"
+    "smlal v16.4s, v24.4h, v3.4h\n"
+    "smlal2 v21.4s, v24.8h, v3.8h\n"
+    "smlal v7.4s, v30.4h, v3.4h\n"
+    "smlal2 v17.4s, v30.8h, v3.8h\n"
+    "smlal v8.4s, v28.4h, v3.4h\n"
+    "smlal2 v5.4s, v28.8h, v3.8h\n"
+    "ldr d3, [x3, #0xb8]\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "usubl v26.8h, v26.8b, v9.8b\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v18.4s, v24.8h, v4.8h\n"
+    "ldr d24, [x23, x10]\n"
+    "smlal v16.4s, v22.4h, v4.4h\n"
+    "smlal2 v21.4s, v22.8h, v4.8h\n"
+    "smlal v7.4s, v28.4h, v4.4h\n"
+    "smlal2 v17.4s, v28.8h, v4.8h\n"
+    "smlal v8.4s, v26.4h, v4.4h\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "ldr d4, [x3, #0xc0]\n"
+    "smlal v15.4s, v27.4h, v0.4h\n"
+    "smlal2 v18.4s, v27.8h, v0.8h\n"
+    "ldr d27, [x22, x10]\n"
+    "smlal v16.4s, v23.4h, v0.4h\n"
+    "smlal2 v21.4s, v23.8h, v0.8h\n"
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "smlal v7.4s, v25.4h, v0.4h\n"
+    "smlal2 v17.4s, v25.8h, v0.8h\n"
+    "ldr d25, [x20, x10]\n"
+    "smlal v8.4s, v24.4h, v0.4h\n"
+    "smlal2 v5.4s, v24.8h, v0.8h\n"
+    "smlal v15.4s, v23.4h, v1.4h\n"
+    "smlal2 v18.4s, v23.8h, v1.8h\n"
+    "smlal v16.4s, v31.4h, v1.4h\n"
+    "smlal2 v21.4s, v31.8h, v1.8h\n"
+    "smlal v7.4s, v24.4h, v1.4h\n"
+    "smlal2 v17.4s, v24.8h, v1.8h\n"
+    "ldr d24, [x13, x10]\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "smlal v8.4s, v27.4h, v1.4h\n"
+    "smlal2 v5.4s, v27.8h, v1.8h\n"
+    "smlal v15.4s, v31.4h, v2.4h\n"
+    "smlal2 v18.4s, v31.8h, v2.8h\n"
+    "smlal v16.4s, v30.4h, v2.4h\n"
+    "smlal2 v21.4s, v30.8h, v2.8h\n"
+    "smlal v7.4s, v27.4h, v2.4h\n"
+    "smlal2 v17.4s, v27.8h, v2.8h\n"
+    "ldr d27, [x21, x10]\n"
+    "add x10, x10, #0x8\n"
+    "smlal v8.4s, v25.4h, v2.4h\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "smlal v15.4s, v30.4h, v3.4h\n"
+    "smlal2 v18.4s, v30.8h, v3.8h\n"
+    "smlal v16.4s, v28.4h, v3.4h\n"
+    "smlal2 v21.4s, v28.8h, v3.8h\n"
+    "smlal v7.4s, v25.4h, v3.4h\n"
+    "smlal2 v17.4s, v25.8h, v3.8h\n"
+    "smlal v8.4s, v24.4h, v3.4h\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "smlal v15.4s, v28.4h, v4.4h\n"
+    "smlal2 v18.4s, v28.8h, v4.8h\n"
+    "smlal v16.4s, v26.4h, v4.4h\n"
+    "smlal2 v21.4s, v26.8h, v4.8h\n"
+    "smlal v7.4s, v24.4h, v4.4h\n"
+    "smlal2 v17.4s, v24.8h, v4.8h\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+    "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+    "smlal v8.4s, v27.4h, v4.4h\n"
+    "smlal2 v5.4s, v27.8h, v4.8h\n"
+    "and v28.16b, v15.16b, v19.16b\n"
+    "and v26.16b, v18.16b, v12.16b\n"
+    "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+    "sqadd v15.4s, v15.4s, v28.4s\n"
+    "sqadd v18.4s, v18.4s, v26.4s\n"
+    "and v29.16b, v16.16b, v19.16b\n"
+    "and v4.16b, v21.16b, v12.16b\n"
+    "srshl v15.4s, v15.4s, v19.4s\n"
+    "srshl v18.4s, v18.4s, v12.4s\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "sshr v4.4s, v4.4s, #0x1f\n"
+    "add v15.4s, v15.4s, v10.4s\n"
+    "add v18.4s, v18.4s, v10.4s\n"
+    "sqadd v16.4s, v16.4s, v29.4s\n"
+    "smin v15.4s, v15.4s, v13.4s\n"
+    "smin v18.4s, v18.4s, v13.4s\n"
+    "sqadd v21.4s, v21.4s, v4.4s\n"
+    "smax v15.4s, v15.4s, v11.4s\n"
+    "smax v18.4s, v18.4s, v11.4s\n"
+    "srshl v16.4s, v16.4s, v19.4s\n"
+    "srshl v21.4s, v21.4s, v12.4s\n"
+    "uzp1 v15.16b, v15.16b, v18.16b\n"
+    "sqrdmulh v7.4s, v7.4s, v6.4s\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "str d15, [x17, x1]\n"
+    "add v16.4s, v16.4s, v10.4s\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "and v25.16b, v7.16b, v19.16b\n"
+    "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+    "smin v16.4s, v16.4s, v13.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "sshr v25.4s, v25.4s, #0x1f\n"
+    "smax v16.4s, v16.4s, v11.4s\n"
+    "smax v21.4s, v21.4s, v11.4s\n"
+    "sqadd v7.4s, v7.4s, v25.4s\n"
+    "and v31.16b, v17.16b, v12.16b\n"
+    "uzp1 v16.16b, v16.16b, v21.16b\n"
+    "sqrdmulh v8.4s, v8.4s, v6.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "str d16, [x16, x1]\n"
+    "srshl v7.4s, v7.4s, v19.4s\n"
+    "sshr v31.4s, v31.4s, #0x1f\n"
+    "and v24.16b, v8.16b, v19.16b\n"
+    "sqrdmulh v5.4s, v5.4s, v20.4s\n"
+    "sqadd v17.4s, v17.4s, v31.4s\n"
+    "add v7.4s, v7.4s, v10.4s\n"
+    "sshr v24.4s, v24.4s, #0x1f\n"
+    "and v1.16b, v5.16b, v12.16b\n"
+    "smin v7.4s, v7.4s, v13.4s\n"
+    "srshl v17.4s, v17.4s, v12.4s\n"
+    "sqadd v8.4s, v8.4s, v24.4s\n"
+    "smax v7.4s, v7.4s, v11.4s\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "add v17.4s, v17.4s, v10.4s\n"
+    "srshl v8.4s, v8.4s, v19.4s\n"
+    "sqadd v5.4s, v5.4s, v1.4s\n"
+    "smin v17.4s, v17.4s, v13.4s\n"
+    "add v8.4s, v8.4s, v10.4s\n"
+    "smax v17.4s, v17.4s, v11.4s\n"
+    "srshl v5.4s, v5.4s, v12.4s\n"
+    "smin v8.4s, v8.4s, v13.4s\n"
+    "uzp1 v7.16b, v7.16b, v17.16b\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "uzp1 v7.16b, v7.16b, v7.16b\n"
+    "str d7, [x6, x1]\n"
+    "smax v8.4s, v8.4s, v11.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "smax v5.4s, v5.4s, v11.4s\n"
+    "uzp1 v8.16b, v8.16b, v5.16b\n"
+    "uzp1 v8.16b, v8.16b, v8.16b\n"
+    "str d8, [x8, x1]\n"
+    "add x1, x1, #0x8\n"
+    "beq 124f\n"
+    "add x3, x3, #0xc8\n"
+    "3:"  // Oddments
+    "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+    "tbz x4, #2, 5f\n"
+    "ld1 { v15.4s }, [x12], #0x10\n"
+    "tbz x4, #1, 4f\n"
+    "ld1 { v18.d }[0], [x12], #0x8\n"
+    "tbz x4, #0, 7f\n"
+    "ld1 { v18.s }[2], [x12]\n"
+    "b 7f\n"
+    "4:"  // Oddments: Load bias: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 7f\n"
+    "ld1 { v18.s }[0], [x12]\n"
+    "b 7f\n"
+    "5:"  // Oddments: Load bias: Bit 2: Unset
+    "tbz x4, #1, 6f\n"
+    "ld1 { v15.d }[0], [x12], #0x8\n"
+    "tbz x4, #0, 7f\n"
+    "ld1 { v15.s }[2], [x12]\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 7f\n"
+    "ld1 { v15.s }[0], [x12]\n"
+    "7:"  // Oddments: Load bias: Bit 2: End
+    "mov v16.16b, v15.16b\n"
+    "ldr d0, [x3, #0x0]\n"
+    "mov v21.16b, v18.16b\n"
+    "ldr d1, [x3, #0x8]\n"
+    "mov v7.16b, v15.16b\n"
+    "ldr d2, [x3, #0x10]\n"
+    "mov v17.16b, v18.16b\n"
+    "ldr d3, [x3, #0x18]\n"
+    "mov v8.16b, v15.16b\n"
+    "ldr d4, [x3, #0x20]\n"
+    "mov v5.16b, v18.16b\n"
+    "ldp x28, x27, [x25, #0x0]\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "ldp x26, x13, [x25, #0x10]\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "ldp x24, x23, [x25, #0x20]\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "ldp x22, x21, [x25, #0x30]\n"
+    "ldp x20, x0, [x25, #0x40]\n"
+    "add x28, x28, x10\n"
+    "add x27, x27, x10\n"
+    "add x26, x26, x10\n"
+    "add x13, x13, x10\n"
+    "add x24, x24, x10\n"
+    "add x23, x23, x10\n"
+    "add x22, x22, x10\n"
+    "add x21, x21, x10\n"
+    "add x20, x20, x10\n"
+    "add x0, x0, x10\n"
+    "tbz x4, #2, 9f\n"
+    "ld1 { v31.s }[0], [x28], #0x4\n"
+    "ld1 { v30.s }[0], [x27], #0x4\n"
+    "ld1 { v29.s }[0], [x26], #0x4\n"
+    "ld1 { v28.s }[0], [x13], #0x4\n"
+    "ld1 { v27.s }[0], [x24], #0x4\n"
+    "ld1 { v23.s }[0], [x23], #0x4\n"
+    "ld1 { v25.s }[0], [x22], #0x4\n"
+    "ld1 { v24.s }[0], [x21], #0x4\n"
+    "ld1 { v26.s }[0], [x20], #0x4\n"
+    "ld1 { v22.s }[0], [x0], #0x4\n"
+    "tbz x4, #1, 8f\n"
+    "ld1 { v31.h }[2], [x28], #0x2\n"
+    "ld1 { v30.h }[2], [x27], #0x2\n"
+    "ld1 { v29.h }[2], [x26], #0x2\n"
+    "ld1 { v28.h }[2], [x13], #0x2\n"
+    "ld1 { v27.h }[2], [x24], #0x2\n"
+    "ld1 { v23.h }[2], [x23], #0x2\n"
+    "ld1 { v25.h }[2], [x22], #0x2\n"
+    "ld1 { v24.h }[2], [x21], #0x2\n"
+    "ld1 { v26.h }[2], [x20], #0x2\n"
+    "ld1 { v22.h }[2], [x0], #0x2\n"
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[6], [x28]\n"
+    "ld1 { v30.b }[6], [x27]\n"
+    "ld1 { v29.b }[6], [x26]\n"
+    "ld1 { v28.b }[6], [x13]\n"
+    "ld1 { v27.b }[6], [x24]\n"
+    "ld1 { v23.b }[6], [x23]\n"
+    "ld1 { v25.b }[6], [x22]\n"
+    "ld1 { v24.b }[6], [x21]\n"
+    "ld1 { v26.b }[6], [x20]\n"
+    "ld1 { v22.b }[6], [x0]\n"
+    "b 11f\n"
+    "8:"  // Oddments: Initial loads: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[4], [x28]\n"
+    "ld1 { v30.b }[4], [x27]\n"
+    "ld1 { v29.b }[4], [x26]\n"
+    "ld1 { v28.b }[4], [x13]\n"
+    "ld1 { v27.b }[4], [x24]\n"
+    "ld1 { v23.b }[4], [x23]\n"
+    "ld1 { v25.b }[4], [x22]\n"
+    "ld1 { v24.b }[4], [x21]\n"
+    "ld1 { v26.b }[4], [x20]\n"
+    "ld1 { v22.b }[4], [x0]\n"
+    "b 11f\n"
+    "9:"  // Oddments: Initial loads: Bit 2: Unset
+    "tbz x4, #1, 10f\n"
+    "ld1 { v31.h }[0], [x28], #0x2\n"
+    "ld1 { v30.h }[0], [x27], #0x2\n"
+    "ld1 { v29.h }[0], [x26], #0x2\n"
+    "ld1 { v28.h }[0], [x13], #0x2\n"
+    "ld1 { v27.h }[0], [x24], #0x2\n"
+    "ld1 { v23.h }[0], [x23], #0x2\n"
+    "ld1 { v25.h }[0], [x22], #0x2\n"
+    "ld1 { v24.h }[0], [x21], #0x2\n"
+    "ld1 { v26.h }[0], [x20], #0x2\n"
+    "ld1 { v22.h }[0], [x0], #0x2\n"
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[2], [x28]\n"
+    "ld1 { v30.b }[2], [x27]\n"
+    "ld1 { v29.b }[2], [x26]\n"
+    "ld1 { v28.b }[2], [x13]\n"
+    "ld1 { v27.b }[2], [x24]\n"
+    "ld1 { v23.b }[2], [x23]\n"
+    "ld1 { v25.b }[2], [x22]\n"
+    "ld1 { v24.b }[2], [x21]\n"
+    "ld1 { v26.b }[2], [x20]\n"
+    "ld1 { v22.b }[2], [x0]\n"
+    "b 11f\n"
+    "10:"  // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 11f\n"
+    "ld1 { v31.b }[0], [x28]\n"
+    "ld1 { v30.b }[0], [x27]\n"
+    "ld1 { v29.b }[0], [x26]\n"
+    "ld1 { v28.b }[0], [x13]\n"
+    "ld1 { v27.b }[0], [x24]\n"
+    "ld1 { v23.b }[0], [x23]\n"
+    "ld1 { v25.b }[0], [x22]\n"
+    "ld1 { v24.b }[0], [x21]\n"
+    "ld1 { v26.b }[0], [x20]\n"
+    "ld1 { v22.b }[0], [x0]\n"
+    "11:"  // Oddments: Initial loads: Bit 2: End
+    "usubl v31.8h, v31.8b, v9.8b\n"
+    "ldr x20, [x25, #0x50]\n"
+    "add x20, x20, x10\n"
+    "usubl v30.8h, v30.8b, v9.8b\n"
+    "usubl v29.8h, v29.8b, v9.8b\n"
+    "usubl v28.8h, v28.8b, v9.8b\n"
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "usubl v23.8h, v23.8b, v9.8b\n"
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "usubl v26.8h, v26.8b, v9.8b\n"
+    "usubl v22.8h, v22.8b, v9.8b\n"
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "smlal2 v18.4s, v31.8h, v0.8h\n"
+    "smlal v16.4s, v30.4h, v0.4h\n"
+    "smlal2 v21.4s, v30.8h, v0.8h\n"
+    "smlal v7.4s, v29.4h, v0.4h\n"
+    "smlal2 v17.4s, v29.8h, v0.8h\n"
+    "smlal v8.4s, v28.4h, v0.4h\n"
+    "smlal2 v5.4s, v28.8h, v0.8h\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "smlal2 v18.4s, v30.8h, v1.8h\n"
+    "smlal v16.4s, v27.4h, v1.4h\n"
+    "smlal2 v21.4s, v27.8h, v1.8h\n"
+    "smlal v7.4s, v28.4h, v1.4h\n"
+    "smlal2 v17.4s, v28.8h, v1.8h\n"
+    "smlal v8.4s, v23.4h, v1.4h\n"
+    "smlal2 v5.4s, v23.8h, v1.8h\n"
+    "smlal v15.4s, v27.4h, v2.4h\n"
+    "smlal2 v18.4s, v27.8h, v2.8h\n"
+    "smlal v16.4s, v25.4h, v2.4h\n"
+    "smlal2 v21.4s, v25.8h, v2.8h\n"
+    "smlal v7.4s, v23.4h, v2.4h\n"
+    "smlal2 v17.4s, v23.8h, v2.8h\n"
+    "tbz x4, #2, 13f\n"
+    "ld1 { v31.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 12f\n"
+    "ld1 { v31.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 15f\n"
+    "ld1 { v31.b }[6], [x20]\n"
+    "b 15f\n"
+    "12:"  // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 15f\n"
+    "ld1 { v31.b }[4], [x20]\n"
+    "b 15f\n"
+    "13:"  // Oddments: Load (1, 3): Bit 2: Unset
+    "tbz x4, #1, 14f\n"
+    "ld1 { v31.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 15f\n"
+    "ld1 { v31.b }[2], [x20]\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 15f\n"
+    "ld1 { v31.b }[0], [x20]\n"
+    "15:"  // Oddments: Load (1, 3): Bit 2: End
+    "usubl v31.8h, v31.8b, v9.8b\n"
+    "ldr x28, [x25, #0x58]\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "add x28, x28, x10\n"
+    "smlal v8.4s, v31.4h, v2.4h\n"
+    "smlal2 v5.4s, v31.8h, v2.8h\n"
+    "smlal2 v18.4s, v25.8h, v3.8h\n"
+    "smlal v16.4s, v24.4h, v3.4h\n"
+    "smlal2 v21.4s, v24.8h, v3.8h\n"
+    "smlal v7.4s, v31.4h, v3.4h\n"
+    "smlal2 v17.4s, v31.8h, v3.8h\n"
+    "tbz x4, #2, 17f\n"
+    "ld1 { v30.s }[0], [x28], #0x4\n"
+    "tbz x4, #1, 16f\n"
+    "ld1 { v30.h }[2], [x28], #0x2\n"
+    "tbz x4, #0, 19f\n"
+    "ld1 { v30.b }[6], [x28]\n"
+    "b 19f\n"
+    "16:"  // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 19f\n"
+    "ld1 { v30.b }[4], [x28]\n"
+    "b 19f\n"
+    "17:"  // Oddments: Load (1, 4): Bit 2: Unset
+    "tbz x4, #1, 18f\n"
+    "ld1 { v30.h }[0], [x28], #0x2\n"
+    "tbz x4, #0, 19f\n"
+    "ld1 { v30.b }[2], [x28]\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 19f\n"
+    "ld1 { v30.b }[0], [x28]\n"
+    "19:"  // Oddments: Load (1, 4): Bit 2: End
+    "usubl v30.8h, v30.8b, v9.8b\n"
+    "ldr x0, [x25, #0x60]\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "add x0, x0, x10\n"
+    "smlal v8.4s, v30.4h, v3.4h\n"
+    "smlal2 v5.4s, v30.8h, v3.8h\n"
+    "smlal2 v18.4s, v24.8h, v4.8h\n"
+    "tbz x4, #2, 21f\n"
+    "ld1 { v27.s }[0], [x0], #0x4\n"
+    "tbz x4, #1, 20f\n"
+    "ld1 { v27.h }[2], [x0], #0x2\n"
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[6], [x0]\n"
+    "b 23f\n"
+    "20:"  // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[4], [x0]\n"
+    "b 23f\n"
+    "21:"  // Oddments: Load (0, 5): Bit 2: Unset
+    "tbz x4, #1, 22f\n"
+    "ld1 { v27.h }[0], [x0], #0x2\n"
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[2], [x0]\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 23f\n"
+    "ld1 { v27.b }[0], [x0]\n"
+    "23:"  // Oddments: Load (0, 5): Bit 2: End
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "ldr d0, [x3, #0x28]\n"
+    "smlal v7.4s, v30.4h, v4.4h\n"
+    "ldr x7, [x25, #0x68]\n"
+    "add x7, x7, x10\n"
+    "smlal v16.4s, v27.4h, v4.4h\n"
+    "smlal2 v21.4s, v27.8h, v4.8h\n"
+    "smlal2 v17.4s, v30.8h, v4.8h\n"
+    "smlal v8.4s, v26.4h, v4.4h\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "smlal v15.4s, v29.4h, v0.4h\n"
+    "smlal2 v18.4s, v29.8h, v0.8h\n"
+    "smlal v16.4s, v28.4h, v0.4h\n"
+    "smlal2 v21.4s, v28.8h, v0.8h\n"
+    "smlal v7.4s, v22.4h, v0.4h\n"
+    "smlal2 v17.4s, v22.8h, v0.8h\n"
+    "tbz x4, #2, 25f\n"
+    "ld1 { v25.s }[0], [x7], #0x4\n"
+    "tbz x4, #1, 24f\n"
+    "ld1 { v25.h }[2], [x7], #0x2\n"
+    "tbz x4, #0, 27f\n"
+    "ld1 { v25.b }[6], [x7]\n"
+    "b 27f\n"
+    "24:"  // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 27f\n"
+    "ld1 { v25.b }[4], [x7]\n"
+    "b 27f\n"
+    "25:"  // Oddments: Load (2, 1): Bit 2: Unset
+    "tbz x4, #1, 26f\n"
+    "ld1 { v25.h }[0], [x7], #0x2\n"
+    "tbz x4, #0, 27f\n"
+    "ld1 { v25.b }[2], [x7]\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 27f\n"
+    "ld1 { v25.b }[0], [x7]\n"
+    "27:"  // Oddments: Load (2, 1): Bit 2: End
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "ldr d1, [x3, #0x30]\n"
+    "smlal v8.4s, v25.4h, v0.4h\n"
+    "ldr x26, [x25, #0x70]\n"
+    "add x26, x26, x10\n"
+    "smlal2 v5.4s, v25.8h, v0.8h\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "smlal v15.4s, v28.4h, v1.4h\n"
+    "smlal2 v18.4s, v28.8h, v1.8h\n"
+    "smlal v16.4s, v23.4h, v1.4h\n"
+    "smlal2 v21.4s, v23.8h, v1.8h\n"
+    "smlal v7.4s, v25.4h, v1.4h\n"
+    "smlal2 v17.4s, v25.8h, v1.8h\n"
+    "tbz x4, #2, 29f\n"
+    "ld1 { v24.s }[0], [x26], #0x4\n"
+    "tbz x4, #1, 28f\n"
+    "ld1 { v24.h }[2], [x26], #0x2\n"
+    "tbz x4, #0, 31f\n"
+    "ld1 { v24.b }[6], [x26]\n"
+    "b 31f\n"
+    "28:"  // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 31f\n"
+    "ld1 { v24.b }[4], [x26]\n"
+    "b 31f\n"
+    "29:"  // Oddments: Load (2, 2): Bit 2: Unset
+    "tbz x4, #1, 30f\n"
+    "ld1 { v24.h }[0], [x26], #0x2\n"
+    "tbz x4, #0, 31f\n"
+    "ld1 { v24.b }[2], [x26]\n"
+    "b 31f\n"
+    "30:"  // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 31f\n"
+    "ld1 { v24.b }[0], [x26]\n"
+    "31:"  // Oddments: Load (2, 2): Bit 2: End
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "ldr d2, [x3, #0x38]\n"
+    "smlal v8.4s, v24.4h, v1.4h\n"
+    "ldr x23, [x25, #0x78]\n"
+    "add x23, x23, x10\n"
+    "smlal2 v5.4s, v24.8h, v1.8h\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "smlal v15.4s, v23.4h, v2.4h\n"
+    "smlal2 v18.4s, v23.8h, v2.8h\n"
+    "smlal v16.4s, v31.4h, v2.4h\n"
+    "smlal2 v21.4s, v31.8h, v2.8h\n"
+    "smlal v7.4s, v24.4h, v2.4h\n"
+    "smlal2 v17.4s, v24.8h, v2.8h\n"
+    "tbz x4, #2, 33f\n"
+    "ld1 { v27.s }[0], [x23], #0x4\n"
+    "tbz x4, #1, 32f\n"
+    "ld1 { v27.h }[2], [x23], #0x2\n"
+    "tbz x4, #0, 35f\n"
+    "ld1 { v27.b }[6], [x23]\n"
+    "b 35f\n"
+    "32:"  // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 35f\n"
+    "ld1 { v27.b }[4], [x23]\n"
+    "b 35f\n"
+    "33:"  // Oddments: Load (2, 3): Bit 2: Unset
+    "tbz x4, #1, 34f\n"
+    "ld1 { v27.h }[0], [x23], #0x2\n"
+    "tbz x4, #0, 35f\n"
+    "ld1 { v27.b }[2], [x23]\n"
+    "b 35f\n"
+    "34:"  // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 35f\n"
+    "ld1 { v27.b }[0], [x23]\n"
+    "35:"  // Oddments: Load (2, 3): Bit 2: End
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "ldr d3, [x3, #0x40]\n"
+    "smlal v8.4s, v27.4h, v2.4h\n"
+    "ldr x20, [x25, #0x80]\n"
+    "add x20, x20, x10\n"
+    "smlal2 v5.4s, v27.8h, v2.8h\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "smlal v15.4s, v31.4h, v3.4h\n"
+    "smlal2 v18.4s, v31.8h, v3.8h\n"
+    "smlal v16.4s, v30.4h, v3.4h\n"
+    "smlal2 v21.4s, v30.8h, v3.8h\n"
+    "smlal v7.4s, v27.4h, v3.4h\n"
+    "smlal2 v17.4s, v27.8h, v3.8h\n"
+    "tbz x4, #2, 37f\n"
+    "ld1 { v23.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 36f\n"
+    "ld1 { v23.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 39f\n"
+    "ld1 { v23.b }[6], [x20]\n"
+    "b 39f\n"
+    "36:"  // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 39f\n"
+    "ld1 { v23.b }[4], [x20]\n"
+    "b 39f\n"
+    "37:"  // Oddments: Load (2, 4): Bit 2: Unset
+    "tbz x4, #1, 38f\n"
+    "ld1 { v23.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 39f\n"
+    "ld1 { v23.b }[2], [x20]\n"
+    "b 39f\n"
+    "38:"  // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 39f\n"
+    "ld1 { v23.b }[0], [x20]\n"
+    "39:"  // Oddments: Load (2, 4): Bit 2: End
+    "usubl v23.8h, v23.8b, v9.8b\n"
+    "ldr d4, [x3, #0x48]\n"
+    "smlal v8.4s, v23.4h, v3.4h\n"
+    "ldr x22, [x25, #0x88]\n"
+    "add x22, x22, x10\n"
+    "smlal2 v5.4s, v23.8h, v3.8h\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "smlal v15.4s, v30.4h, v4.4h\n"
+    "smlal2 v18.4s, v30.8h, v4.8h\n"
+    "smlal v16.4s, v26.4h, v4.4h\n"
+    "smlal2 v21.4s, v26.8h, v4.8h\n"
+    "smlal v7.4s, v23.4h, v4.4h\n"
+    "smlal2 v17.4s, v23.8h, v4.8h\n"
+    "tbz x4, #2, 41f\n"
+    "ld1 { v28.s }[0], [x22], #0x4\n"
+    "tbz x4, #1, 40f\n"
+    "ld1 { v28.h }[2], [x22], #0x2\n"
+    "tbz x4, #0, 43f\n"
+    "ld1 { v28.b }[6], [x22]\n"
+    "b 43f\n"
+    "40:"  // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 43f\n"
+    "ld1 { v28.b }[4], [x22]\n"
+    "b 43f\n"
+    "41:"  // Oddments: Load (2, 5): Bit 2: Unset
+    "tbz x4, #1, 42f\n"
+    "ld1 { v28.h }[0], [x22], #0x2\n"
+    "tbz x4, #0, 43f\n"
+    "ld1 { v28.b }[2], [x22]\n"
+    "b 43f\n"
+    "42:"  // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 43f\n"
+    "ld1 { v28.b }[0], [x22]\n"
+    "43:"  // Oddments: Load (2, 5): Bit 2: End
+    "usubl v28.8h, v28.8b, v9.8b\n"
+    "ldr d0, [x3, #0x50]\n"
+    "smlal v8.4s, v28.4h, v4.4h\n"
+    "ldr x13, [x25, #0x90]\n"
+    "add x13, x13, x10\n"
+    "smlal2 v5.4s, v28.8h, v4.8h\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "smlal v15.4s, v22.4h, v0.4h\n"
+    "smlal2 v18.4s, v22.8h, v0.8h\n"
+    "smlal v16.4s, v25.4h, v0.4h\n"
+    "smlal2 v21.4s, v25.8h, v0.8h\n"
+    "tbz x4, #2, 45f\n"
+    "ld1 { v31.s }[0], [x13], #0x4\n"
+    "tbz x4, #1, 44f\n"
+    "ld1 { v31.h }[2], [x13], #0x2\n"
+    "tbz x4, #0, 47f\n"
+    "ld1 { v31.b }[6], [x13]\n"
+    "b 47f\n"
+    "44:"  // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 47f\n"
+    "ld1 { v31.b }[4], [x13]\n"
+    "b 47f\n"
+    "45:"  // Oddments: Load (3, 0): Bit 2: Unset
+    "tbz x4, #1, 46f\n"
+    "ld1 { v31.h }[0], [x13], #0x2\n"
+    "tbz x4, #0, 47f\n"
+    "ld1 { v31.b }[2], [x13]\n"
+    "b 47f\n"
+    "46:"  // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 47f\n"
+    "ld1 { v31.b }[0], [x13]\n"
+    "47:"  // Oddments: Load (3, 0): Bit 2: End
+    "usubl v31.8h, v31.8b, v9.8b\n"
+    "ldr x21, [x25, #0x98]\n"
+    "smlal v7.4s, v31.4h, v0.4h\n"
+    "add x21, x21, x10\n"
+    "smlal2 v17.4s, v31.8h, v0.8h\n"
+    "tbz x4, #2, 49f\n"
+    "ld1 { v30.s }[0], [x21], #0x4\n"
+    "tbz x4, #1, 48f\n"
+    "ld1 { v30.h }[2], [x21], #0x2\n"
+    "tbz x4, #0, 51f\n"
+    "ld1 { v30.b }[6], [x21]\n"
+    "b 51f\n"
+    "48:"  // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 51f\n"
+    "ld1 { v30.b }[4], [x21]\n"
+    "b 51f\n"
+    "49:"  // Oddments: Load (3, 1): Bit 2: Unset
+    "tbz x4, #1, 50f\n"
+    "ld1 { v30.h }[0], [x21], #0x2\n"
+    "tbz x4, #0, 51f\n"
+    "ld1 { v30.b }[2], [x21]\n"
+    "b 51f\n"
+    "50:"  // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 51f\n"
+    "ld1 { v30.b }[0], [x21]\n"
+    "51:"  // Oddments: Load (3, 1): Bit 2: End
+    "usubl v30.8h, v30.8b, v9.8b\n"
+    "ldr d1, [x3, #0x58]\n"
+    "smlal v8.4s, v30.4h, v0.4h\n"
+    "ldr x14, [x25, #0xa0]\n"
+    "add x14, x14, x10\n"
+    "smlal2 v5.4s, v30.8h, v0.8h\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "smlal v15.4s, v25.4h, v1.4h\n"
+    "smlal2 v18.4s, v25.8h, v1.8h\n"
+    "smlal v16.4s, v24.4h, v1.4h\n"
+    "smlal2 v21.4s, v24.8h, v1.8h\n"
+    "smlal v7.4s, v30.4h, v1.4h\n"
+    "smlal2 v17.4s, v30.8h, v1.8h\n"
+    "tbz x4, #2, 53f\n"
+    "ld1 { v26.s }[0], [x14], #0x4\n"
+    "tbz x4, #1, 52f\n"
+    "ld1 { v26.h }[2], [x14], #0x2\n"
+    "tbz x4, #0, 55f\n"
+    "ld1 { v26.b }[6], [x14]\n"
+    "b 55f\n"
+    "52:"  // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 55f\n"
+    "ld1 { v26.b }[4], [x14]\n"
+    "b 55f\n"
+    "53:"  // Oddments: Load (3, 2): Bit 2: Unset
+    "tbz x4, #1, 54f\n"
+    "ld1 { v26.h }[0], [x14], #0x2\n"
+    "tbz x4, #0, 55f\n"
+    "ld1 { v26.b }[2], [x14]\n"
+    "b 55f\n"
+    "54:"  // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 55f\n"
+    "ld1 { v26.b }[0], [x14]\n"
+    "55:"  // Oddments: Load (3, 2): Bit 2: End
+    "usubl v26.8h, v26.8b, v9.8b\n"
+    "ldr d2, [x3, #0x60]\n"
+    "smlal v8.4s, v26.4h, v1.4h\n"
+    "ldr x11, [x25, #0xa8]\n"
+    "add x11, x11, x10\n"
+    "smlal2 v5.4s, v26.8h, v1.8h\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "smlal v15.4s, v24.4h, v2.4h\n"
+    "smlal2 v18.4s, v24.8h, v2.8h\n"
+    "smlal v16.4s, v27.4h, v2.4h\n"
+    "smlal2 v21.4s, v27.8h, v2.8h\n"
+    "smlal v7.4s, v26.4h, v2.4h\n"
+    "smlal2 v17.4s, v26.8h, v2.8h\n"
+    "tbz x4, #2, 57f\n"
+    "ld1 { v25.s }[0], [x11], #0x4\n"
+    "tbz x4, #1, 56f\n"
+    "ld1 { v25.h }[2], [x11], #0x2\n"
+    "tbz x4, #0, 59f\n"
+    "ld1 { v25.b }[6], [x11]\n"
+    "b 59f\n"
+    "56:"  // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 59f\n"
+    "ld1 { v25.b }[4], [x11]\n"
+    "b 59f\n"
+    "57:"  // Oddments: Load (3, 3): Bit 2: Unset
+    "tbz x4, #1, 58f\n"
+    "ld1 { v25.h }[0], [x11], #0x2\n"
+    "tbz x4, #0, 59f\n"
+    "ld1 { v25.b }[2], [x11]\n"
+    "b 59f\n"
+    "58:"  // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 59f\n"
+    "ld1 { v25.b }[0], [x11]\n"
+    "59:"  // Oddments: Load (3, 3): Bit 2: End
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "ldr d3, [x3, #0x68]\n"
+    "smlal v8.4s, v25.4h, v2.4h\n"
+    "ldr x24, [x25, #0xb0]\n"
+    "add x24, x24, x10\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "smlal v15.4s, v27.4h, v3.4h\n"
+    "smlal2 v18.4s, v27.8h, v3.8h\n"
+    "smlal v16.4s, v23.4h, v3.4h\n"
+    "smlal2 v21.4s, v23.8h, v3.8h\n"
+    "smlal v7.4s, v25.4h, v3.4h\n"
+    "smlal2 v17.4s, v25.8h, v3.8h\n"
+    "tbz x4, #2, 61f\n"
+    "ld1 { v24.s }[0], [x24], #0x4\n"
+    "tbz x4, #1, 60f\n"
+    "ld1 { v24.h }[2], [x24], #0x2\n"
+    "tbz x4, #0, 63f\n"
+    "ld1 { v24.b }[6], [x24]\n"
+    "b 63f\n"
+    "60:"  // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 63f\n"
+    "ld1 { v24.b }[4], [x24]\n"
+    "b 63f\n"
+    "61:"  // Oddments: Load (3, 4): Bit 2: Unset
+    "tbz x4, #1, 62f\n"
+    "ld1 { v24.h }[0], [x24], #0x2\n"
+    "tbz x4, #0, 63f\n"
+    "ld1 { v24.b }[2], [x24]\n"
+    "b 63f\n"
+    "62:"  // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 63f\n"
+    "ld1 { v24.b }[0], [x24]\n"
+    "63:"  // Oddments: Load (3, 4): Bit 2: End
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "ldr d4, [x3, #0x70]\n"
+    "smlal v8.4s, v24.4h, v3.4h\n"
+    "ldr x0, [x25, #0xb8]\n"
+    "add x0, x0, x10\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "smlal v15.4s, v23.4h, v4.4h\n"
+    "smlal2 v18.4s, v23.8h, v4.8h\n"
+    "smlal v16.4s, v28.4h, v4.4h\n"
+    "smlal2 v21.4s, v28.8h, v4.8h\n"
+    "smlal v7.4s, v24.4h, v4.4h\n"
+    "smlal2 v17.4s, v24.8h, v4.8h\n"
+    "tbz x4, #2, 65f\n"
+    "ld1 { v22.s }[0], [x0], #0x4\n"
+    "tbz x4, #1, 64f\n"
+    "ld1 { v22.h }[2], [x0], #0x2\n"
+    "tbz x4, #0, 67f\n"
+    "ld1 { v22.b }[6], [x0]\n"
+    "b 67f\n"
+    "64:"  // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 67f\n"
+    "ld1 { v22.b }[4], [x0]\n"
+    "b 67f\n"
+    "65:"  // Oddments: Load (3, 5): Bit 2: Unset
+    "tbz x4, #1, 66f\n"
+    "ld1 { v22.h }[0], [x0], #0x2\n"
+    "tbz x4, #0, 67f\n"
+    "ld1 { v22.b }[2], [x0]\n"
+    "b 67f\n"
+    "66:"  // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 67f\n"
+    "ld1 { v22.b }[0], [x0]\n"
+    "67:"  // Oddments: Load (3, 5): Bit 2: End
+    "usubl v22.8h, v22.8b, v9.8b\n"
+    "ldr d0, [x3, #0x78]\n"
+    "smlal v8.4s, v22.4h, v4.4h\n"
+    "ldr x15, [x25, #0xc0]\n"
+    "add x15, x15, x10\n"
+    "smlal2 v5.4s, v22.8h, v4.8h\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "smlal v15.4s, v31.4h, v0.4h\n"
+    "smlal2 v18.4s, v31.8h, v0.8h\n"
+    "smlal v16.4s, v30.4h, v0.4h\n"
+    "smlal2 v21.4s, v30.8h, v0.8h\n"
+    "tbz x4, #2, 69f\n"
+    "ld1 { v27.s }[0], [x15], #0x4\n"
+    "tbz x4, #1, 68f\n"
+    "ld1 { v27.h }[2], [x15], #0x2\n"
+    "tbz x4, #0, 71f\n"
+    "ld1 { v27.b }[6], [x15]\n"
+    "b 71f\n"
+    "68:"  // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 71f\n"
+    "ld1 { v27.b }[4], [x15]\n"
+    "b 71f\n"
+    "69:"  // Oddments: Load (4, 0): Bit 2: Unset
+    "tbz x4, #1, 70f\n"
+    "ld1 { v27.h }[0], [x15], #0x2\n"
+    "tbz x4, #0, 71f\n"
+    "ld1 { v27.b }[2], [x15]\n"
+    "b 71f\n"
+    "70:"  // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 71f\n"
+    "ld1 { v27.b }[0], [x15]\n"
+    "71:"  // Oddments: Load (4, 0): Bit 2: End
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "ldr x9, [x25, #0xc8]\n"
+    "smlal v7.4s, v27.4h, v0.4h\n"
+    "add x9, x9, x10\n"
+    "smlal2 v17.4s, v27.8h, v0.8h\n"
+    "tbz x4, #2, 73f\n"
+    "ld1 { v23.s }[0], [x9], #0x4\n"
+    "tbz x4, #1, 72f\n"
+    "ld1 { v23.h }[2], [x9], #0x2\n"
+    "tbz x4, #0, 75f\n"
+    "ld1 { v23.b }[6], [x9]\n"
+    "b 75f\n"
+    "72:"  // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 75f\n"
+    "ld1 { v23.b }[4], [x9]\n"
+    "b 75f\n"
+    "73:"  // Oddments: Load (4, 1): Bit 2: Unset
+    "tbz x4, #1, 74f\n"
+    "ld1 { v23.h }[0], [x9], #0x2\n"
+    "tbz x4, #0, 75f\n"
+    "ld1 { v23.b }[2], [x9]\n"
+    "b 75f\n"
+    "74:"  // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 75f\n"
+    "ld1 { v23.b }[0], [x9]\n"
+    "75:"  // Oddments: Load (4, 1): Bit 2: End
+    "usubl v23.8h, v23.8b, v9.8b\n"
+    "ldr d1, [x3, #0x80]\n"
+    "smlal v8.4s, v23.4h, v0.4h\n"
+    "ldr x27, [x25, #0xd0]\n"
+    "add x27, x27, x10\n"
+    "smlal2 v5.4s, v23.8h, v0.8h\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "smlal v15.4s, v30.4h, v1.4h\n"
+    "smlal2 v18.4s, v30.8h, v1.8h\n"
+    "smlal v16.4s, v26.4h, v1.4h\n"
+    "smlal2 v21.4s, v26.8h, v1.8h\n"
+    "smlal v7.4s, v23.4h, v1.4h\n"
+    "smlal2 v17.4s, v23.8h, v1.8h\n"
+    "tbz x4, #2, 77f\n"
+    "ld1 { v31.s }[0], [x27], #0x4\n"
+    "tbz x4, #1, 76f\n"
+    "ld1 { v31.h }[2], [x27], #0x2\n"
+    "tbz x4, #0, 79f\n"
+    "ld1 { v31.b }[6], [x27]\n"
+    "b 79f\n"
+    "76:"  // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 79f\n"
+    "ld1 { v31.b }[4], [x27]\n"
+    "b 79f\n"
+    "77:"  // Oddments: Load (4, 2): Bit 2: Unset
+    "tbz x4, #1, 78f\n"
+    "ld1 { v31.h }[0], [x27], #0x2\n"
+    "tbz x4, #0, 79f\n"
+    "ld1 { v31.b }[2], [x27]\n"
+    "b 79f\n"
+    "78:"  // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 79f\n"
+    "ld1 { v31.b }[0], [x27]\n"
+    "79:"  // Oddments: Load (4, 2): Bit 2: End
+    "usubl v31.8h, v31.8b, v9.8b\n"
+    "ldr d2, [x3, #0x88]\n"
+    "smlal v8.4s, v31.4h, v1.4h\n"
+    "ldr x28, [x25, #0xd8]\n"
+    "add x28, x28, x10\n"
+    "smlal2 v5.4s, v31.8h, v1.8h\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "smlal v15.4s, v26.4h, v2.4h\n"
+    "smlal2 v18.4s, v26.8h, v2.8h\n"
+    "smlal v16.4s, v25.4h, v2.4h\n"
+    "smlal2 v21.4s, v25.8h, v2.8h\n"
+    "smlal v7.4s, v31.4h, v2.4h\n"
+    "smlal2 v17.4s, v31.8h, v2.8h\n"
+    "tbz x4, #2, 81f\n"
+    "ld1 { v30.s }[0], [x28], #0x4\n"
+    "tbz x4, #1, 80f\n"
+    "ld1 { v30.h }[2], [x28], #0x2\n"
+    "tbz x4, #0, 83f\n"
+    "ld1 { v30.b }[6], [x28]\n"
+    "b 83f\n"
+    "80:"  // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 83f\n"
+    "ld1 { v30.b }[4], [x28]\n"
+    "b 83f\n"
+    "81:"  // Oddments: Load (4, 3): Bit 2: Unset
+    "tbz x4, #1, 82f\n"
+    "ld1 { v30.h }[0], [x28], #0x2\n"
+    "tbz x4, #0, 83f\n"
+    "ld1 { v30.b }[2], [x28]\n"
+    "b 83f\n"
+    "82:"  // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 83f\n"
+    "ld1 { v30.b }[0], [x28]\n"
+    "83:"  // Oddments: Load (4, 3): Bit 2: End
+    "usubl v30.8h, v30.8b, v9.8b\n"
+    "ldr d3, [x3, #0x90]\n"
+    "smlal v8.4s, v30.4h, v2.4h\n"
+    "ldr x12, [x25, #0xe0]\n"
+    "add x12, x12, x10\n"
+    "smlal2 v5.4s, v30.8h, v2.8h\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "smlal v15.4s, v25.4h, v3.4h\n"
+    "smlal2 v18.4s, v25.8h, v3.8h\n"
+    "smlal v16.4s, v24.4h, v3.4h\n"
+    "smlal2 v21.4s, v24.8h, v3.8h\n"
+    "smlal v7.4s, v30.4h, v3.4h\n"
+    "smlal2 v17.4s, v30.8h, v3.8h\n"
+    "tbz x4, #2, 85f\n"
+    "ld1 { v28.s }[0], [x12], #0x4\n"
+    "tbz x4, #1, 84f\n"
+    "ld1 { v28.h }[2], [x12], #0x2\n"
+    "tbz x4, #0, 87f\n"
+    "ld1 { v28.b }[6], [x12]\n"
+    "b 87f\n"
+    "84:"  // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 87f\n"
+    "ld1 { v28.b }[4], [x12]\n"
+    "b 87f\n"
+    "85:"  // Oddments: Load (4, 4): Bit 2: Unset
+    "tbz x4, #1, 86f\n"
+    "ld1 { v28.h }[0], [x12], #0x2\n"
+    "tbz x4, #0, 87f\n"
+    "ld1 { v28.b }[2], [x12]\n"
+    "b 87f\n"
+    "86:"  // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 87f\n"
+    "ld1 { v28.b }[0], [x12]\n"
+    "87:"  // Oddments: Load (4, 4): Bit 2: End
+    "usubl v28.8h, v28.8b, v9.8b\n"
+    "ldr d4, [x3, #0x98]\n"
+    "smlal v8.4s, v28.4h, v3.4h\n"
+    "ldr x7, [x25, #0xe8]\n"
+    "add x7, x7, x10\n"
+    "smlal2 v5.4s, v28.8h, v3.8h\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "smlal v15.4s, v24.4h, v4.4h\n"
+    "smlal2 v18.4s, v24.8h, v4.8h\n"
+    "smlal v16.4s, v22.4h, v4.4h\n"
+    "smlal2 v21.4s, v22.8h, v4.8h\n"
+    "smlal v7.4s, v28.4h, v4.4h\n"
+    "smlal2 v17.4s, v28.8h, v4.8h\n"
+    "tbz x4, #2, 89f\n"
+    "ld1 { v26.s }[0], [x7], #0x4\n"
+    "tbz x4, #1, 88f\n"
+    "ld1 { v26.h }[2], [x7], #0x2\n"
+    "tbz x4, #0, 91f\n"
+    "ld1 { v26.b }[6], [x7]\n"
+    "b 91f\n"
+    "88:"  // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 91f\n"
+    "ld1 { v26.b }[4], [x7]\n"
+    "b 91f\n"
+    "89:"  // Oddments: Load (4, 5): Bit 2: Unset
+    "tbz x4, #1, 90f\n"
+    "ld1 { v26.h }[0], [x7], #0x2\n"
+    "tbz x4, #0, 91f\n"
+    "ld1 { v26.b }[2], [x7]\n"
+    "b 91f\n"
+    "90:"  // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 91f\n"
+    "ld1 { v26.b }[0], [x7]\n"
+    "91:"  // Oddments: Load (4, 5): Bit 2: End
+    "usubl v26.8h, v26.8b, v9.8b\n"
+    "ldr d0, [x3, #0xa0]\n"
+    "smlal v8.4s, v26.4h, v4.4h\n"
+    "ldr x26, [x25, #0xf0]\n"
+    "add x26, x26, x10\n"
+    "smlal2 v5.4s, v26.8h, v4.8h\n"
+    "ssubl v0.8h, v0.8b, v14.8b\n"
+    "smlal v15.4s, v27.4h, v0.4h\n"
+    "smlal2 v18.4s, v27.8h, v0.8h\n"
+    "smlal v16.4s, v23.4h, v0.4h\n"
+    "smlal2 v21.4s, v23.8h, v0.8h\n"
+    "tbz x4, #2, 93f\n"
+    "ld1 { v25.s }[0], [x26], #0x4\n"
+    "tbz x4, #1, 92f\n"
+    "ld1 { v25.h }[2], [x26], #0x2\n"
+    "tbz x4, #0, 95f\n"
+    "ld1 { v25.b }[6], [x26]\n"
+    "b 95f\n"
+    "92:"  // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 95f\n"
+    "ld1 { v25.b }[4], [x26]\n"
+    "b 95f\n"
+    "93:"  // Oddments: Load (5, 0): Bit 2: Unset
+    "tbz x4, #1, 94f\n"
+    "ld1 { v25.h }[0], [x26], #0x2\n"
+    "tbz x4, #0, 95f\n"
+    "ld1 { v25.b }[2], [x26]\n"
+    "b 95f\n"
+    "94:"  // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 95f\n"
+    "ld1 { v25.b }[0], [x26]\n"
+    "95:"  // Oddments: Load (5, 0): Bit 2: End
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "ldr x23, [x25, #0xf8]\n"
+    "smlal v7.4s, v25.4h, v0.4h\n"
+    "add x23, x23, x10\n"
+    "smlal2 v17.4s, v25.8h, v0.8h\n"
+    "tbz x4, #2, 97f\n"
+    "ld1 { v24.s }[0], [x23], #0x4\n"
+    "tbz x4, #1, 96f\n"
+    "ld1 { v24.h }[2], [x23], #0x2\n"
+    "tbz x4, #0, 99f\n"
+    "ld1 { v24.b }[6], [x23]\n"
+    "b 99f\n"
+    "96:"  // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 99f\n"
+    "ld1 { v24.b }[4], [x23]\n"
+    "b 99f\n"
+    "97:"  // Oddments: Load (5, 1): Bit 2: Unset
+    "tbz x4, #1, 98f\n"
+    "ld1 { v24.h }[0], [x23], #0x2\n"
+    "tbz x4, #0, 99f\n"
+    "ld1 { v24.b }[2], [x23]\n"
+    "b 99f\n"
+    "98:"  // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 99f\n"
+    "ld1 { v24.b }[0], [x23]\n"
+    "99:"  // Oddments: Load (5, 1): Bit 2: End
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "ldr d1, [x3, #0xa8]\n"
+    "smlal v8.4s, v24.4h, v0.4h\n"
+    "ldr x22, [x25, #0x100]\n"
+    "add x22, x22, x10\n"
+    "smlal2 v5.4s, v24.8h, v0.8h\n"
+    "ssubl v1.8h, v1.8b, v14.8b\n"
+    "smlal v15.4s, v23.4h, v1.4h\n"
+    "smlal2 v18.4s, v23.8h, v1.8h\n"
+    "smlal v16.4s, v31.4h, v1.4h\n"
+    "smlal2 v21.4s, v31.8h, v1.8h\n"
+    "smlal v7.4s, v24.4h, v1.4h\n"
+    "smlal2 v17.4s, v24.8h, v1.8h\n"
+    "tbz x4, #2, 101f\n"
+    "ld1 { v27.s }[0], [x22], #0x4\n"
+    "tbz x4, #1, 100f\n"
+    "ld1 { v27.h }[2], [x22], #0x2\n"
+    "tbz x4, #0, 103f\n"
+    "ld1 { v27.b }[6], [x22]\n"
+    "b 103f\n"
+    "100:"  // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 103f\n"
+    "ld1 { v27.b }[4], [x22]\n"
+    "b 103f\n"
+    "101:"  // Oddments: Load (5, 2): Bit 2: Unset
+    "tbz x4, #1, 102f\n"
+    "ld1 { v27.h }[0], [x22], #0x2\n"
+    "tbz x4, #0, 103f\n"
+    "ld1 { v27.b }[2], [x22]\n"
+    "b 103f\n"
+    "102:"  // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 103f\n"
+    "ld1 { v27.b }[0], [x22]\n"
+    "103:"  // Oddments: Load (5, 2): Bit 2: End
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "ldr d2, [x3, #0xb0]\n"
+    "smlal v8.4s, v27.4h, v1.4h\n"
+    "ldr x20, [x25, #0x108]\n"
+    "add x20, x20, x10\n"
+    "smlal2 v5.4s, v27.8h, v1.8h\n"
+    "ssubl v2.8h, v2.8b, v14.8b\n"
+    "smlal v15.4s, v31.4h, v2.4h\n"
+    "smlal2 v18.4s, v31.8h, v2.8h\n"
+    "smlal v16.4s, v30.4h, v2.4h\n"
+    "smlal2 v21.4s, v30.8h, v2.8h\n"
+    "smlal v7.4s, v27.4h, v2.4h\n"
+    "smlal2 v17.4s, v27.8h, v2.8h\n"
+    "tbz x4, #2, 105f\n"
+    "ld1 { v25.s }[0], [x20], #0x4\n"
+    "tbz x4, #1, 104f\n"
+    "ld1 { v25.h }[2], [x20], #0x2\n"
+    "tbz x4, #0, 107f\n"
+    "ld1 { v25.b }[6], [x20]\n"
+    "b 107f\n"
+    "104:"  // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 107f\n"
+    "ld1 { v25.b }[4], [x20]\n"
+    "b 107f\n"
+    "105:"  // Oddments: Load (5, 3): Bit 2: Unset
+    "tbz x4, #1, 106f\n"
+    "ld1 { v25.h }[0], [x20], #0x2\n"
+    "tbz x4, #0, 107f\n"
+    "ld1 { v25.b }[2], [x20]\n"
+    "b 107f\n"
+    "106:"  // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 107f\n"
+    "ld1 { v25.b }[0], [x20]\n"
+    "107:"  // Oddments: Load (5, 3): Bit 2: End
+    "usubl v25.8h, v25.8b, v9.8b\n"
+    "ldr d3, [x3, #0xb8]\n"
+    "smlal v8.4s, v25.4h, v2.4h\n"
+    "ldr x13, [x25, #0x110]\n"
+    "add x13, x13, x10\n"
+    "smlal2 v5.4s, v25.8h, v2.8h\n"
+    "ssubl v3.8h, v3.8b, v14.8b\n"
+    "smlal v15.4s, v30.4h, v3.4h\n"
+    "smlal2 v18.4s, v30.8h, v3.8h\n"
+    "smlal v16.4s, v28.4h, v3.4h\n"
+    "smlal2 v21.4s, v28.8h, v3.8h\n"
+    "smlal v7.4s, v25.4h, v3.4h\n"
+    "smlal2 v17.4s, v25.8h, v3.8h\n"
+    "tbz x4, #2, 109f\n"
+    "ld1 { v24.s }[0], [x13], #0x4\n"
+    "tbz x4, #1, 108f\n"
+    "ld1 { v24.h }[2], [x13], #0x2\n"
+    "tbz x4, #0, 111f\n"
+    "ld1 { v24.b }[6], [x13]\n"
+    "b 111f\n"
+    "108:"  // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 111f\n"
+    "ld1 { v24.b }[4], [x13]\n"
+    "b 111f\n"
+    "109:"  // Oddments: Load (5, 4): Bit 2: Unset
+    "tbz x4, #1, 110f\n"
+    "ld1 { v24.h }[0], [x13], #0x2\n"
+    "tbz x4, #0, 111f\n"
+    "ld1 { v24.b }[2], [x13]\n"
+    "b 111f\n"
+    "110:"  // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 111f\n"
+    "ld1 { v24.b }[0], [x13]\n"
+    "111:"  // Oddments: Load (5, 4): Bit 2: End
+    "usubl v24.8h, v24.8b, v9.8b\n"
+    "ldr d4, [x3, #0xc0]\n"
+    "smlal v8.4s, v24.4h, v3.4h\n"
+    "ldr x21, [x25, #0x118]\n"
+    "add x21, x21, x10\n"
+    "smlal2 v5.4s, v24.8h, v3.8h\n"
+    "ssubl v4.8h, v4.8b, v14.8b\n"
+    "smlal v15.4s, v28.4h, v4.4h\n"
+    "smlal2 v18.4s, v28.8h, v4.8h\n"
+    "smlal v16.4s, v26.4h, v4.4h\n"
+    "smlal2 v21.4s, v26.8h, v4.8h\n"
+    "smlal v7.4s, v24.4h, v4.4h\n"
+    "smlal2 v17.4s, v24.8h, v4.8h\n"
+    "tbz x4, #2, 113f\n"
+    "ld1 { v27.s }[0], [x21], #0x4\n"
+    "tbz x4, #1, 112f\n"
+    "ld1 { v27.h }[2], [x21], #0x2\n"
+    "tbz x4, #0, 115f\n"
+    "ld1 { v27.b }[6], [x21]\n"
+    "b 115f\n"
+    "112:"  // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+    "tbz x4, #0, 115f\n"
+    "ld1 { v27.b }[4], [x21]\n"
+    "b 115f\n"
+    "113:"  // Oddments: Load (5, 5): Bit 2: Unset
+    "tbz x4, #1, 114f\n"
+    "ld1 { v27.h }[0], [x21], #0x2\n"
+    "tbz x4, #0, 115f\n"
+    "ld1 { v27.b }[2], [x21]\n"
+    "b 115f\n"
+    "114:"  // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 115f\n"
+    "ld1 { v27.b }[0], [x21]\n"
+    "115:"  // Oddments: Load (5, 5): Bit 2: End
+    "usubl v27.8h, v27.8b, v9.8b\n"
+    "smlal v8.4s, v27.4h, v4.4h\n"
+    "smlal2 v5.4s, v27.8h, v4.8h\n"
+    "tbz x4, #2, 117f\n"
+    "ld1 { v6.4s }, [x2], #0x10\n"
+    "ld1 { v19.4s }, [x5], #0x10\n"
+    "tbz x4, #1, 116f\n"
+    "ld1 { v20.d }[0], [x2], #0x8\n"
+    "ld1 { v12.d }[0], [x5], #0x8\n"
+    "tbz x4, #0, 119f\n"
+    "ld1 { v20.s }[2], [x2]\n"
+    "ld1 { v12.s }[2], [x5]\n"
+    "b 119f\n"
+    "116:"  // Oddments: Load requant params: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 119f\n"
+    "ld1 { v20.s }[0], [x2]\n"
+    "ld1 { v12.s }[0], [x5]\n"
+    "b 119f\n"
+    "117:"  // Oddments: Load requant params: Bit 2: Unset
+    "tbz x4, #1, 118f\n"
+    "ld1 { v6.d }[0], [x2], #0x8\n"
+    "ld1 { v19.d }[0], [x5], #0x8\n"
+    "tbz x4, #0, 119f\n"
+    "ld1 { v6.s }[2], [x2]\n"
+    "ld1 { v19.s }[2], [x5]\n"
+    "b 119f\n"
+    "118:"  // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 119f\n"
+    "ld1 { v6.s }[0], [x2]\n"
+    "ld1 { v19.s }[0], [x5]\n"
+    "119:"  // Oddments: Load requant params: Bit 2: End
+    "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+    "add x17, x17, x1\n"
+    "sqrdmulh v18.4s, v18.4s, v20.4s\n"
+    "add x16, x16, x1\n"
+    "sqrdmulh v16.4s, v16.4s, v6.4s\n"
+    "add x6, x6, x1\n"
+    "sqrdmulh v21.4s, v21.4s, v20.4s\n"
+    "add x8, x8, x1\n"
+    "sqrdmulh v7.4s, v7.4s, v6.4s\n"
+    "and v28.16b, v15.16b, v19.16b\n"
+    "and v26.16b, v18.16b, v12.16b\n"
+    "and v29.16b, v16.16b, v19.16b\n"
+    "sshr v28.4s, v28.4s, #0x1f\n"
+    "sshr v26.4s, v26.4s, #0x1f\n"
+    "sshr v29.4s, v29.4s, #0x1f\n"
+    "sqadd v15.4s, v15.4s, v28.4s\n"
+    "sqadd v18.4s, v18.4s, v26.4s\n"
+    "sqadd v16.4s, v16.4s, v29.4s\n"
+    "and v4.16b, v21.16b, v12.16b\n"
+    "srshl v15.4s, v15.4s, v19.4s\n"
+    "srshl v18.4s, v18.4s, v12.4s\n"
+    "srshl v16.4s, v16.4s, v19.4s\n"
+    "sshr v4.4s, v4.4s, #0x1f\n"
+    "add v15.4s, v15.4s, v10.4s\n"
+    "add v18.4s, v18.4s, v10.4s\n"
+    "add v16.4s, v16.4s, v10.4s\n"
+    "smin v15.4s, v15.4s, v13.4s\n"
+    "smin v18.4s, v18.4s, v13.4s\n"
+    "smin v16.4s, v16.4s, v13.4s\n"
+    "smax v15.4s, v15.4s, v11.4s\n"
+    "smax v18.4s, v18.4s, v11.4s\n"
+    "smax v16.4s, v16.4s, v11.4s\n"
+    "sqadd v21.4s, v21.4s, v4.4s\n"
+    "uzp1 v15.16b, v15.16b, v18.16b\n"
+    "and v25.16b, v7.16b, v19.16b\n"
+    "uzp1 v15.16b, v15.16b, v15.16b\n"
+    "srshl v21.4s, v21.4s, v12.4s\n"
+    "sshr v25.4s, v25.4s, #0x1f\n"
+    "sqrdmulh v17.4s, v17.4s, v20.4s\n"
+    "sqrdmulh v8.4s, v8.4s, v6.4s\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "sqadd v7.4s, v7.4s, v25.4s\n"
+    "and v31.16b, v17.16b, v12.16b\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "and v24.16b, v8.16b, v19.16b\n"
+    "srshl v7.4s, v7.4s, v19.4s\n"
+    "smax v21.4s, v21.4s, v11.4s\n"
+    "sshr v31.4s, v31.4s, #0x1f\n"
+    "sshr v24.4s, v24.4s, #0x1f\n"
+    "uzp1 v16.16b, v16.16b, v21.16b\n"
+    "add v7.4s, v7.4s, v10.4s\n"
+    "uzp1 v16.16b, v16.16b, v16.16b\n"
+    "sqadd v17.4s, v17.4s, v31.4s\n"
+    "smin v7.4s, v7.4s, v13.4s\n"
+    "sqadd v8.4s, v8.4s, v24.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v20.4s\n"
+    "smax v7.4s, v7.4s, v11.4s\n"
+    "srshl v17.4s, v17.4s, v12.4s\n"
+    "srshl v8.4s, v8.4s, v19.4s\n"
+    "and v1.16b, v5.16b, v12.16b\n"
+    "add v17.4s, v17.4s, v10.4s\n"
+    "add v8.4s, v8.4s, v10.4s\n"
+    "sshr v1.4s, v1.4s, #0x1f\n"
+    "smin v17.4s, v17.4s, v13.4s\n"
+    "smin v8.4s, v8.4s, v13.4s\n"
+    "sqadd v5.4s, v5.4s, v1.4s\n"
+    "smax v17.4s, v17.4s, v11.4s\n"
+    "smax v8.4s, v8.4s, v11.4s\n"
+    "srshl v5.4s, v5.4s, v12.4s\n"
+    "uzp1 v7.16b, v7.16b, v17.16b\n"
+    "uzp1 v7.16b, v7.16b, v7.16b\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "smax v5.4s, v5.4s, v11.4s\n"
+    "uzp1 v8.16b, v8.16b, v5.16b\n"
+    "uzp1 v8.16b, v8.16b, v8.16b\n"
+    "tbz x4, #2, 121f\n"
+    "st1 { v15.s }[0], [x17], #0x4\n"
+    "st1 { v16.s }[0], [x16], #0x4\n"
+    "st1 { v7.s }[0], [x6], #0x4\n"
+    "st1 { v8.s }[0], [x8], #0x4\n"
+    "tbz x4, #1, 120f\n"
+    "st1 { v15.h }[2], [x17], #0x2\n"
+    "st1 { v16.h }[2], [x16], #0x2\n"
+    "st1 { v7.h }[2], [x6], #0x2\n"
+    "st1 { v8.h }[2], [x8], #0x2\n"
+    "tbz x4, #0, 123f\n"
+    "st1 { v15.b }[6], [x17], #0x1\n"
+    "st1 { v16.b }[6], [x16], #0x1\n"
+    "st1 { v7.b }[6], [x6], #0x1\n"
+    "st1 { v8.b }[6], [x8], #0x1\n"
+    "b 123f\n"
+    "120:"  // Oddments: Bit 2: Bit 1: Unset
+    "tbz x4, #0, 123f\n"
+    "st1 { v15.b }[4], [x17], #0x1\n"
+    "st1 { v16.b }[4], [x16], #0x1\n"
+    "st1 { v7.b }[4], [x6], #0x1\n"
+    "st1 { v8.b }[4], [x8], #0x1\n"
+    "b 123f\n"
+    "121:"  // Oddments: Bit 2: Unset
+    "tbz x4, #1, 122f\n"
+    "st1 { v15.h }[0], [x17], #0x2\n"
+    "st1 { v16.h }[0], [x16], #0x2\n"
+    "st1 { v7.h }[0], [x6], #0x2\n"
+    "st1 { v8.h }[0], [x8], #0x2\n"
+    "tbz x4, #0, 123f\n"
+    "st1 { v15.b }[2], [x17], #0x1\n"
+    "st1 { v16.b }[2], [x16], #0x1\n"
+    "st1 { v7.b }[2], [x6], #0x1\n"
+    "st1 { v8.b }[2], [x8], #0x1\n"
+    "b 123f\n"
+    "122:"  // Oddments: Bit 2: Unset: Bit 1: Unset
+    "tbz x4, #0, 123f\n"
+    "st1 { v15.b }[0], [x17], #0x1\n"
+    "st1 { v16.b }[0], [x16], #0x1\n"
+    "st1 { v7.b }[0], [x6], #0x1\n"
+    "st1 { v8.b }[0], [x8], #0x1\n"
+    "123:"  // Oddments: Bit 2: End
+
+    "124:"  // End
+
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000..2bfeac0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
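+// "u8s8u8q": uint8_t input, int8_t weights, uint8_t output, quantised via
+// arm_gemm::Requantize32 (see the typedefs below).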
+struct a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef int8_t weight_type;
+  typedef uint8_t return_type;
+
+  typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int n_output_points = 9;
+
+  kern_type kernel = a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+  a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..1633639
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
+  const uint8_t *const *const inptrs,
+  uint8_t *const *const outptrs,
+  const void *params,
+  const arm_gemm::Requantize32& qp,
+  const unsigned int n_points,
+  const unsigned int n_channels
+)
+{
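+  // Generic planar kernel: nine output points are accumulated in parallel,
+  // four channels per pass of the channel loop; the trailing 1-3 channels are
+  // handled by the "Oddments" path at the end of the assembly.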
+  __asm__ __volatile__(
+    "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ld1r { v12.4s }, [x19]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+    "ld1r { v11.4s }, [x20]\n"
+    "ld1r { v10.16b }, [x19]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v9.16b }, [x20]\n"
+    "ld1r { v8.4s }, [x19]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+    "ld1r { v7.4s }, [x20]\n"
+    "ld1r { v6.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+    "mov x11, #0x0\n"
+    "ld1r { v5.4s }, [x19]\n"
+    "lsr x10, %x[n_channels], #0x2\n"
+    "cbz x10, 6f\n"
+    "1:"  // Channel loop
+    "movi v27.4s, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "lsl x19, x11, #0x2\n"
+    "ldr q27, [%x[bias], x19]\n"
+    "2:"  // Channel loop: Load bias: Done
+    "mov v26.16b, v27.16b\n"
+    "ldr s16, [%x[params]], #0x4\n"
+    "mov x20, %x[inptrs]\n"
+    "mov v25.16b, v27.16b\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "subs x19, %x[n_points], #0x1\n"
+    "mov v24.16b, v27.16b\n"
+    "ldr s4, [x9, x11]\n"
+    "mov v23.16b, v27.16b\n"
+    "mov v22.16b, v27.16b\n"
+    "ldr s3, [x28, x11]\n"
+    "mov v21.16b, v27.16b\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "mov v20.16b, v27.16b\n"
+    "ldr s2, [x27, x11]\n"
+    "mov v19.16b, v27.16b\n"
+    "ssubl v16.8h, v16.8b, v9.8b\n"
+    "ldr s1, [x26, x11]\n"
+    "usubl v4.8h, v4.8b, v10.8b\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "usubl v3.8h, v3.8b, v10.8b\n"
+    "ldr s0, [x25, x11]\n"
+    "usubl v2.8h, v2.8b, v10.8b\n"
+    "usubl v1.8h, v1.8b, v10.8b\n"
+    "ldr s31, [x24, x11]\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "usubl v0.8h, v0.8b, v10.8b\n"
+    "ldr s30, [x23, x11]\n"
+    "ldr s29, [x22, x11]\n"
+    "usubl v31.8h, v31.8b, v10.8b\n"
+    "ldr x21, [x20], #0x8\n"
+    "usubl v30.8h, v30.8b, v10.8b\n"
+    "ldr s28, [x21, x11]\n"
+    "usubl v29.8h, v29.8b, v10.8b\n"
+    "usubl v28.8h, v28.8b, v10.8b\n"
+    "ble 4f\n"
+    "3:"  // Channel loop: Planar loop
+    "smlal v27.4s, v4.4h, v16.4h\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "subs x19, x19, #0x1\n"
+    "smlal v26.4s, v3.4h, v16.4h\n"
+    "ldr s4, [x9, x11]\n"
+    "smlal v25.4s, v2.4h, v16.4h\n"
+    "smlal v24.4s, v1.4h, v16.4h\n"
+    "ldr s3, [x28, x11]\n"
+    "smlal v23.4s, v0.4h, v16.4h\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "smlal v22.4s, v31.4h, v16.4h\n"
+    "smlal v21.4s, v30.4h, v16.4h\n"
+    "ldr s2, [x27, x11]\n"
+    "smlal v20.4s, v29.4h, v16.4h\n"
+    "smlal v19.4s, v28.4h, v16.4h\n"
+    "ldr s16, [%x[params]], #0x4\n"
+    "usubl v4.8h, v4.8b, v10.8b\n"
+    "ldr s1, [x26, x11]\n"
+    "usubl v3.8h, v3.8b, v10.8b\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "usubl v2.8h, v2.8b, v10.8b\n"
+    "ldr s0, [x25, x11]\n"
+    "ssubl v16.8h, v16.8b, v9.8b\n"
+    "usubl v1.8h, v1.8b, v10.8b\n"
+    "ldr s31, [x24, x11]\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "usubl v0.8h, v0.8b, v10.8b\n"
+    "ldr s30, [x23, x11]\n"
+    "ldr s29, [x22, x11]\n"
+    "usubl v31.8h, v31.8b, v10.8b\n"
+    "ldr x21, [x20], #0x8\n"
+    "usubl v30.8h, v30.8b, v10.8b\n"
+    "ldr s28, [x21, x11]\n"
+    "usubl v29.8h, v29.8b, v10.8b\n"
+    "usubl v28.8h, v28.8b, v10.8b\n"
+    "bgt 3b\n"
+    "4:"  // Channel loop: Planar tail
+    "smlal v27.4s, v4.4h, v16.4h\n"
+    "smlal v26.4s, v3.4h, v16.4h\n"
+    "smlal v25.4s, v2.4h, v16.4h\n"
+    "smlal v24.4s, v1.4h, v16.4h\n"
+    "smlal v23.4s, v0.4h, v16.4h\n"
+    "smlal v22.4s, v31.4h, v16.4h\n"
+    "smlal v21.4s, v30.4h, v16.4h\n"
+    "smlal v20.4s, v29.4h, v16.4h\n"
+    "smlal v19.4s, v28.4h, v16.4h\n"
+    "cbz %x[rq_mul_ptr], 5f\n"
+    "lsl x19, x11, #0x2\n"
+    "ldr q6, [%x[rq_mul_ptr], x19]\n"
+    "ldr q5, [%x[rq_right_shift_ptr], x19]\n"
+    "cbz %x[rq_left_shift_ptr], 5f\n"
+    "ldr q7, [%x[rq_left_shift_ptr], x19]\n"
+    "5:"  // Channel loop: Load quantisation parameters: Done
+    "sshl v27.4s, v27.4s, v7.4s\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    "sshl v26.4s, v26.4s, v7.4s\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "sshl v25.4s, v25.4s, v7.4s\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "sqrdmulh v25.4s, v25.4s, v6.4s\n"
+    "sshl v24.4s, v24.4s, v7.4s\n"
+    "and v16.16b, v27.16b, v5.16b\n"
+    "and v18.16b, v26.16b, v5.16b\n"
+    "and v17.16b, v25.16b, v5.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "sqadd v26.4s, v26.4s, v18.4s\n"
+    "sqadd v25.4s, v25.4s, v17.4s\n"
+    "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+    "srshl v27.4s, v27.4s, v5.4s\n"
+    "srshl v26.4s, v26.4s, v5.4s\n"
+    "srshl v25.4s, v25.4s, v5.4s\n"
+    "and v16.16b, v24.16b, v5.16b\n"
+    "add v27.4s, v27.4s, v8.4s\n"
+    "add v26.4s, v26.4s, v8.4s\n"
+    "add v25.4s, v25.4s, v8.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v27.4s, v27.4s, v12.4s\n"
+    "smax v26.4s, v26.4s, v12.4s\n"
+    "sqadd v24.4s, v24.4s, v16.4s\n"
+    "smin v27.4s, v27.4s, v11.4s\n"
+    "smin v26.4s, v26.4s, v11.4s\n"
+    "smax v25.4s, v25.4s, v12.4s\n"
+    "srshl v24.4s, v24.4s, v5.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "smin v25.4s, v25.4s, v11.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s27, [x27, x11]\n"
+    "add v24.4s, v24.4s, v8.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x26, x11]\n"
+    "smax v24.4s, v24.4s, v12.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x25, x11]\n"
+    "sshl v23.4s, v23.4s, v7.4s\n"
+    "sshl v22.4s, v22.4s, v7.4s\n"
+    "smin v24.4s, v24.4s, v11.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v6.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sshl v21.4s, v21.4s, v7.4s\n"
+    "and v17.16b, v23.16b, v5.16b\n"
+    "and v16.16b, v22.16b, v5.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x24, x11]\n"
+    "sqadd v23.4s, v23.4s, v17.4s\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "and v16.16b, v21.16b, v5.16b\n"
+    "sshl v20.4s, v20.4s, v7.4s\n"
+    "sshl v19.4s, v19.4s, v7.4s\n"
+    "srshl v23.4s, v23.4s, v5.4s\n"
+    "srshl v22.4s, v22.4s, v5.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v20.4s, v20.4s, v6.4s\n"
+    "add v23.4s, v23.4s, v8.4s\n"
+    "add v22.4s, v22.4s, v8.4s\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "and v17.16b, v20.16b, v5.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v6.4s\n"
+    "smax v23.4s, v23.4s, v12.4s\n"
+    "srshl v21.4s, v21.4s, v5.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v19.16b, v5.16b\n"
+    "smin v23.4s, v23.4s, v11.4s\n"
+    "add v21.4s, v21.4s, v8.4s\n"
+    "sqadd v20.4s, v20.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "smax v21.4s, v21.4s, v12.4s\n"
+    "srshl v20.4s, v20.4s, v5.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "smin v21.4s, v21.4s, v11.4s\n"
+    "add v20.4s, v20.4s, v8.4s\n"
+    "srshl v19.4s, v19.4s, v5.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x23, x11]\n"
+    "add v19.4s, v19.4s, v8.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x22, x11]\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x21, x11]\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x20, x11]\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x19, x11]\n"
+    "add x11, x11, #0x4\n"
+    "cmp x11, x10, LSL #2\n"
+    "blt 1b\n"
+    "6:"  // Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 24f\n"
+    "movi v27.4s, #0x0\n"
+    "cbz %x[bias], 9f\n"
+    "add x19, %x[bias], x11, LSL #2\n"
+    "tbz %x[n_channels], #1, 7f\n"
+    "ld1 { v27.d }[0], [x19], #0x8\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v27.s }[2], [x19], #0x4\n"
+    "b 8f\n"
+    "7:"  // Oddments: Load bias: Bit 1: Unset
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v27.s }[0], [x19], #0x4\n"
+    "8:"  // Oddments: Load bias: Bit 1: End
+
+    "9:"  // Oddments: Load bias: Done
+    "mov v26.16b, v27.16b\n"
+    "ldr s16, [%x[params]], #0x4\n"
+    "mov x20, %x[inptrs]\n"
+    "mov v25.16b, v27.16b\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "add x9, x9, x11\n"
+    "mov v24.16b, v27.16b\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "mov v23.16b, v27.16b\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "mov v22.16b, v27.16b\n"
+    "add x28, x28, x11\n"
+    "mov v21.16b, v27.16b\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "mov v20.16b, v27.16b\n"
+    "add x27, x27, x11\n"
+    "mov v19.16b, v27.16b\n"
+    "ldr x21, [x20], #0x8\n"
+    "ssubl v16.8h, v16.8b, v9.8b\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ldr h4, [x9], #0x2\n"
+    "ldr h3, [x28], #0x2\n"
+    "ldr h2, [x27], #0x2\n"
+    "ldr h1, [x26], #0x2\n"
+    "ldr h0, [x25], #0x2\n"
+    "ldr h31, [x24], #0x2\n"
+    "ldr h30, [x23], #0x2\n"
+    "ldr h29, [x22], #0x2\n"
+    "ldr h28, [x21], #0x2\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v4.b }[2], [x9], #0x1\n"
+    "ld1 { v3.b }[2], [x28], #0x1\n"
+    "ld1 { v2.b }[2], [x27], #0x1\n"
+    "ld1 { v1.b }[2], [x26], #0x1\n"
+    "ld1 { v0.b }[2], [x25], #0x1\n"
+    "ld1 { v31.b }[2], [x24], #0x1\n"
+    "ld1 { v30.b }[2], [x23], #0x1\n"
+    "ld1 { v29.b }[2], [x22], #0x1\n"
+    "ld1 { v28.b }[2], [x21], #0x1\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load: Bit 1: Unset
+    "tbz %x[n_channels], #0, 11f\n"
+    "ldr b4, [x9], #0x1\n"
+    "ldr b3, [x28], #0x1\n"
+    "ldr b2, [x27], #0x1\n"
+    "ldr b1, [x26], #0x1\n"
+    "ldr b0, [x25], #0x1\n"
+    "ldr b31, [x24], #0x1\n"
+    "ldr b30, [x23], #0x1\n"
+    "ldr b29, [x22], #0x1\n"
+    "ldr b28, [x21], #0x1\n"
+    "11:"  // Oddments: Load: Bit 1: End
+    "usubl v4.8h, v4.8b, v10.8b\n"
+    "subs x19, %x[n_points], #0x1\n"
+    "usubl v3.8h, v3.8b, v10.8b\n"
+    "usubl v2.8h, v2.8b, v10.8b\n"
+    "usubl v1.8h, v1.8b, v10.8b\n"
+    "usubl v0.8h, v0.8b, v10.8b\n"
+    "usubl v31.8h, v31.8b, v10.8b\n"
+    "usubl v30.8h, v30.8b, v10.8b\n"
+    "usubl v29.8h, v29.8b, v10.8b\n"
+    "usubl v28.8h, v28.8b, v10.8b\n"
+    "ble 15f\n"
+    "12:"  // Oddments: Planar loop
+    "smlal v27.4s, v4.4h, v16.4h\n"
+    "ldp x9, x28, [x20], #0x10\n"
+    "add x9, x9, x11\n"
+    "smlal v26.4s, v3.4h, v16.4h\n"
+    "ldp x27, x26, [x20], #0x10\n"
+    "smlal v25.4s, v2.4h, v16.4h\n"
+    "ldp x25, x24, [x20], #0x10\n"
+    "smlal v24.4s, v1.4h, v16.4h\n"
+    "add x28, x28, x11\n"
+    "smlal v23.4s, v0.4h, v16.4h\n"
+    "ldp x23, x22, [x20], #0x10\n"
+    "smlal v22.4s, v31.4h, v16.4h\n"
+    "add x27, x27, x11\n"
+    "smlal v21.4s, v30.4h, v16.4h\n"
+    "ldr x21, [x20], #0x8\n"
+    "smlal v20.4s, v29.4h, v16.4h\n"
+    "add x26, x26, x11\n"
+    "smlal v19.4s, v28.4h, v16.4h\n"
+    "ldr s16, [%x[params]], #0x4\n"
+    "add x25, x25, x11\n"
+    "ssubl v16.8h, v16.8b, v9.8b\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr h4, [x9], #0x2\n"
+    "ldr h3, [x28], #0x2\n"
+    "ldr h2, [x27], #0x2\n"
+    "ldr h1, [x26], #0x2\n"
+    "ldr h0, [x25], #0x2\n"
+    "ldr h31, [x24], #0x2\n"
+    "ldr h30, [x23], #0x2\n"
+    "ldr h29, [x22], #0x2\n"
+    "ldr h28, [x21], #0x2\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v4.b }[2], [x9], #0x1\n"
+    "ld1 { v3.b }[2], [x28], #0x1\n"
+    "ld1 { v2.b }[2], [x27], #0x1\n"
+    "ld1 { v1.b }[2], [x26], #0x1\n"
+    "ld1 { v0.b }[2], [x25], #0x1\n"
+    "ld1 { v31.b }[2], [x24], #0x1\n"
+    "ld1 { v30.b }[2], [x23], #0x1\n"
+    "ld1 { v29.b }[2], [x22], #0x1\n"
+    "ld1 { v28.b }[2], [x21], #0x1\n"
+    "b 14f\n"
+    "13:"  // Oddments: Planar loop: Load: Bit 1: Unset
+    "tbz %x[n_channels], #0, 14f\n"
+    "ldr b4, [x9], #0x1\n"
+    "ldr b3, [x28], #0x1\n"
+    "ldr b2, [x27], #0x1\n"
+    "ldr b1, [x26], #0x1\n"
+    "ldr b0, [x25], #0x1\n"
+    "ldr b31, [x24], #0x1\n"
+    "ldr b30, [x23], #0x1\n"
+    "ldr b29, [x22], #0x1\n"
+    "ldr b28, [x21], #0x1\n"
+    "14:"  // Oddments: Planar loop: Load: Bit 1: End
+    "usubl v4.8h, v4.8b, v10.8b\n"
+    "subs x19, x19, #0x1\n"
+    "usubl v3.8h, v3.8b, v10.8b\n"
+    "usubl v2.8h, v2.8b, v10.8b\n"
+    "usubl v1.8h, v1.8b, v10.8b\n"
+    "usubl v0.8h, v0.8b, v10.8b\n"
+    "usubl v31.8h, v31.8b, v10.8b\n"
+    "usubl v30.8h, v30.8b, v10.8b\n"
+    "usubl v29.8h, v29.8b, v10.8b\n"
+    "usubl v28.8h, v28.8b, v10.8b\n"
+    "bgt 12b\n"
+    "15:"  // Oddments: Planar tail
+    "smlal v27.4s, v4.4h, v16.4h\n"
+    "smlal v26.4s, v3.4h, v16.4h\n"
+    "smlal v25.4s, v2.4h, v16.4h\n"
+    "smlal v24.4s, v1.4h, v16.4h\n"
+    "smlal v23.4s, v0.4h, v16.4h\n"
+    "smlal v22.4s, v31.4h, v16.4h\n"
+    "smlal v21.4s, v30.4h, v16.4h\n"
+    "smlal v20.4s, v29.4h, v16.4h\n"
+    "smlal v19.4s, v28.4h, v16.4h\n"
+    "cbz %x[rq_mul_ptr], 21f\n"
+    "add x21, %x[rq_mul_ptr], x11, LSL #2\n"
+    "add x20, %x[rq_right_shift_ptr], x11, LSL #2\n"
+    "add x19, %x[rq_left_shift_ptr], x11, LSL #2\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v6.d }[0], [x21], #0x8\n"
+    "ld1 { v5.d }[0], [x20], #0x8\n"
+    "cbz %x[rq_left_shift_ptr], 16f\n"
+    "ld1 { v7.d }[0], [x19], #0x8\n"
+    "16:"  // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v6.s }[2], [x21], #0x4\n"
+    "ld1 { v5.s }[2], [x20], #0x4\n"
+    "cbz %x[rq_left_shift_ptr], 17f\n"
+    "ld1 { v7.s }[2], [x19], #0x4\n"
+    "17:"  // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+    "b 20f\n"
+    "18:"  // Oddments: Load quantisation parameters: Bit 1: Unset
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v6.s }[0], [x21], #0x4\n"
+    "ld1 { v5.s }[0], [x20], #0x4\n"
+    "cbz %x[rq_left_shift_ptr], 19f\n"
+    "ld1 { v7.s }[0], [x19], #0x4\n"
+    "19:"  // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+
+    "20:"  // Oddments: Load quantisation parameters: Bit 1: End
+
+    "21:"  // Oddments: Load quantisation parameters: Done
+    "sshl v27.4s, v27.4s, v7.4s\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    "add x27, x27, x11\n"
+    "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "sshl v26.4s, v26.4s, v7.4s\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    "add x26, x26, x11\n"
+    "sshl v25.4s, v25.4s, v7.4s\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    "sshl v24.4s, v24.4s, v7.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x25, x25, x11\n"
+    "and v16.16b, v27.16b, v5.16b\n"
+    "add x24, x24, x11\n"
+    "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+    "add x23, x23, x11\n"
+    "sqrdmulh v25.4s, v25.4s, v6.4s\n"
+    "add x22, x22, x11\n"
+    "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+    "add x21, x21, x11\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add x20, x20, x11\n"
+    "and v18.16b, v26.16b, v5.16b\n"
+    "add x19, x19, x11\n"
+    "and v17.16b, v25.16b, v5.16b\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v24.16b, v5.16b\n"
+    "srshl v27.4s, v27.4s, v5.4s\n"
+    "sqadd v26.4s, v26.4s, v18.4s\n"
+    "sqadd v25.4s, v25.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v27.4s, v27.4s, v8.4s\n"
+    "srshl v26.4s, v26.4s, v5.4s\n"
+    "srshl v25.4s, v25.4s, v5.4s\n"
+    "sqadd v24.4s, v24.4s, v16.4s\n"
+    "smax v27.4s, v27.4s, v12.4s\n"
+    "add v26.4s, v26.4s, v8.4s\n"
+    "add v25.4s, v25.4s, v8.4s\n"
+    "srshl v24.4s, v24.4s, v5.4s\n"
+    "smin v27.4s, v27.4s, v11.4s\n"
+    "smax v26.4s, v26.4s, v12.4s\n"
+    "smax v25.4s, v25.4s, v12.4s\n"
+    "add v24.4s, v24.4s, v8.4s\n"
+    "smin v26.4s, v26.4s, v11.4s\n"
+    "smin v25.4s, v25.4s, v11.4s\n"
+    "smax v24.4s, v24.4s, v12.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "smin v24.4s, v24.4s, v11.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sshl v23.4s, v23.4s, v7.4s\n"
+    "sshl v22.4s, v22.4s, v7.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v6.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+    "sshl v21.4s, v21.4s, v7.4s\n"
+    "sshl v20.4s, v20.4s, v7.4s\n"
+    "and v17.16b, v23.16b, v5.16b\n"
+    "and v16.16b, v22.16b, v5.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v20.4s, v20.4s, v6.4s\n"
+    "sqadd v23.4s, v23.4s, v17.4s\n"
+    "sqadd v22.4s, v22.4s, v16.4s\n"
+    "and v16.16b, v21.16b, v5.16b\n"
+    "and v17.16b, v20.16b, v5.16b\n"
+    "srshl v23.4s, v23.4s, v5.4s\n"
+    "srshl v22.4s, v22.4s, v5.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "add v23.4s, v23.4s, v8.4s\n"
+    "add v22.4s, v22.4s, v8.4s\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "sqadd v20.4s, v20.4s, v17.4s\n"
+    "smax v23.4s, v23.4s, v12.4s\n"
+    "smax v22.4s, v22.4s, v12.4s\n"
+    "srshl v21.4s, v21.4s, v5.4s\n"
+    "srshl v20.4s, v20.4s, v5.4s\n"
+    "smin v23.4s, v23.4s, v11.4s\n"
+    "smin v22.4s, v22.4s, v11.4s\n"
+    "add v21.4s, v21.4s, v8.4s\n"
+    "add v20.4s, v20.4s, v8.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smax v21.4s, v21.4s, v12.4s\n"
+    "smax v20.4s, v20.4s, v12.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smin v21.4s, v21.4s, v11.4s\n"
+    "smin v20.4s, v20.4s, v11.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "sshl v19.4s, v19.4s, v7.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v6.4s\n"
+    "and v16.16b, v19.16b, v5.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "srshl v19.4s, v19.4s, v5.4s\n"
+    "add v19.4s, v19.4s, v8.4s\n"
+    "smax v19.4s, v19.4s, v12.4s\n"
+    "smin v19.4s, v19.4s, v11.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "st1 { v27.h }[0], [x27], #0x2\n"
+    "st1 { v26.h }[0], [x26], #0x2\n"
+    "st1 { v25.h }[0], [x25], #0x2\n"
+    "st1 { v24.h }[0], [x24], #0x2\n"
+    "st1 { v23.h }[0], [x23], #0x2\n"
+    "st1 { v22.h }[0], [x22], #0x2\n"
+    "st1 { v21.h }[0], [x21], #0x2\n"
+    "st1 { v20.h }[0], [x20], #0x2\n"
+    "st1 { v19.h }[0], [x19], #0x2\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "st1 { v27.b }[2], [x27], #0x1\n"
+    "st1 { v26.b }[2], [x26], #0x1\n"
+    "st1 { v25.b }[2], [x25], #0x1\n"
+    "st1 { v24.b }[2], [x24], #0x1\n"
+    "st1 { v23.b }[2], [x23], #0x1\n"
+    "st1 { v22.b }[2], [x22], #0x1\n"
+    "st1 { v21.b }[2], [x21], #0x1\n"
+    "st1 { v20.b }[2], [x20], #0x1\n"
+    "st1 { v19.b }[2], [x19], #0x1\n"
+    "b 23f\n"
+    "22:"  // Oddments: Store: Bit 1: Unset
+    "tbz %x[n_channels], #0, 23f\n"
+    "st1 { v27.b }[0], [x27], #0x1\n"
+    "st1 { v26.b }[0], [x26], #0x1\n"
+    "st1 { v25.b }[0], [x25], #0x1\n"
+    "st1 { v24.b }[0], [x24], #0x1\n"
+    "st1 { v23.b }[0], [x23], #0x1\n"
+    "st1 { v22.b }[0], [x22], #0x1\n"
+    "st1 { v21.b }[0], [x21], #0x1\n"
+    "st1 { v20.b }[0], [x20], #0x1\n"
+    "st1 { v19.b }[0], [x19], #0x1\n"
+    "23:"  // Oddments: Store: Bit 1: End
+
+    "24:"  // End
+
+    : [params] "+&r" (params)
+    : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000..8020305
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
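+// Packed-weight variant producing a 2x8 block of output points per pass; the
+// per-channel requantisation parameters are passed as separate arrays (see
+// the implementation's per_channel_* arguments).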
+struct a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef int8_t weight_type;
+  typedef uint8_t return_type;
+
+  typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int output_rows(void) { return 2; }
+  constexpr static unsigned int output_cols(void) { return 8; }
+
+  constexpr static unsigned int output_col_regs(void) { return 2; }
+
+  kern_type kernel = a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+  a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..152999d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1484 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+  const uint8_t *const *const inptrs,
+  uint8_t *const *const outptrs,
+  const int8_t *weights,
+  const int32_t *bias,
+  const unsigned int kernel_points,
+  const unsigned int n_output_channels,
+  const int32_t *per_channel_left_shifts,
+  const int32_t *per_channel_muls,
+  const int32_t *per_channel_right_shifts,
+  const arm_gemm::Requantize32& qp
+)
+{
+  __asm__ __volatile__(
+    "mov x9, #0x0\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ld1r { v14.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v13.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+    "ld1r { v12.16b }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v11.16b }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v10.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+    "ld1r { v9.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+    "ld1r { v8.4s }, [x19]\n"
+    "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+    "ld1r { v7.4s }, [x19]\n"
+    "lsr x28, %x[n_output_channels], #0x2\n"
+    "cbz x28, 9f\n"
+    "1:"  // Output channel loop
+    "movi v16.4s, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "lsl x19, x9, #0x2\n"
+    "ldr q16, [%x[bias], x19]\n"
+    "2:"  // Output channel loop: Load bias: Done
+    "mov v6.16b, v16.16b\n"
+    "mov v5.16b, v16.16b\n"
+    "mov v4.16b, v16.16b\n"
+    "mov v31.16b, v16.16b\n"
+    "mov v30.16b, v16.16b\n"
+    "mov v29.16b, v16.16b\n"
+    "mov v28.16b, v16.16b\n"
+    "mov v27.16b, v16.16b\n"
+    "mov v26.16b, v16.16b\n"
+    "mov v25.16b, v16.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "mov v23.16b, v16.16b\n"
+    "mov v22.16b, v16.16b\n"
+    "mov v21.16b, v16.16b\n"
+    "mov v20.16b, v16.16b\n"
+    "mov v19.16b, v16.16b\n"
+    "cbz %x[rq_mul_ptr], 3f\n"
+    "lsl x19, x9, #0x2\n"
+    "ldr q8, [%x[rq_mul_ptr], x19]\n"
+    "ldr q7, [%x[rq_right_shift_ptr], x19]\n"
+    "cbz %x[rq_left_shift_ptr], 3f\n"
+    "ldr q9, [%x[rq_left_shift_ptr], x19]\n"
+    "3:"  // Output channel loop: Load quantization parameters: Done
+    "ldr s17, [%x[weights]], #0x4\n"
+    "ssubl v17.8h, v17.8b, v11.8b\n"
+    "mov x19, %x[inptrs]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "lsr x20, %x[kernel_points], #0x1\n"
+    "ldr d3, [x25, #0x0]\n"
+    "usubl v3.8h, v3.8b, v12.8b\n"
+    "ldr d2, [x27, #0x0]\n"
+    "usubl v2.8h, v2.8b, v12.8b\n"
+    "cbz x20, 7f\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "ldr s16, [%x[weights]], #0x4\n"
+    "ssubl v16.8h, v16.8b, v11.8b\n"
+    "ldr d1, [x25, #0x0]\n"
+    "subs x20, x20, #0x1\n"
+    "usubl v1.8h, v1.8b, v12.8b\n"
+    "ldr d0, [x27, #0x0]\n"
+    "usubl v0.8h, v0.8b, v12.8b\n"
+    "beq 5f\n"
+    "4:"  // Output channel loop: Kernel loop
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "subs x20, x20, #0x1\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr d3, [x25, #0x0]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "ldr d2, [x27, #0x0]\n"
+    "usubl v3.8h, v3.8b, v12.8b\n"
+    "ldr s17, [%x[weights]], #0x4\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "usubl v2.8h, v2.8b, v12.8b\n"
+    "ssubl v17.8h, v17.8b, v11.8b\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "ldr d1, [x25, #0x0]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "ldr d0, [x27, #0x0]\n"
+    "usubl v1.8h, v1.8b, v12.8b\n"
+    "ldr s16, [%x[weights]], #0x4\n"
+    "usubl v0.8h, v0.8b, v12.8b\n"
+    "ssubl v16.8h, v16.8b, v11.8b\n"
+    "bgt 4b\n"
+    "5:"  // Output channel loop: Kernel loop tail
+    "tbnz %x[kernel_points], #0, 6f\n"
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "sshl v6.4s, v6.4s, v9.4s\n"
+    "sshl v5.4s, v5.4s, v9.4s\n"
+    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+    "sshl v4.4s, v4.4s, v9.4s\n"
+    "sshl v31.4s, v31.4s, v9.4s\n"
+    "and v18.16b, v6.16b, v7.16b\n"
+    "and v16.16b, v5.16b, v7.16b\n"
+    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+    "sqadd v6.4s, v6.4s, v18.4s\n"
+    "sqadd v5.4s, v5.4s, v16.4s\n"
+    "and v17.16b, v4.16b, v7.16b\n"
+    "and v16.16b, v31.16b, v7.16b\n"
+    "srshl v6.4s, v6.4s, v7.4s\n"
+    "srshl v5.4s, v5.4s, v7.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v6.4s, v6.4s, v10.4s\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "sqadd v4.4s, v4.4s, v17.4s\n"
+    "smin v6.4s, v6.4s, v13.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "smax v6.4s, v6.4s, v14.4s\n"
+    "smax v5.4s, v5.4s, v14.4s\n"
+    "srshl v4.4s, v4.4s, v7.4s\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "str s6, [x19, x9]\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "add v4.4s, v4.4s, v10.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "srshl v31.4s, v31.4s, v7.4s\n"
+    "str s5, [x20, x9]\n"
+    "sshl v30.4s, v30.4s, v9.4s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "smin v4.4s, v4.4s, v13.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+    "add v31.4s, v31.4s, v10.4s\n"
+    "smax v4.4s, v4.4s, v14.4s\n"
+    "sshl v29.4s, v29.4s, v9.4s\n"
+    "smin v31.4s, v31.4s, v13.4s\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "and v16.16b, v30.16b, v7.16b\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "str s4, [x21, x9]\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+    "sshl v28.4s, v28.4s, v9.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s31, [x22, x9]\n"
+    "and v17.16b, v29.16b, v7.16b\n"
+    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "srshl v30.4s, v30.4s, v7.4s\n"
+    "sshl v27.4s, v27.4s, v9.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v28.16b, v7.16b\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sqadd v29.4s, v29.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v13.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+    "srshl v29.4s, v29.4s, v7.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "sqadd v28.4s, v28.4s, v16.4s\n"
+    "and v16.16b, v27.16b, v7.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v29.4s, v29.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x23, x9]\n"
+    "smin v29.4s, v29.4s, v13.4s\n"
+    "srshl v28.4s, v28.4s, v7.4s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v26.4s, v26.4s, v9.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "add v28.4s, v28.4s, v10.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "smin v28.4s, v28.4s, v13.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "str s29, [x24, x9]\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "srshl v27.4s, v27.4s, v7.4s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+    "sshl v25.4s, v25.4s, v9.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "add v27.4s, v27.4s, v10.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s28, [x25, x9]\n"
+    "smin v27.4s, v27.4s, v13.4s\n"
+    "and v17.16b, v26.16b, v7.16b\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+    "sshl v24.4s, v24.4s, v9.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v25.16b, v7.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s27, [x26, x9]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "srshl v26.4s, v26.4s, v7.4s\n"
+    "sshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v17.16b, v24.16b, v7.16b\n"
+    "add v26.4s, v26.4s, v10.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+    "srshl v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v13.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v23.16b, v7.16b\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "add v25.4s, v25.4s, v10.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "smin v25.4s, v25.4s, v13.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x19, x9]\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "srshl v24.4s, v24.4s, v7.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v22.4s, v22.4s, v9.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v10.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x20, x9]\n"
+    "smin v24.4s, v24.4s, v13.4s\n"
+    "sqadd v23.4s, v23.4s, v16.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+    "sshl v21.4s, v21.4s, v9.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "srshl v23.4s, v23.4s, v7.4s\n"
+    "and v17.16b, v22.16b, v7.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x21, x9]\n"
+    "add v23.4s, v23.4s, v10.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v21.16b, v7.16b\n"
+    "sshl v20.4s, v20.4s, v9.4s\n"
+    "smin v23.4s, v23.4s, v13.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v23.4s, v23.4s, v14.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+    "srshl v22.4s, v22.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x22, x9]\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "and v16.16b, v20.16b, v7.16b\n"
+    "srshl v21.4s, v21.4s, v7.4s\n"
+    "sshl v19.4s, v19.4s, v9.4s\n"
+    "smin v22.4s, v22.4s, v13.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "smax v22.4s, v22.4s, v14.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x9]\n"
+    "smax v21.4s, v21.4s, v14.4s\n"
+    "srshl v20.4s, v20.4s, v7.4s\n"
+    "and v16.16b, v19.16b, v7.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x24, x9]\n"
+    "smin v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v14.4s\n"
+    "srshl v19.4s, v19.4s, v7.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x25, x9]\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "smin v19.4s, v19.4s, v13.4s\n"
+    "smax v19.4s, v19.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x26, x9]\n"
+    "b 8f\n"
+    "6:"  // Output channel loop: Odd tail
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr d3, [x25, #0x0]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "ldr d2, [x27, #0x0]\n"
+    "usubl v3.8h, v3.8b, v12.8b\n"
+    "ldr s17, [%x[weights]], #0x4\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "usubl v2.8h, v2.8b, v12.8b\n"
+    "ssubl v17.8h, v17.8b, v11.8b\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "sshl v6.4s, v6.4s, v9.4s\n"
+    "sshl v5.4s, v5.4s, v9.4s\n"
+    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+    "sshl v4.4s, v4.4s, v9.4s\n"
+    "sshl v31.4s, v31.4s, v9.4s\n"
+    "and v18.16b, v6.16b, v7.16b\n"
+    "and v16.16b, v5.16b, v7.16b\n"
+    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+    "sqadd v6.4s, v6.4s, v18.4s\n"
+    "sqadd v5.4s, v5.4s, v16.4s\n"
+    "and v17.16b, v4.16b, v7.16b\n"
+    "and v16.16b, v31.16b, v7.16b\n"
+    "srshl v6.4s, v6.4s, v7.4s\n"
+    "srshl v5.4s, v5.4s, v7.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v6.4s, v6.4s, v10.4s\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "sqadd v4.4s, v4.4s, v17.4s\n"
+    "smin v6.4s, v6.4s, v13.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "smax v6.4s, v6.4s, v14.4s\n"
+    "smax v5.4s, v5.4s, v14.4s\n"
+    "srshl v4.4s, v4.4s, v7.4s\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "str s6, [x19, x9]\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "add v4.4s, v4.4s, v10.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "srshl v31.4s, v31.4s, v7.4s\n"
+    "str s5, [x20, x9]\n"
+    "sshl v30.4s, v30.4s, v9.4s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "smin v4.4s, v4.4s, v13.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+    "add v31.4s, v31.4s, v10.4s\n"
+    "smax v4.4s, v4.4s, v14.4s\n"
+    "sshl v29.4s, v29.4s, v9.4s\n"
+    "smin v31.4s, v31.4s, v13.4s\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "and v16.16b, v30.16b, v7.16b\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "str s4, [x21, x9]\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+    "sshl v28.4s, v28.4s, v9.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s31, [x22, x9]\n"
+    "and v17.16b, v29.16b, v7.16b\n"
+    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "srshl v30.4s, v30.4s, v7.4s\n"
+    "sshl v27.4s, v27.4s, v9.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v28.16b, v7.16b\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sqadd v29.4s, v29.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v13.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+    "srshl v29.4s, v29.4s, v7.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "sqadd v28.4s, v28.4s, v16.4s\n"
+    "and v16.16b, v27.16b, v7.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v29.4s, v29.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x23, x9]\n"
+    "smin v29.4s, v29.4s, v13.4s\n"
+    "srshl v28.4s, v28.4s, v7.4s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v26.4s, v26.4s, v9.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "add v28.4s, v28.4s, v10.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "smin v28.4s, v28.4s, v13.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "str s29, [x24, x9]\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "srshl v27.4s, v27.4s, v7.4s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+    "sshl v25.4s, v25.4s, v9.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "add v27.4s, v27.4s, v10.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s28, [x25, x9]\n"
+    "smin v27.4s, v27.4s, v13.4s\n"
+    "and v17.16b, v26.16b, v7.16b\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+    "sshl v24.4s, v24.4s, v9.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v25.16b, v7.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s27, [x26, x9]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "srshl v26.4s, v26.4s, v7.4s\n"
+    "sshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v17.16b, v24.16b, v7.16b\n"
+    "add v26.4s, v26.4s, v10.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+    "srshl v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v13.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v23.16b, v7.16b\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "add v25.4s, v25.4s, v10.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "smin v25.4s, v25.4s, v13.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x19, x9]\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "srshl v24.4s, v24.4s, v7.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v22.4s, v22.4s, v9.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v10.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x20, x9]\n"
+    "smin v24.4s, v24.4s, v13.4s\n"
+    "sqadd v23.4s, v23.4s, v16.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+    "sshl v21.4s, v21.4s, v9.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "srshl v23.4s, v23.4s, v7.4s\n"
+    "and v17.16b, v22.16b, v7.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x21, x9]\n"
+    "add v23.4s, v23.4s, v10.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v21.16b, v7.16b\n"
+    "sshl v20.4s, v20.4s, v9.4s\n"
+    "smin v23.4s, v23.4s, v13.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v23.4s, v23.4s, v14.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+    "srshl v22.4s, v22.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x22, x9]\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "and v16.16b, v20.16b, v7.16b\n"
+    "srshl v21.4s, v21.4s, v7.4s\n"
+    "sshl v19.4s, v19.4s, v9.4s\n"
+    "smin v22.4s, v22.4s, v13.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "smax v22.4s, v22.4s, v14.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x9]\n"
+    "smax v21.4s, v21.4s, v14.4s\n"
+    "srshl v20.4s, v20.4s, v7.4s\n"
+    "and v16.16b, v19.16b, v7.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x24, x9]\n"
+    "smin v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v14.4s\n"
+    "srshl v19.4s, v19.4s, v7.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x25, x9]\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "smin v19.4s, v19.4s, v13.4s\n"
+    "smax v19.4s, v19.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x26, x9]\n"
+    "b 8f\n"
+    "7:"  // Output channel loop: Single kernel point
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "sshl v6.4s, v6.4s, v9.4s\n"
+    "sshl v5.4s, v5.4s, v9.4s\n"
+    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+    "sshl v4.4s, v4.4s, v9.4s\n"
+    "sshl v31.4s, v31.4s, v9.4s\n"
+    "and v18.16b, v6.16b, v7.16b\n"
+    "and v16.16b, v5.16b, v7.16b\n"
+    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+    "sqadd v6.4s, v6.4s, v18.4s\n"
+    "sqadd v5.4s, v5.4s, v16.4s\n"
+    "and v17.16b, v4.16b, v7.16b\n"
+    "and v16.16b, v31.16b, v7.16b\n"
+    "srshl v6.4s, v6.4s, v7.4s\n"
+    "srshl v5.4s, v5.4s, v7.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v6.4s, v6.4s, v10.4s\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "sqadd v4.4s, v4.4s, v17.4s\n"
+    "smin v6.4s, v6.4s, v13.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "smax v6.4s, v6.4s, v14.4s\n"
+    "smax v5.4s, v5.4s, v14.4s\n"
+    "srshl v4.4s, v4.4s, v7.4s\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "str s6, [x19, x9]\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "add v4.4s, v4.4s, v10.4s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "srshl v31.4s, v31.4s, v7.4s\n"
+    "str s5, [x20, x9]\n"
+    "sshl v30.4s, v30.4s, v9.4s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "smin v4.4s, v4.4s, v13.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+    "add v31.4s, v31.4s, v10.4s\n"
+    "smax v4.4s, v4.4s, v14.4s\n"
+    "sshl v29.4s, v29.4s, v9.4s\n"
+    "smin v31.4s, v31.4s, v13.4s\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "and v16.16b, v30.16b, v7.16b\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "str s4, [x21, x9]\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+    "sshl v28.4s, v28.4s, v9.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s31, [x22, x9]\n"
+    "and v17.16b, v29.16b, v7.16b\n"
+    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "srshl v30.4s, v30.4s, v7.4s\n"
+    "sshl v27.4s, v27.4s, v9.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v28.16b, v7.16b\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sqadd v29.4s, v29.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v30.4s, v30.4s, v13.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+    "srshl v29.4s, v29.4s, v7.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "sqadd v28.4s, v28.4s, v16.4s\n"
+    "and v16.16b, v27.16b, v7.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v29.4s, v29.4s, v10.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s30, [x23, x9]\n"
+    "smin v29.4s, v29.4s, v13.4s\n"
+    "srshl v28.4s, v28.4s, v7.4s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v26.4s, v26.4s, v9.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "add v28.4s, v28.4s, v10.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "smin v28.4s, v28.4s, v13.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "str s29, [x24, x9]\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "srshl v27.4s, v27.4s, v7.4s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+    "sshl v25.4s, v25.4s, v9.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "add v27.4s, v27.4s, v10.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s28, [x25, x9]\n"
+    "smin v27.4s, v27.4s, v13.4s\n"
+    "and v17.16b, v26.16b, v7.16b\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+    "sshl v24.4s, v24.4s, v9.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v25.16b, v7.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s27, [x26, x9]\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "srshl v26.4s, v26.4s, v7.4s\n"
+    "sshl v23.4s, v23.4s, v9.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v17.16b, v24.16b, v7.16b\n"
+    "add v26.4s, v26.4s, v10.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+    "srshl v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v13.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v23.16b, v7.16b\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "add v25.4s, v25.4s, v10.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "smin v25.4s, v25.4s, v13.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s26, [x19, x9]\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "srshl v24.4s, v24.4s, v7.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshl v22.4s, v22.4s, v9.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v10.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s25, [x20, x9]\n"
+    "smin v24.4s, v24.4s, v13.4s\n"
+    "sqadd v23.4s, v23.4s, v16.4s\n"
+    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+    "sshl v21.4s, v21.4s, v9.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "srshl v23.4s, v23.4s, v7.4s\n"
+    "and v17.16b, v22.16b, v7.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s24, [x21, x9]\n"
+    "add v23.4s, v23.4s, v10.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "and v16.16b, v21.16b, v7.16b\n"
+    "sshl v20.4s, v20.4s, v9.4s\n"
+    "smin v23.4s, v23.4s, v13.4s\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smax v23.4s, v23.4s, v14.4s\n"
+    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+    "srshl v22.4s, v22.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "str s23, [x22, x9]\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "and v16.16b, v20.16b, v7.16b\n"
+    "srshl v21.4s, v21.4s, v7.4s\n"
+    "sshl v19.4s, v19.4s, v9.4s\n"
+    "smin v22.4s, v22.4s, v13.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "smax v22.4s, v22.4s, v14.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "str s22, [x23, x9]\n"
+    "smax v21.4s, v21.4s, v14.4s\n"
+    "srshl v20.4s, v20.4s, v7.4s\n"
+    "and v16.16b, v19.16b, v7.16b\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "str s21, [x24, x9]\n"
+    "smin v20.4s, v20.4s, v13.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "smax v20.4s, v20.4s, v14.4s\n"
+    "srshl v19.4s, v19.4s, v7.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "str s20, [x25, x9]\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "smin v19.4s, v19.4s, v13.4s\n"
+    "smax v19.4s, v19.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "str s19, [x26, x9]\n"
+    "8:"  // Output channel loop: Done
+    "add x9, x9, #0x4\n"
+    "cmp x9, x28, LSL #2\n"
+    "blt 1b\n"
+    "tst %x[n_output_channels], #0x3\n"
+    "beq 26f\n"
+    "9:"  // Output channel oddments
+    "movi v16.4s, #0x0\n"
+    "cbz %x[bias], 12f\n"
+    "add x19, %x[bias], x9, LSL #2\n"
+    "tbz %x[n_output_channels], #1, 10f\n"
+    "ld1 { v16.d }[0], [x19], #0x8\n"
+    "tbz %x[n_output_channels], #0, 11f\n"
+    "ld1 { v16.s }[2], [x19]\n"
+    "b 11f\n"
+    "10:"  // Output channel oddments: Load bias: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 11f\n"
+    "ld1 { v16.s }[0], [x19]\n"
+    "11:"  // Output channel oddments: Load bias: Bit 1: End
+
+    "12:"  // Output channel oddments: Load bias: Done
+    "mov v6.16b, v16.16b\n"
+    "mov v5.16b, v16.16b\n"
+    "mov v4.16b, v16.16b\n"
+    "mov v31.16b, v16.16b\n"
+    "mov v30.16b, v16.16b\n"
+    "mov v29.16b, v16.16b\n"
+    "mov v28.16b, v16.16b\n"
+    "mov v27.16b, v16.16b\n"
+    "mov v26.16b, v16.16b\n"
+    "mov v25.16b, v16.16b\n"
+    "mov v24.16b, v16.16b\n"
+    "mov v23.16b, v16.16b\n"
+    "mov v22.16b, v16.16b\n"
+    "mov v21.16b, v16.16b\n"
+    "mov v20.16b, v16.16b\n"
+    "mov v19.16b, v16.16b\n"
+    "cbz %x[rq_mul_ptr], 18f\n"
+    "add x21, %x[rq_mul_ptr], x9, LSL #2\n"
+    "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n"
+    "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n"
+    "cbz %x[rq_left_shift_ptr], 15f\n"
+    "tbz %x[n_output_channels], #1, 13f\n"
+    "ld1 { v8.d }[0], [x21], #0x8\n"
+    "ld1 { v7.d }[0], [x20], #0x8\n"
+    "ld1 { v9.d }[0], [x19], #0x8\n"
+    "tbz %x[n_output_channels], #0, 14f\n"
+    "ld1 { v8.s }[2], [x21], #0x4\n"
+    "ld1 { v7.s }[2], [x20], #0x4\n"
+    "ld1 { v9.s }[2], [x19], #0x4\n"
+    "b 14f\n"
+    "13:"  // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 14f\n"
+    "ld1 { v8.s }[0], [x21], #0x4\n"
+    "ld1 { v7.s }[0], [x20], #0x4\n"
+    "ld1 { v9.s }[0], [x19], #0x4\n"
+    "14:"  // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+    "b 18f\n"
+    "15:"  // Output channel oddments: Load quantization parameters: No left shift
+    "tbz %x[n_output_channels], #1, 16f\n"
+    "ld1 { v8.d }[0], [x21], #0x8\n"
+    "ld1 { v7.d }[0], [x20], #0x8\n"
+    "tbz %x[n_output_channels], #0, 17f\n"
+    "ld1 { v8.s }[2], [x21], #0x4\n"
+    "ld1 { v7.s }[2], [x20], #0x4\n"
+    "b 17f\n"
+    "16:"  // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 17f\n"
+    "ld1 { v8.s }[0], [x21], #0x4\n"
+    "ld1 { v7.s }[0], [x20], #0x4\n"
+    "17:"  // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+
+    "18:"  // Output channel oddments: Load quantization parameters: Done
+    "ldr s17, [%x[weights]], #0x4\n"
+    "ssubl v17.8h, v17.8b, v11.8b\n"
+    "mov x19, %x[inptrs]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "lsr x20, %x[kernel_points], #0x1\n"
+    "ldr d3, [x25, #0x0]\n"
+    "usubl v3.8h, v3.8b, v12.8b\n"
+    "ldr d2, [x27, #0x0]\n"
+    "usubl v2.8h, v2.8b, v12.8b\n"
+    "cbz x20, 22f\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "ldr s16, [%x[weights]], #0x4\n"
+    "ssubl v16.8h, v16.8b, v11.8b\n"
+    "ldr d1, [x25, #0x0]\n"
+    "subs x20, x20, #0x1\n"
+    "usubl v1.8h, v1.8b, v12.8b\n"
+    "ldr d0, [x27, #0x0]\n"
+    "usubl v0.8h, v0.8b, v12.8b\n"
+    "beq 20f\n"
+    "19:"  // Output channel oddments: Kernel loop
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "subs x20, x20, #0x1\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr d3, [x25, #0x0]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "ldr d2, [x27, #0x0]\n"
+    "usubl v3.8h, v3.8b, v12.8b\n"
+    "ldr s17, [%x[weights]], #0x4\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "usubl v2.8h, v2.8b, v12.8b\n"
+    "ssubl v17.8h, v17.8b, v11.8b\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "ldr d1, [x25, #0x0]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "ldr d0, [x27, #0x0]\n"
+    "usubl v1.8h, v1.8b, v12.8b\n"
+    "ldr s16, [%x[weights]], #0x4\n"
+    "usubl v0.8h, v0.8b, v12.8b\n"
+    "ssubl v16.8h, v16.8b, v11.8b\n"
+    "bgt 19b\n"
+    "20:"  // Output channel oddments: Kernel loop tail
+    "tbnz %x[kernel_points], #0, 21f\n"
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "b 23f\n"
+    "21:"  // Output channel oddments: Odd tail
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "ldp x25, x27, [x19], #0x10\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "ldr d3, [x25, #0x0]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "ldr d2, [x27, #0x0]\n"
+    "usubl v3.8h, v3.8b, v12.8b\n"
+    "ldr s17, [%x[weights]], #0x4\n"
+    "smlal v6.4s, v16.4h, v1.h[0]\n"
+    "smlal v5.4s, v16.4h, v1.h[1]\n"
+    "smlal v4.4s, v16.4h, v1.h[2]\n"
+    "usubl v2.8h, v2.8b, v12.8b\n"
+    "ssubl v17.8h, v17.8b, v11.8b\n"
+    "smlal v31.4s, v16.4h, v1.h[3]\n"
+    "smlal v30.4s, v16.4h, v1.h[4]\n"
+    "smlal v29.4s, v16.4h, v1.h[5]\n"
+    "smlal v28.4s, v16.4h, v1.h[6]\n"
+    "smlal v27.4s, v16.4h, v1.h[7]\n"
+    "smlal v26.4s, v16.4h, v0.h[0]\n"
+    "smlal v25.4s, v16.4h, v0.h[1]\n"
+    "smlal v24.4s, v16.4h, v0.h[2]\n"
+    "smlal v23.4s, v16.4h, v0.h[3]\n"
+    "smlal v22.4s, v16.4h, v0.h[4]\n"
+    "smlal v21.4s, v16.4h, v0.h[5]\n"
+    "smlal v20.4s, v16.4h, v0.h[6]\n"
+    "smlal v19.4s, v16.4h, v0.h[7]\n"
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "b 23f\n"
+    "22:"  // Output channel oddments: Single kernel point
+    "smlal v6.4s, v17.4h, v3.h[0]\n"
+    "smlal v5.4s, v17.4h, v3.h[1]\n"
+    "smlal v4.4s, v17.4h, v3.h[2]\n"
+    "smlal v31.4s, v17.4h, v3.h[3]\n"
+    "smlal v30.4s, v17.4h, v3.h[4]\n"
+    "smlal v29.4s, v17.4h, v3.h[5]\n"
+    "smlal v28.4s, v17.4h, v3.h[6]\n"
+    "smlal v27.4s, v17.4h, v3.h[7]\n"
+    "smlal v26.4s, v17.4h, v2.h[0]\n"
+    "smlal v25.4s, v17.4h, v2.h[1]\n"
+    "smlal v24.4s, v17.4h, v2.h[2]\n"
+    "smlal v23.4s, v17.4h, v2.h[3]\n"
+    "smlal v22.4s, v17.4h, v2.h[4]\n"
+    "smlal v21.4s, v17.4h, v2.h[5]\n"
+    "smlal v20.4s, v17.4h, v2.h[6]\n"
+    "smlal v19.4s, v17.4h, v2.h[7]\n"
+    "23:"  // Output channel oddments: Done
+    "sshl v6.4s, v6.4s, v9.4s\n"
+    "sshl v5.4s, v5.4s, v9.4s\n"
+    "sshl v4.4s, v4.4s, v9.4s\n"
+    "sqrdmulh v6.4s, v6.4s, v8.4s\n"
+    "sqrdmulh v5.4s, v5.4s, v8.4s\n"
+    "sqrdmulh v4.4s, v4.4s, v8.4s\n"
+    "sshl v31.4s, v31.4s, v9.4s\n"
+    "and v18.16b, v6.16b, v7.16b\n"
+    "and v16.16b, v5.16b, v7.16b\n"
+    "and v17.16b, v4.16b, v7.16b\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqadd v6.4s, v6.4s, v18.4s\n"
+    "sqadd v5.4s, v5.4s, v16.4s\n"
+    "sqadd v4.4s, v4.4s, v17.4s\n"
+    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+    "srshl v6.4s, v6.4s, v7.4s\n"
+    "srshl v5.4s, v5.4s, v7.4s\n"
+    "srshl v4.4s, v4.4s, v7.4s\n"
+    "and v16.16b, v31.16b, v7.16b\n"
+    "add v6.4s, v6.4s, v10.4s\n"
+    "add v5.4s, v5.4s, v10.4s\n"
+    "add v4.4s, v4.4s, v10.4s\n"
+    "smin v6.4s, v6.4s, v13.4s\n"
+    "smin v5.4s, v5.4s, v13.4s\n"
+    "smin v4.4s, v4.4s, v13.4s\n"
+    "smax v6.4s, v6.4s, v14.4s\n"
+    "smax v5.4s, v5.4s, v14.4s\n"
+    "smax v4.4s, v4.4s, v14.4s\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v6.16b, v6.16b, v6.16b\n"
+    "uzp1 v5.16b, v5.16b, v5.16b\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v4.16b, v4.16b, v4.16b\n"
+    "sshl v30.4s, v30.4s, v9.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+    "sshl v29.4s, v29.4s, v9.4s\n"
+    "sshl v28.4s, v28.4s, v9.4s\n"
+    "srshl v31.4s, v31.4s, v7.4s\n"
+    "and v16.16b, v30.16b, v7.16b\n"
+    "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+    "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+    "add v31.4s, v31.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "and v17.16b, v29.16b, v7.16b\n"
+    "smin v31.4s, v31.4s, v13.4s\n"
+    "sqadd v30.4s, v30.4s, v16.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "and v16.16b, v28.16b, v7.16b\n"
+    "srshl v30.4s, v30.4s, v7.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "sqadd v29.4s, v29.4s, v17.4s\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "add v30.4s, v30.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "srshl v29.4s, v29.4s, v7.4s\n"
+    "smin v30.4s, v30.4s, v13.4s\n"
+    "sqadd v28.4s, v28.4s, v16.4s\n"
+    "sshl v27.4s, v27.4s, v9.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "add v29.4s, v29.4s, v10.4s\n"
+    "srshl v28.4s, v28.4s, v7.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "smin v29.4s, v29.4s, v13.4s\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "add v28.4s, v28.4s, v10.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+    "smin v28.4s, v28.4s, v13.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "sshl v26.4s, v26.4s, v9.4s\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "and v16.16b, v27.16b, v7.16b\n"
+    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "and v17.16b, v26.16b, v7.16b\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "sshl v25.4s, v25.4s, v9.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+    "srshl v27.4s, v27.4s, v7.4s\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "sshl v24.4s, v24.4s, v9.4s\n"
+    "and v16.16b, v25.16b, v7.16b\n"
+    "add v27.4s, v27.4s, v10.4s\n"
+    "srshl v26.4s, v26.4s, v7.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v27.4s, v27.4s, v13.4s\n"
+    "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+    "add v26.4s, v26.4s, v10.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "smin v26.4s, v26.4s, v13.4s\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "and v17.16b, v24.16b, v7.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "srshl v25.4s, v25.4s, v7.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "sshl v23.4s, v23.4s, v9.4s\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "add v25.4s, v25.4s, v10.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+    "smin v25.4s, v25.4s, v13.4s\n"
+    "sshl v22.4s, v22.4s, v9.4s\n"
+    "srshl v24.4s, v24.4s, v7.4s\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "and v16.16b, v23.16b, v7.16b\n"
+    "sqrdmulh v22.4s, v22.4s, v8.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "add v24.4s, v24.4s, v10.4s\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "smin v24.4s, v24.4s, v13.4s\n"
+    "and v17.16b, v22.16b, v7.16b\n"
+    "sqadd v23.4s, v23.4s, v16.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshl v21.4s, v21.4s, v9.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "srshl v23.4s, v23.4s, v7.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "sqadd v22.4s, v22.4s, v17.4s\n"
+    "sqrdmulh v21.4s, v21.4s, v8.4s\n"
+    "add v23.4s, v23.4s, v10.4s\n"
+    "sshl v20.4s, v20.4s, v9.4s\n"
+    "srshl v22.4s, v22.4s, v7.4s\n"
+    "smin v23.4s, v23.4s, v13.4s\n"
+    "and v16.16b, v21.16b, v7.16b\n"
+    "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+    "smax v23.4s, v23.4s, v14.4s\n"
+    "add v22.4s, v22.4s, v10.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "smin v22.4s, v22.4s, v13.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "sqadd v21.4s, v21.4s, v16.4s\n"
+    "smax v22.4s, v22.4s, v14.4s\n"
+    "and v16.16b, v20.16b, v7.16b\n"
+    "sshl v19.4s, v19.4s, v9.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "srshl v21.4s, v21.4s, v7.4s\n"
+    "uzp1 v22.16b, v22.16b, v22.16b\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v19.4s, v19.4s, v8.4s\n"
+    "add v21.4s, v21.4s, v10.4s\n"
+    "sqadd v20.4s, v20.4s, v16.4s\n"
+    "smin v21.4s, v21.4s, v13.4s\n"
+    "and v16.16b, v19.16b, v7.16b\n"
+    "srshl v20.4s, v20.4s, v7.4s\n"
+    "smax v21.4s, v21.4s, v14.4s\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "add v20.4s, v20.4s, v10.4s\n"
+    "sqadd v19.4s, v19.4s, v16.4s\n"
+    "uzp1 v21.16b, v21.16b, v21.16b\n"
+    "smin v20.4s, v20.4s, v13.4s\n"
+    "srshl v19.4s, v19.4s, v7.4s\n"
+    "smax v20.4s, v20.4s, v14.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "add v19.4s, v19.4s, v10.4s\n"
+    "uzp1 v20.16b, v20.16b, v20.16b\n"
+    "smin v19.4s, v19.4s, v13.4s\n"
+    "smax v19.4s, v19.4s, v14.4s\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "uzp1 v19.16b, v19.16b, v19.16b\n"
+    "tbz %x[n_output_channels], #1, 24f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x9\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x9\n"
+    "st1 { v6.h }[0], [x19]\n"
+    "add x21, x21, x9\n"
+    "st1 { v5.h }[0], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x9\n"
+    "st1 { v4.h }[0], [x21]\n"
+    "add x23, x23, x9\n"
+    "st1 { v31.h }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x9\n"
+    "st1 { v30.h }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x9\n"
+    "st1 { v29.h }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x9\n"
+    "st1 { v28.h }[0], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x9\n"
+    "st1 { v27.h }[0], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x9\n"
+    "st1 { v26.h }[0], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x9\n"
+    "st1 { v25.h }[0], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x9\n"
+    "st1 { v24.h }[0], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x9\n"
+    "st1 { v23.h }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x9\n"
+    "st1 { v22.h }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x9\n"
+    "st1 { v21.h }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x9\n"
+    "st1 { v20.h }[0], [x25]\n"
+    "add x9, x9, #0x2\n"
+    "st1 { v19.h }[0], [x26]\n"
+    "tbz %x[n_output_channels], #0, 25f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x9\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x9\n"
+    "st1 { v6.b }[2], [x19]\n"
+    "add x21, x21, x9\n"
+    "st1 { v5.b }[2], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x9\n"
+    "st1 { v4.b }[2], [x21]\n"
+    "add x23, x23, x9\n"
+    "st1 { v31.b }[2], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x9\n"
+    "st1 { v30.b }[2], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x9\n"
+    "st1 { v29.b }[2], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x9\n"
+    "st1 { v28.b }[2], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x9\n"
+    "st1 { v27.b }[2], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x9\n"
+    "st1 { v26.b }[2], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x9\n"
+    "st1 { v25.b }[2], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x9\n"
+    "st1 { v24.b }[2], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x9\n"
+    "st1 { v23.b }[2], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x9\n"
+    "st1 { v22.b }[2], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x9\n"
+    "st1 { v21.b }[2], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x9\n"
+    "st1 { v20.b }[2], [x25]\n"
+    "st1 { v19.b }[2], [x26]\n"
+    "b 25f\n"
+    "24:"  // Output channel oddments: Done: Store: Bit 1: Unset
+    "tbz %x[n_output_channels], #0, 25f\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "add x19, x19, x9\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "add x20, x20, x9\n"
+    "st1 { v6.b }[0], [x19]\n"
+    "add x21, x21, x9\n"
+    "st1 { v5.b }[0], [x20]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "add x22, x22, x9\n"
+    "st1 { v4.b }[0], [x21]\n"
+    "add x23, x23, x9\n"
+    "st1 { v31.b }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "add x24, x24, x9\n"
+    "st1 { v30.b }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "add x25, x25, x9\n"
+    "st1 { v29.b }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "add x26, x26, x9\n"
+    "st1 { v28.b }[0], [x25]\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "add x19, x19, x9\n"
+    "st1 { v27.b }[0], [x26]\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "add x20, x20, x9\n"
+    "st1 { v26.b }[0], [x19]\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "add x21, x21, x9\n"
+    "st1 { v25.b }[0], [x20]\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "add x22, x22, x9\n"
+    "st1 { v24.b }[0], [x21]\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "add x23, x23, x9\n"
+    "st1 { v23.b }[0], [x22]\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "add x24, x24, x9\n"
+    "st1 { v22.b }[0], [x23]\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "add x25, x25, x9\n"
+    "st1 { v21.b }[0], [x24]\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "add x26, x26, x9\n"
+    "st1 { v20.b }[0], [x25]\n"
+    "st1 { v19.b }[0], [x26]\n"
+    "25:"  // Output channel oddments: Done: Store: Bit 1: End
+
+    "26:"  // Done
+
+    : [weights] "+&r" (weights)
+    : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..c444472
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
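+
+  // For a 3x3 kernel at stride 1, a 2x2 output tile reads a 4x4 input patch
+  // (input extent = output extent + kernel extent - 1 in each dimension).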
+
+  indirect_kern_type indirect_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+  sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..b788c70
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
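+
+  // The tile loop below recomputes the per-tile base pointers; in scalar
+  // terms, per (tile_i, tile_j):
+  //   in  = inptr  + 2 * (tile_i * ld_input_row  + tile_j * ld_input_col)  * sizeof(__fp16)
+  //   out = outptr + 2 * (tile_i * ld_output_row + tile_j * ld_output_col) * sizeof(__fp16)
+  // where the factor 2 is the 2x2 output tile size at stride 1.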
+
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x17, #0x0\n"
+    "mov x16, #0x0\n"
+    "1:"  // Tile loop
+    "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x23, #0x2\n"
+    "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x15, #0x2\n"
+    "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+    "mov x13, #0x0\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "cnth x12\n"
+    "ldr x11, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "sub x21, XZR, x12\n"
+    "ldr x10, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x17, x22\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x16, x11, x19\n" // offset += tile_j * ld_input_col
+    "ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x23\n" // offset *= kernel_stride * output_size
+    "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x10, x10, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "add x27, x10, x22, LSL #1\n"
+    "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x26, x27, x22, LSL #1\n"
+    "ld1h { z16.h }, p3/Z, [x14]\n" // Load from weights and bias
+    "mov z31.d, z16.d\n"
+    "ld1h { z0.h }, p3/Z, [x14, #1, MUL VL]\n" // Load from weights and bias
+    "add x25, x26, x22, LSL #1\n"
+    "mov z30.d, z16.d\n"
+    "ld1h { z1.h }, p3/Z, [x14, #2, MUL VL]\n" // Load from weights and bias
+    "add x24, x11, x11\n"
+    "mov z29.d, z16.d\n"
+    "ld1h { z2.h }, p3/Z, [x14, #3, MUL VL]\n" // Load from weights and bias
+    "add x23, x24, x11\n"
+    "mov z28.d, z16.d\n"
+    "ld1h { z3.h }, p3/Z, [x14, #4, MUL VL]\n" // Load from weights and bias
+    "mul x19, x17, x20\n" // offset = tile_i * ld_output_row
+    "ld1h { z4.h }, p3/Z, [x14, #5, MUL VL]\n" // Load from weights and bias
+    "madd x19, x16, x9, x19\n" // offset += tile_j * ld_output_col
+    "ld1h { z5.h }, p3/Z, [x14, #6, MUL VL]\n" // Load from weights and bias
+    "mul x19, x19, x15\n" // offset *= output_tile_size
+    "ld1h { z6.h }, p3/Z, [x14, #7, MUL VL]\n" // Load from weights and bias
+    "add x28, x28, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z9.h }, p2/Z, [x27, x11, LSL #1]\n" // Load input point (1, 1)
+    "ld1h { z10.h }, p2/Z, [x10]\n" // Load input point (0, 0)
+    "add x22, x28, x20, LSL #1\n"
+    "ld1h { z11.h }, p2/Z, [x10, x23, LSL #1]\n" // Load input point (0, 3)
+    "addvl x14, x14, #16\n"
+    "ld1h { z12.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (1, 2)
+    "cmp x12, %x[n_channels]\n"
+    "ld1h { z7.h }, p3/Z, [x14, #-8, MUL VL]\n" // Load from weights and bias
+    "ld1h { z8.h }, p3/Z, [x14, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x14, x14, #-6\n"
+    "ld1h { z13.h }, p2/Z, [x26, x11, LSL #1]\n" // Load input point (2, 1)
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "fmla z31.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z16.h }, p3/Z, [x14]\n" // Load from weights and bias
+    "whilelt p1.h, x12, %x[n_channels]\n"
+    "fmla z30.h, p3/M, z3.h, z9.h\n"
+    "inch x21\n"
+    "fmla z29.h, p3/M, z1.h, z9.h\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x25]\n" // Load input point (3, 0)
+    "inch x13\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x26, x24, LSL #1]\n" // Load input point (2, 2)
+    "inch x12\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x23, LSL #1]\n" // Load input point (3, 3)
+    "fmla z29.h, p3/M, z2.h, z12.h\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "fmla z31.h, p3/M, z5.h, z12.h\n"
+    "fmla z30.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x10, x11, LSL #1]\n" // Load input point (0, 1)
+    "fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x10, x24, LSL #1]\n" // Load input point (0, 2)
+    "addvl x10, x10, #1\n"
+    "fmla z28.h, p3/M, z3.h, z13.h\n"
+    "fmla z31.h, p3/M, z7.h, z13.h\n"
+    "fmla z30.h, p3/M, z6.h, z13.h\n"
+    "fmla z29.h, p3/M, z4.h, z13.h\n"
+    "fmla z28.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27]\n" // Load input point (1, 0)
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "fmla z30.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (1, 3)
+    "addvl x27, x27, #1\n"
+    "fmla z29.h, p3/M, z5.h, z10.h\n"
+    "fmla z28.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x14, #5, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z9.h\n"
+    "fmla z30.h, p3/M, z1.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x26]\n" // Load input point (2, 0)
+    "ld1h { z1.h }, p3/Z, [x14, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x14, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z28.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z2.h }, p3/Z, [x14, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z8.h, z10.h\n"
+    "fmla z30.h, p3/M, z7.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x26, x23, LSL #1]\n" // Load input point (2, 3)
+    "addvl x26, x26, #1\n"
+    "fmla z29.h, p3/M, z3.h, z9.h\n"
+    "ld1h { z13.h }, p1/Z, [x26, x11, LSL #1]\n" // Load input point (2, 1)
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x11, LSL #1]\n" // Load input point (3, 1)
+    "fmla z28.h, p3/M, z5.h, z10.h\n"
+    "ld1h { z3.h }, p3/Z, [x14, #4, MUL VL]\n" // Load from weights and bias
+    "fmla z30.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x25, x24, LSL #1]\n" // Load input point (3, 2)
+    "whilelt p2.h, x13, %x[n_channels]\n"
+    "fmla z29.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z5.h }, p3/Z, [x14, #6, MUL VL]\n" // Load from weights and bias
+    "addvl x25, x25, #1\n"
+    "fmla z31.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z9.h }, p1/Z, [x27, x11, LSL #1]\n" // Load input point (1, 1)
+    "cmp x12, %x[n_channels]\n"
+    "fmla z30.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z10.h }, p1/Z, [x10]\n" // Load input point (0, 0)
+    "fmla z28.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x10, x23, LSL #1]\n" // Load input point (0, 3)
+    "ld1h { z6.h }, p3/Z, [x14, #7, MUL VL]\n" // Load from weights and bias
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "addvl x14, x14, #16\n"
+    "fmax z31.h, p3/M, z31.h, z18.h\n"
+    "ld1h { z8.h }, p3/Z, [x14, #-7, MUL VL]\n" // Load from weights and bias
+    "fmla z28.h, p3/M, z7.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x27, x24, LSL #1]\n" // Load input point (1, 2)
+    "fmax z30.h, p3/M, z30.h, z18.h\n"
+    "ld1h { z7.h }, p3/Z, [x14, #-8, MUL VL]\n" // Load from weights and bias
+    "addvl x14, x14, #-6\n"
+    "fmax z29.h, p3/M, z29.h, z18.h\n"
+    "fmin z31.h, p3/M, z31.h, z17.h\n"
+    "st1h { z31.h }, p0, [x28]\n" // Store output point (0, 0)
+    "mov z31.d, z16.d\n"
+    "fmin z30.h, p3/M, z30.h, z17.h\n"
+    "st1h { z30.h }, p0, [x28, x9, LSL #1]\n" // Store output point (0, 1)
+    "mov z30.d, z16.d\n"
+    "addvl x28, x28, #1\n"
+    "fmin z29.h, p3/M, z29.h, z17.h\n"
+    "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+    "mov z29.d, z16.d\n"
+    "fmax z28.h, p3/M, z28.h, z18.h\n"
+    "fmin z28.h, p3/M, z28.h, z17.h\n"
+    "st1h { z28.h }, p0, [x22, x9, LSL #1]\n" // Store output point (1, 1)
+    "mov z28.d, z16.d\n"
+    "addvl x22, x22, #1\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "fmla z31.h, p3/M, z4.h, z9.h\n"
+    "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z30.h, p3/M, z3.h, z9.h\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "add x21, x17, #0x1\n"
+    "fmla z29.h, p3/M, z1.h, z9.h\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z28.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x25]\n" // Load input point (3, 0)
+    "add x16, x16, #0x1\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x26, x24, LSL #1]\n" // Load input point (2, 2)
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x23, LSL #1]\n" // Load input point (3, 3)
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "fmla z29.h, p3/M, z2.h, z12.h\n"
+    "cmp x16, x19\n"
+    "fmla z31.h, p3/M, z5.h, z12.h\n"
+    "fmla z30.h, p3/M, z4.h, z12.h\n"
+    "csel x16, x16, XZR, LT\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x10, x11, LSL #1]\n" // Load input point (0, 1)
+    "csel x17, x17, x21, LT\n"
+    "fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x10, x24, LSL #1]\n" // Load input point (0, 2)
+    "cmp x17, x20\n"
+    "fmla z31.h, p3/M, z7.h, z13.h\n"
+    "fmla z30.h, p3/M, z6.h, z13.h\n"
+    "fmla z28.h, p3/M, z3.h, z13.h\n"
+    "fmla z29.h, p3/M, z4.h, z13.h\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "fmla z30.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (1, 3)
+    "fmla z28.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27]\n" // Load input point (1, 0)
+    "fmla z29.h, p3/M, z5.h, z10.h\n"
+    "fmla z31.h, p3/M, z2.h, z9.h\n"
+    "fmla z30.h, p3/M, z1.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x26]\n" // Load input point (2, 0)
+    "fmla z28.h, p3/M, z4.h, z10.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "fmla z31.h, p3/M, z8.h, z10.h\n"
+    "fmla z30.h, p3/M, z7.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x26, x23, LSL #1]\n" // Load input point (2, 3)
+    "fmla z28.h, p3/M, z2.h, z12.h\n"
+    "fmla z29.h, p3/M, z3.h, z9.h\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x11, LSL #1]\n" // Load input point (3, 1)
+    "fmla z30.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x25, x24, LSL #1]\n" // Load input point (3, 2)
+    "fmla z28.h, p3/M, z5.h, z10.h\n"
+    "fmla z29.h, p3/M, z7.h, z11.h\n"
+    "fmla z31.h, p3/M, z6.h, z9.h\n"
+    "fmla z30.h, p3/M, z8.h, z10.h\n"
+    "fmla z28.h, p3/M, z6.h, z11.h\n"
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "fmax z31.h, p3/M, z31.h, z18.h\n"
+    "fmax z30.h, p3/M, z30.h, z18.h\n"
+    "fmla z28.h, p3/M, z7.h, z12.h\n"
+    "fmax z29.h, p3/M, z29.h, z18.h\n"
+    "fmin z31.h, p3/M, z31.h, z17.h\n"
+    "st1h { z31.h }, p0, [x28]\n" // Store output point (0, 0)
+    "fmin z30.h, p3/M, z30.h, z17.h\n"
+    "fmin z29.h, p3/M, z29.h, z17.h\n"
+    "st1h { z30.h }, p0, [x28, x9, LSL #1]\n" // Store output point (0, 1)
+    "fmax z28.h, p3/M, z28.h, z18.h\n"
+    "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+    "fmin z28.h, p3/M, z28.h, z17.h\n"
+    "st1h { z28.h }, p0, [x22, x9, LSL #1]\n" // Store output point (1, 1)
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..d8f905b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
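+  // The 16 input pointers cover the 4x4 input patch that a 3x3 stride-1
+  // kernel needs to produce a 2x2 output block; copying them into Args gives
+  // the assembly a contiguous table it can read in pairs with LDP.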
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[16];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[0];
+      inptrs[1] = input_ptrs[1];
+      inptrs[2] = input_ptrs[2];
+      inptrs[3] = input_ptrs[3];
+      inptrs[4] = input_ptrs[4];
+      inptrs[5] = input_ptrs[5];
+      inptrs[6] = input_ptrs[6];
+      inptrs[7] = input_ptrs[7];
+      inptrs[8] = input_ptrs[8];
+      inptrs[9] = input_ptrs[9];
+      inptrs[10] = input_ptrs[10];
+      inptrs[11] = input_ptrs[11];
+      inptrs[12] = input_ptrs[12];
+      inptrs[13] = input_ptrs[13];
+      inptrs[14] = input_ptrs[14];
+      inptrs[15] = input_ptrs[15];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x3, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ptrue p3.b\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x19, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "mov x5, #0x0\n"
+    "ldp x6, x7, [x19, #0x0]\n"
+    "cnth x8\n"
+    "ldp x17, x16, [x19, #0x10]\n"
+    "sub x15, XZR, x8\n"
+    "ldp x14, x13, [x19, #0x20]\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ldp x12, x11, [x19, #0x30]\n"
+    "cmp x8, %x[n_channels]\n"
+    "ldp x10, x9, [x19, #0x40]\n"
+    "ldp x28, x27, [x19, #0x50]\n"
+    "ldp x26, x25, [x19, #0x60]\n"
+    "ldp x24, x23, [x19, #0x70]\n"
+    "ldp x22, x21, [x3, #0x0]\n"
+    "ldp x20, x19, [x3, #0x10]\n"
+    "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "ld1h { z16.h }, p3/Z, [x4]\n" // Load from weights and bias
+    "mov z31.d, z16.d\n"
+    "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n" // Load from weights and bias
+    "mov z30.d, z16.d\n"
+    "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n" // Load from weights and bias
+    "mov z29.d, z16.d\n"
+    "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n" // Load from weights and bias
+    "mov z28.d, z16.d\n"
+    "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n" // Load from weights and bias
+    "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n" // Load from weights and bias
+    "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n" // Load from weights and bias
+    "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n" // Load from weights and bias
+    "addvl x4, x4, #16\n"
+    "ld1h { z9.h }, p2/Z, [x13, x5, LSL #1]\n"
+    "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n" // Load from weights and bias
+    "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x4, x4, #-6\n"
+    "ld1h { z10.h }, p2/Z, [x6, x5, LSL #1]\n"
+    "ld1h { z11.h }, p2/Z, [x16, x5, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x12, x5, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x9, x5, LSL #1]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "fmla z31.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z16.h }, p3/Z, [x4]\n" // Load from weights and bias
+    "whilelt p1.h, x8, %x[n_channels]\n"
+    "fmla z30.h, p3/M, z3.h, z9.h\n"
+    "inch x15\n"
+    "fmla z29.h, p3/M, z1.h, z9.h\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x26, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x28, x5, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x23, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z12.h\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "fmla z31.h, p3/M, z5.h, z12.h\n"
+    "fmla z30.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x7, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x17, x5, LSL #1]\n"
+    "fmla z28.h, p3/M, z3.h, z13.h\n"
+    "fmla z31.h, p3/M, z7.h, z13.h\n"
+    "fmla z30.h, p3/M, z6.h, z13.h\n"
+    "fmla z29.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p1/Z, [x9, x8, LSL #1]\n"
+    "fmla z28.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "fmla z30.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z5.h, z10.h\n"
+    "fmla z28.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z9.h\n"
+    "fmla z30.h, p3/M, z1.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x10, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z28.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n" // Load from weights and bias
+    "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z8.h, z10.h\n"
+    "fmla z30.h, p3/M, z7.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x27, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z3.h, z9.h\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x5, LSL #1]\n"
+    "fmla z28.h, p3/M, z5.h, z10.h\n"
+    "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n" // Load from weights and bias
+    "fmla z30.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x5, LSL #1]\n"
+    "inch x5\n"
+    "fmla z29.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n" // Load from weights and bias
+    "whilelt p2.h, x5, %x[n_channels]\n"
+    "fmla z31.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z9.h }, p1/Z, [x13, x8, LSL #1]\n"
+    "fmla z28.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x16, x8, LSL #1]\n"
+    "fmla z30.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z10.h }, p1/Z, [x6, x8, LSL #1]\n"
+    "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n" // Load from weights and bias
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "addvl x4, x4, #16\n"
+    "fmla z28.h, p3/M, z7.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x12, x8, LSL #1]\n"
+    "inch x8\n"
+    "fmax z31.h, p3/M, z31.h, z18.h\n"
+    "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n" // Load from weights and bias
+    "cmp x8, %x[n_channels]\n"
+    "fmax z30.h, p3/M, z30.h, z18.h\n"
+    "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x4, x4, #-6\n"
+    "fmax z29.h, p3/M, z29.h, z18.h\n"
+    "fmax z28.h, p3/M, z28.h, z18.h\n"
+    "fmin z31.h, p3/M, z31.h, z17.h\n"
+    "st1h { z31.h }, p0, [x22, x15, LSL #1]\n"
+    "mov z31.d, z16.d\n"
+    "fmin z30.h, p3/M, z30.h, z17.h\n"
+    "st1h { z30.h }, p0, [x21, x15, LSL #1]\n"
+    "mov z30.d, z16.d\n"
+    "fmin z29.h, p3/M, z29.h, z17.h\n"
+    "st1h { z29.h }, p0, [x20, x15, LSL #1]\n"
+    "mov z29.d, z16.d\n"
+    "fmin z28.h, p3/M, z28.h, z17.h\n"
+    "st1h { z28.h }, p0, [x19, x15, LSL #1]\n"
+    "mov z28.d, z16.d\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "fmla z31.h, p3/M, z4.h, z9.h\n"
+    "inch x15\n"
+    "fmla z30.h, p3/M, z3.h, z9.h\n"
+    "mov p0.b, p2.b\n"
+    "fmla z29.h, p3/M, z1.h, z9.h\n"
+    "fmla z28.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x26, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x28, x5, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x23, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z12.h\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "fmla z31.h, p3/M, z5.h, z12.h\n"
+    "fmla z30.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x7, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x17, x5, LSL #1]\n"
+    "fmla z28.h, p3/M, z3.h, z13.h\n"
+    "fmla z31.h, p3/M, z7.h, z13.h\n"
+    "fmla z30.h, p3/M, z6.h, z13.h\n"
+    "fmla z29.h, p3/M, z4.h, z13.h\n"
+    "fmla z28.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "fmla z30.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z5.h, z10.h\n"
+    "fmla z28.h, p3/M, z4.h, z10.h\n"
+    "fmla z31.h, p3/M, z2.h, z9.h\n"
+    "fmla z30.h, p3/M, z1.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x10, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "fmla z28.h, p3/M, z2.h, z12.h\n"
+    "fmla z31.h, p3/M, z8.h, z10.h\n"
+    "fmla z30.h, p3/M, z7.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x27, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z3.h, z9.h\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x5, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x5, LSL #1]\n"
+    "fmla z28.h, p3/M, z5.h, z10.h\n"
+    "fmla z29.h, p3/M, z7.h, z11.h\n"
+    "fmla z31.h, p3/M, z6.h, z9.h\n"
+    "fmla z30.h, p3/M, z8.h, z10.h\n"
+    "fmla z28.h, p3/M, z6.h, z11.h\n"
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "fmax z31.h, p3/M, z31.h, z18.h\n"
+    "fmax z30.h, p3/M, z30.h, z18.h\n"
+    "fmla z28.h, p3/M, z7.h, z12.h\n"
+    "fmax z29.h, p3/M, z29.h, z18.h\n"
+    "fmin z31.h, p3/M, z31.h, z17.h\n"
+    "st1h { z31.h }, p0, [x22, x15, LSL #1]\n"
+    "fmin z30.h, p3/M, z30.h, z17.h\n"
+    "fmin z29.h, p3/M, z29.h, z17.h\n"
+    "st1h { z30.h }, p0, [x21, x15, LSL #1]\n"
+    "fmax z28.h, p3/M, z28.h, z18.h\n"
+    "st1h { z29.h }, p0, [x20, x15, LSL #1]\n"
+    "fmin z28.h, p3/M, z28.h, z17.h\n"
+    "st1h { z28.h }, p0, [x19, x15, LSL #1]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000..f5d31e6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 3;
+  constexpr static unsigned int output_cols = 3;
+
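+  // Input extent for stride 1 is output + kernel - 1 = 3 + 3 - 1 = 5.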
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
+  indirect_kern_type indirect_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+  sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..aebf0bf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
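+    // Written back by the assembly: the tile loop stores the current
+    // (tile_i, tile_j) at the top of each tile and increments/wraps them in
+    // the channel tail before branching back.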
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
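+    // Outer loop walks the 3x3 output tiles; the inner loop processes one
+    // SVE vector of __fp16 channels per pass, clamping each result between
+    // the min (z18) and max (z17) activation bounds as it is stored.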
+    "ptrue p3.b\n"
+    "mov x6, #0x0\n"
+    "mov x7, #0x0\n"
+    "1:"  // Tile loop
+    "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x24, #0x3\n"
+    "str x7, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x23, #0x3\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+    "mov x17, #0x0\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "cnth x16\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "sub x21, XZR, x16\n"
+    "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x6, x22\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x7, x15, x19\n" // offset += tile_j * ld_input_col
+    "ldr x13, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x24\n" // offset *= kernel_stride * output_size
+    "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x14, x14, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "add x11, x14, x22, LSL #1\n"
+    "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x10, x11, x22, LSL #1\n"
+    "ld1h { z16.h }, p3/Z, [x8]\n" // Load from weights and bias
+    "mov z31.d, z16.d\n"
+    "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+    "add x9, x10, x22, LSL #1\n"
+    "mov z30.d, z16.d\n"
+    "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+    "add x28, x9, x22, LSL #1\n"
+    "mov z29.d, z16.d\n"
+    "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+    "add x27, x15, x15\n"
+    "mov z28.d, z16.d\n"
+    "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias
+    "add x26, x27, x15\n"
+    "mov z27.d, z16.d\n"
+    "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias
+    "add x25, x26, x15\n"
+    "mov z26.d, z16.d\n"
+    "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias
+    "mul x19, x6, x20\n" // offset = tile_i * ld_output_row
+    "mov z25.d, z16.d\n"
+    "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias
+    "madd x19, x7, x13, x19\n" // offset += tile_j * ld_output_col
+    "mov z24.d, z16.d\n"
+    "mul x19, x19, x23\n" // offset *= output_tile_size
+    "mov z23.d, z16.d\n"
+    "add x12, x12, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "add x24, x13, x13\n"
+    "add x23, x12, x20, LSL #1\n"
+    "add x22, x23, x20, LSL #1\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z9.h }, p2/Z, [x10, x27, LSL #1]\n" // Load input point (2, 2)
+    "ld1h { z10.h }, p2/Z, [x14]\n" // Load input point (0, 0)
+    "addvl x8, x8, #16\n"
+    "ld1h { z11.h }, p2/Z, [x14, x25, LSL #1]\n" // Load input point (0, 4)
+    "cmp x16, %x[n_channels]\n"
+    "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias
+    "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x8, x8, #-6\n"
+    "ld1h { z12.h }, p2/Z, [x28]\n" // Load input point (4, 0)
+    "ld1h { z13.h }, p2/Z, [x11, x27, LSL #1]\n" // Load input point (1, 2)
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "fmla z31.h, p3/M, z8.h, z9.h\n"
+    "ld1h { z16.h }, p3/Z, [x8]\n" // Load from weights and bias
+    "whilelt p1.h, x16, %x[n_channels]\n"
+    "fmla z30.h, p3/M, z7.h, z9.h\n"
+    "inch x21\n"
+    "fmla z29.h, p3/M, z6.h, z9.h\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.h, p3/M, z5.h, z9.h\n"
+    "inch x17\n"
+    "fmla z27.h, p3/M, z4.h, z9.h\n"
+    "inch x16\n"
+    "fmla z26.h, p3/M, z3.h, z9.h\n"
+    "fmla z25.h, p3/M, z2.h, z9.h\n"
+    "fmla z24.h, p3/M, z1.h, z9.h\n"
+    "fmla z23.h, p3/M, z0.h, z9.h\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x10, x26, LSL #1]\n" // Load input point (2, 3)
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n" // Load input point (2, 1)
+    "fmla z25.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (4, 4)
+    "fmla z30.h, p3/M, z4.h, z13.h\n"
+    "fmla z31.h, p3/M, z5.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "fmla z28.h, p3/M, z2.h, z13.h\n"
+    "fmla z27.h, p3/M, z1.h, z13.h\n"
+    "fmla z26.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n" // Load input point (0, 1)
+    "fmla z23.h, p3/M, z8.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n" // Load input point (0, 3)
+    "fmla z31.h, p3/M, z7.h, z11.h\n"
+    "fmla z30.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z4.h, z11.h\n"
+    "fmla z27.h, p3/M, z3.h, z11.h\n"
+    "fmla z25.h, p3/M, z1.h, z11.h\n"
+    "fmla z24.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11]\n" // Load input point (1, 0)
+    "fmla z31.h, p3/M, z1.h, z13.h\n"
+    "fmla z30.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x11, x25, LSL #1]\n" // Load input point (1, 4)
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "fmla z27.h, p3/M, z5.h, z10.h\n"
+    "fmla z26.h, p3/M, z4.h, z10.h\n"
+    "fmla z30.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x9]\n" // Load input point (3, 0)
+    "fmla z29.h, p3/M, z7.h, z10.h\n"
+    "fmla z24.h, p3/M, z2.h, z10.h\n"
+    "fmla z23.h, p3/M, z1.h, z10.h\n"
+    "fmla z30.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n" // Load input point (3, 2)
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (3, 4)
+    "fmla z29.h, p3/M, z5.h, z13.h\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n" // Load input point (4, 1)
+    "fmla z25.h, p3/M, z3.h, z12.h\n"
+    "fmla z28.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n" // Load input point (1, 1)
+    "fmla z27.h, p3/M, z7.h, z10.h\n"
+    "fmla z26.h, p3/M, z6.h, z10.h\n"
+    "fmla z25.h, p3/M, z5.h, z10.h\n"
+    "fmla z28.h, p3/M, z8.h, z10.h\n"
+    "fmla z24.h, p3/M, z4.h, z10.h\n"
+    "fmla z23.h, p3/M, z3.h, z10.h\n"
+    "fmla z26.h, p3/M, z8.h, z11.h\n"
+    "fmla z25.h, p3/M, z7.h, z13.h\n"
+    "fmla z24.h, p3/M, z6.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x28, x26, LSL #1]\n" // Load input point (4, 3)
+    "fmla z23.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x26, LSL #1]\n" // Load input point (1, 3)
+    "addvl x11, x11, #1\n"
+    "fmla z31.h, p3/M, z4.h, z12.h\n"
+    "fmla z30.h, p3/M, z3.h, z12.h\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "fmla z27.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n" // Load input point (3, 1)
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z5.h, z11.h\n"
+    "fmla z26.h, p3/M, z1.h, z11.h\n"
+    "fmla z27.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x27, LSL #1]\n" // Load input point (0, 2)
+    "addvl x14, x14, #1\n"
+    "fmla z24.h, p3/M, z8.h, z13.h\n"
+    "ld1h { z10.h }, p1/Z, [x14]\n" // Load input point (0, 0)
+    "fmla z23.h, p3/M, z7.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n" // Load input point (3, 3)
+    "addvl x9, x9, #1\n"
+    "fmla z28.h, p3/M, z7.h, z12.h\n"
+    "fmla z27.h, p3/M, z6.h, z12.h\n"
+    "fmla z25.h, p3/M, z4.h, z12.h\n"
+    "fmla z24.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x10]\n" // Load input point (2, 0)
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "fmla z30.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x10, x25, LSL #1]\n" // Load input point (2, 4)
+    "addvl x10, x10, #1\n"
+    "fmla z27.h, p3/M, z8.h, z13.h\n"
+    "ld1h { z9.h }, p1/Z, [x10, x27, LSL #1]\n" // Load input point (2, 2)
+    "fmla z26.h, p3/M, z7.h, z13.h\n"
+    "fmla z24.h, p3/M, z5.h, z13.h\n"
+    "fmla z23.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n" // Load input point (4, 2)
+    "whilelt p2.h, x17, %x[n_channels]\n"
+    "fmla z31.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias
+    "addvl x28, x28, #1\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias
+    "cmp x16, %x[n_channels]\n"
+    "fmla z25.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x28]\n" // Load input point (4, 0)
+    "fmla z29.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z26.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias
+    "fmla z23.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x14, x25, LSL #1]\n" // Load input point (0, 4)
+    "fmla z24.h, p3/M, z7.h, z13.h\n"
+    "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z25.h, p3/M, z8.h, z13.h\n"
+    "fmax z31.h, p3/M, z31.h, z18.h\n"
+    "fmla z23.h, p3/M, z6.h, z13.h\n"
+    "ld1h { z13.h }, p1/Z, [x11, x27, LSL #1]\n" // Load input point (1, 2)
+    "fmax z30.h, p3/M, z30.h, z18.h\n"
+    "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias
+    "addvl x8, x8, #16\n"
+    "fmin z31.h, p3/M, z31.h, z17.h\n"
+    "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias
+    "fmax z29.h, p3/M, z29.h, z18.h\n"
+    "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x8, x8, #-6\n"
+    "fmin z30.h, p3/M, z30.h, z17.h\n"
+    "st1h { z31.h }, p0, [x12]\n" // Store output point (0, 0)
+    "mov z31.d, z16.d\n"
+    "fmin z29.h, p3/M, z29.h, z17.h\n"
+    "st1h { z30.h }, p0, [x12, x13, LSL #1]\n" // Store output point (0, 1)
+    "mov z30.d, z16.d\n"
+    "fmax z28.h, p3/M, z28.h, z18.h\n"
+    "st1h { z29.h }, p0, [x12, x24, LSL #1]\n" // Store output point (0, 2)
+    "mov z29.d, z16.d\n"
+    "addvl x12, x12, #1\n"
+    "fmax z27.h, p3/M, z27.h, z18.h\n"
+    "fmax z26.h, p3/M, z26.h, z18.h\n"
+    "fmin z28.h, p3/M, z28.h, z17.h\n"
+    "st1h { z28.h }, p0, [x23]\n" // Store output point (1, 0)
+    "mov z28.d, z16.d\n"
+    "fmin z27.h, p3/M, z27.h, z17.h\n"
+    "st1h { z27.h }, p0, [x23, x13, LSL #1]\n" // Store output point (1, 1)
+    "mov z27.d, z16.d\n"
+    "fmin z26.h, p3/M, z26.h, z17.h\n"
+    "st1h { z26.h }, p0, [x23, x24, LSL #1]\n" // Store output point (1, 2)
+    "mov z26.d, z16.d\n"
+    "addvl x23, x23, #1\n"
+    "fmax z25.h, p3/M, z25.h, z18.h\n"
+    "fmax z24.h, p3/M, z24.h, z18.h\n"
+    "fmax z23.h, p3/M, z23.h, z18.h\n"
+    "fmin z25.h, p3/M, z25.h, z17.h\n"
+    "st1h { z25.h }, p0, [x22]\n" // Store output point (2, 0)
+    "mov z25.d, z16.d\n"
+    "fmin z24.h, p3/M, z24.h, z17.h\n"
+    "st1h { z24.h }, p0, [x22, x13, LSL #1]\n" // Store output point (2, 1)
+    "mov z24.d, z16.d\n"
+    "fmin z23.h, p3/M, z23.h, z17.h\n"
+    "st1h { z23.h }, p0, [x22, x24, LSL #1]\n" // Store output point (2, 2)
+    "mov z23.d, z16.d\n"
+    "addvl x22, x22, #1\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "fmla z31.h, p3/M, z8.h, z9.h\n"
+    "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z30.h, p3/M, z7.h, z9.h\n"
+    "ldr x7, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "add x21, x6, #0x1\n"
+    "fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z28.h, p3/M, z5.h, z9.h\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "add x7, x7, #0x1\n"
+    "fmla z27.h, p3/M, z4.h, z9.h\n"
+    "cmp x7, x19\n"
+    "fmla z26.h, p3/M, z3.h, z9.h\n"
+    "fmla z25.h, p3/M, z2.h, z9.h\n"
+    "csel x7, x7, XZR, LT\n"
+    "fmla z24.h, p3/M, z1.h, z9.h\n"
+    "csel x6, x6, x21, LT\n"
+    "fmla z23.h, p3/M, z0.h, z9.h\n"
+    "cmp x6, x20\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x10, x26, LSL #1]\n" // Load input point (2, 3)
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n" // Load input point (2, 1)
+    "fmla z25.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (4, 4)
+    "fmla z30.h, p3/M, z4.h, z13.h\n"
+    "fmla z31.h, p3/M, z5.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "fmla z28.h, p3/M, z2.h, z13.h\n"
+    "fmla z27.h, p3/M, z1.h, z13.h\n"
+    "fmla z26.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n" // Load input point (0, 1)
+    "fmla z23.h, p3/M, z8.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n" // Load input point (0, 3)
+    "fmla z31.h, p3/M, z7.h, z11.h\n"
+    "fmla z30.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z4.h, z11.h\n"
+    "fmla z27.h, p3/M, z3.h, z11.h\n"
+    "fmla z25.h, p3/M, z1.h, z11.h\n"
+    "fmla z24.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11]\n" // Load input point (1, 0)
+    "fmla z31.h, p3/M, z1.h, z13.h\n"
+    "fmla z30.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x11, x25, LSL #1]\n" // Load input point (1, 4)
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "fmla z27.h, p3/M, z5.h, z10.h\n"
+    "fmla z26.h, p3/M, z4.h, z10.h\n"
+    "fmla z30.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x9]\n" // Load input point (3, 0)
+    "fmla z29.h, p3/M, z7.h, z10.h\n"
+    "fmla z24.h, p3/M, z2.h, z10.h\n"
+    "fmla z23.h, p3/M, z1.h, z10.h\n"
+    "fmla z30.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n" // Load input point (3, 2)
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (3, 4)
+    "fmla z29.h, p3/M, z5.h, z13.h\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n" // Load input point (4, 1)
+    "fmla z25.h, p3/M, z3.h, z12.h\n"
+    "fmla z28.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n" // Load input point (1, 1)
+    "fmla z27.h, p3/M, z7.h, z10.h\n"
+    "fmla z26.h, p3/M, z6.h, z10.h\n"
+    "fmla z25.h, p3/M, z5.h, z10.h\n"
+    "fmla z28.h, p3/M, z8.h, z10.h\n"
+    "fmla z24.h, p3/M, z4.h, z10.h\n"
+    "fmla z23.h, p3/M, z3.h, z10.h\n"
+    "fmla z26.h, p3/M, z8.h, z11.h\n"
+    "fmla z25.h, p3/M, z7.h, z13.h\n"
+    "fmla z24.h, p3/M, z6.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x28, x26, LSL #1]\n" // Load input point (4, 3)
+    "fmla z23.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x26, LSL #1]\n" // Load input point (1, 3)
+    "fmla z31.h, p3/M, z4.h, z12.h\n"
+    "fmla z30.h, p3/M, z3.h, z12.h\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "fmla z27.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n" // Load input point (3, 1)
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z5.h, z11.h\n"
+    "fmla z26.h, p3/M, z1.h, z11.h\n"
+    "fmla z27.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x27, LSL #1]\n" // Load input point (0, 2)
+    "fmla z24.h, p3/M, z8.h, z13.h\n"
+    "fmla z23.h, p3/M, z7.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n" // Load input point (3, 3)
+    "fmla z28.h, p3/M, z7.h, z12.h\n"
+    "fmla z27.h, p3/M, z6.h, z12.h\n"
+    "fmla z25.h, p3/M, z4.h, z12.h\n"
+    "fmla z24.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x10]\n" // Load input point (2, 0)
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "fmla z30.h, p3/M, z1.h, z11.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x10, x25, LSL #1]\n" // Load input point (2, 4)
+    "fmla z27.h, p3/M, z8.h, z13.h\n"
+    "fmla z26.h, p3/M, z7.h, z13.h\n"
+    "fmla z24.h, p3/M, z5.h, z13.h\n"
+    "fmla z23.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n" // Load input point (4, 2)
+    "fmla z31.h, p3/M, z6.h, z12.h\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "fmla z25.h, p3/M, z0.h, z12.h\n"
+    "fmla z29.h, p3/M, z8.h, z11.h\n"
+    "fmla z26.h, p3/M, z5.h, z11.h\n"
+    "fmla z23.h, p3/M, z2.h, z11.h\n"
+    "fmla z25.h, p3/M, z8.h, z13.h\n"
+    "fmla z24.h, p3/M, z7.h, z13.h\n"
+    "fmax z31.h, p3/M, z31.h, z18.h\n"
+    "fmla z23.h, p3/M, z6.h, z13.h\n"
+    "fmax z30.h, p3/M, z30.h, z18.h\n"
+    "fmax z29.h, p3/M, z29.h, z18.h\n"
+    "fmin z31.h, p3/M, z31.h, z17.h\n"
+    "st1h { z31.h }, p0, [x12]\n" // Store output point (0, 0)
+    "fmin z30.h, p3/M, z30.h, z17.h\n"
+    "fmin z29.h, p3/M, z29.h, z17.h\n"
+    "st1h { z30.h }, p0, [x12, x13, LSL #1]\n" // Store output point (0, 1)
+    "fmax z28.h, p3/M, z28.h, z18.h\n"
+    "fmax z27.h, p3/M, z27.h, z18.h\n"
+    "st1h { z29.h }, p0, [x12, x24, LSL #1]\n" // Store output point (0, 2)
+    "fmax z26.h, p3/M, z26.h, z18.h\n"
+    "fmax z25.h, p3/M, z25.h, z18.h\n"
+    "fmax z24.h, p3/M, z24.h, z18.h\n"
+    "fmin z28.h, p3/M, z28.h, z17.h\n"
+    "st1h { z28.h }, p0, [x23]\n" // Store output point (1, 0)
+    "fmin z27.h, p3/M, z27.h, z17.h\n"
+    "fmin z26.h, p3/M, z26.h, z17.h\n"
+    "st1h { z27.h }, p0, [x23, x13, LSL #1]\n" // Store output point (1, 1)
+    "fmin z25.h, p3/M, z25.h, z17.h\n"
+    "fmin z24.h, p3/M, z24.h, z17.h\n"
+    "st1h { z26.h }, p0, [x23, x24, LSL #1]\n" // Store output point (1, 2)
+    "fmax z23.h, p3/M, z23.h, z18.h\n"
+    "st1h { z25.h }, p0, [x22]\n" // Store output point (2, 0)
+    "fmin z23.h, p3/M, z23.h, z17.h\n"
+    "st1h { z24.h }, p0, [x22, x13, LSL #1]\n" // Store output point (2, 1)
+    "st1h { z23.h }, p0, [x22, x24, LSL #1]\n" // Store output point (2, 2)
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..65ecb6d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
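+  // The 25 input pointers cover the 5x5 patch a 3x3 stride-1 kernel needs
+  // for a 3x3 output block. They are copied in the order the assembly
+  // consumes them, starting with the centre point, input_ptrs[12].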
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[25];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[4];
+      inptrs[3] = input_ptrs[20];
+      inptrs[4] = input_ptrs[7];
+      inptrs[5] = input_ptrs[24];
+      inptrs[6] = input_ptrs[11];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[3];
+      inptrs[9] = input_ptrs[13];
+      inptrs[10] = input_ptrs[5];
+      inptrs[11] = input_ptrs[9];
+      inptrs[12] = input_ptrs[15];
+      inptrs[13] = input_ptrs[17];
+      inptrs[14] = input_ptrs[19];
+      inptrs[15] = input_ptrs[21];
+      inptrs[16] = input_ptrs[6];
+      inptrs[17] = input_ptrs[8];
+      inptrs[18] = input_ptrs[23];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[18];
+      inptrs[22] = input_ptrs[10];
+      inptrs[23] = input_ptrs[14];
+      inptrs[24] = input_ptrs[22];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
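+    // x6 addresses the table of 9 output pointers and x8 the table of 25
+    // input pointers; entries are re-read inside the loop as the scalar
+    // address registers are recycled between accumulators.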
+    "ldr x6, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ptrue p3.b\n"
+    "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "mov x17, #0x0\n"
+    "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "cnth x16\n"
+    "ld1h { z16.h }, p3/Z, [x7]\n" // Load from weights and bias
+    "mov z31.d, z16.d\n"
+    "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n" // Load from weights and bias
+    "sub x15, XZR, x16\n"
+    "mov z30.d, z16.d\n"
+    "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n" // Load from weights and bias
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "mov z29.d, z16.d\n"
+    "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n" // Load from weights and bias
+    "cmp x16, %x[n_channels]\n"
+    "mov z28.d, z16.d\n"
+    "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n" // Load from weights and bias
+    "mov z27.d, z16.d\n"
+    "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n" // Load from weights and bias
+    "mov z26.d, z16.d\n"
+    "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n" // Load from weights and bias
+    "mov z25.d, z16.d\n"
+    "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n" // Load from weights and bias
+    "addvl x7, x7, #16\n"
+    "mov z24.d, z16.d\n"
+    "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n" // Load from weights and bias
+    "mov z23.d, z16.d\n"
+    "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x7, x7, #-6\n"
+    "ldp x14, x13, [x8, #0x0]\n"
+    "ldp x12, x11, [x8, #0x10]\n"
+    "ldr x10, [x8, #0x20]\n"
+    "ld1h { z9.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x13, x17, LSL #1]\n"
+    "ld1h { z11.h }, p2/Z, [x12, x17, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x11, x17, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x10, x17, LSL #1]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "fmla z31.h, p3/M, z8.h, z9.h\n"
+    "ldr x9, [x8, #0x28]\n"
+    "whilelt p1.h, x16, %x[n_channels]\n"
+    "fmla z30.h, p3/M, z7.h, z9.h\n"
+    "ldr x28, [x8, #0x30]\n"
+    "inch x15\n"
+    "fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ldr x27, [x8, #0x38]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.h, p3/M, z5.h, z9.h\n"
+    "ldr x26, [x8, #0x40]\n"
+    "fmla z27.h, p3/M, z4.h, z9.h\n"
+    "ldr x22, [x8, #0x48]\n"
+    "fmla z26.h, p3/M, z3.h, z9.h\n"
+    "ldr x21, [x8, #0x50]\n"
+    "fmla z25.h, p3/M, z2.h, z9.h\n"
+    "ldr x20, [x8, #0x58]\n"
+    "fmla z24.h, p3/M, z1.h, z9.h\n"
+    "ldr x19, [x8, #0x60]\n"
+    "fmla z23.h, p3/M, z0.h, z9.h\n"
+    "ldr x25, [x8, #0x68]\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x22, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x28, x17, LSL #1]\n"
+    "fmla z25.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x9, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z13.h\n"
+    "ldr x24, [x8, #0x70]\n"
+    "fmla z31.h, p3/M, z5.h, z13.h\n"
+    "ldr x23, [x8, #0x78]\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "ldr x14, [x8, #0x80]\n"
+    "fmla z28.h, p3/M, z2.h, z13.h\n"
+    "ldr x13, [x8, #0x88]\n"
+    "fmla z27.h, p3/M, z1.h, z13.h\n"
+    "ldr x12, [x8, #0x90]\n"
+    "fmla z26.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x27, x17, LSL #1]\n"
+    "fmla z23.h, p3/M, z8.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x26, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z7.h, z11.h\n"
+    "ldr x11, [x8, #0x98]\n"
+    "fmla z30.h, p3/M, z6.h, z11.h\n"
+    "ldr x10, [x8, #0xa0]\n"
+    "fmla z28.h, p3/M, z4.h, z11.h\n"
+    "ldr x9, [x8, #0xa8]\n"
+    "fmla z27.h, p3/M, z3.h, z11.h\n"
+    "ldr x28, [x8, #0xb0]\n"
+    "fmla z25.h, p3/M, z1.h, z11.h\n"
+    "ldr x27, [x8, #0xb8]\n"
+    "fmla z24.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x21, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z13.h\n"
+    "ldr x26, [x8, #0xc0]\n"
+    "fmla z30.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x20, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ldr x22, [x6, #0x0]\n"
+    "fmla z27.h, p3/M, z5.h, z10.h\n"
+    "ldr x21, [x6, #0x8]\n"
+    "fmla z26.h, p3/M, z4.h, z10.h\n"
+    "ldr x20, [x6, #0x10]\n"
+    "fmla z30.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x19, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z7.h, z10.h\n"
+    "ldr x19, [x6, #0x18]\n"
+    "fmla z24.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z16.h }, p3/Z, [x7]\n" // Load from weights and bias
+    "fmla z23.h, p3/M, z1.h, z10.h\n"
+    "fmla z30.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z5.h, z13.h\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x23, x17, LSL #1]\n"
+    "fmla z25.h, p3/M, z3.h, z12.h\n"
+    "fmla z28.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z27.h, p3/M, z7.h, z10.h\n"
+    "fmla z26.h, p3/M, z6.h, z10.h\n"
+    "fmla z25.h, p3/M, z5.h, z10.h\n"
+    "fmla z28.h, p3/M, z8.h, z10.h\n"
+    "fmla z24.h, p3/M, z4.h, z10.h\n"
+    "fmla z23.h, p3/M, z3.h, z10.h\n"
+    "fmla z26.h, p3/M, z8.h, z11.h\n"
+    "fmla z25.h, p3/M, z7.h, z13.h\n"
+    "fmla z24.h, p3/M, z6.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x12, x17, LSL #1]\n"
+    "fmla z23.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z12.h\n"
+    "ldp x14, x13, [x8, #0x0]\n"
+    "fmla z30.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z9.h }, p1/Z, [x14, x16, LSL #1]\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "fmla z27.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z10.h }, p1/Z, [x13, x16, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ldp x12, x11, [x8, #0x10]\n"
+    "fmla z26.h, p3/M, z1.h, z11.h\n"
+    "fmla z27.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x10, x17, LSL #1]\n"
+    "fmla z24.h, p3/M, z8.h, z13.h\n"
+    "ldr x10, [x8, #0x20]\n"
+    "fmla z23.h, p3/M, z7.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x9, x17, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z12.h\n"
+    "fmla z27.h, p3/M, z6.h, z12.h\n"
+    "fmla z25.h, p3/M, z4.h, z12.h\n"
+    "fmla z24.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x28, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "fmla z30.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x17, LSL #1]\n"
+    "fmla z27.h, p3/M, z8.h, z13.h\n"
+    "fmla z26.h, p3/M, z7.h, z13.h\n"
+    "fmla z24.h, p3/M, z5.h, z13.h\n"
+    "fmla z23.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x26, x17, LSL #1]\n"
+    "inch x17\n"
+    "fmla z31.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n" // Load from weights and bias
+    "whilelt p2.h, x17, %x[n_channels]\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n" // Load from weights and bias
+    "fmla z25.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x11, x16, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z26.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n" // Load from weights and bias
+    "fmla z23.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x12, x16, LSL #1]\n"
+    "fmla z25.h, p3/M, z8.h, z13.h\n"
+    "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z24.h, p3/M, z7.h, z13.h\n"
+    "fmax z31.h, p3/M, z31.h, z18.h\n"
+    "fmla z23.h, p3/M, z6.h, z13.h\n"
+    "ld1h { z13.h }, p1/Z, [x10, x16, LSL #1]\n"
+    "inch x16\n"
+    "fmax z30.h, p3/M, z30.h, z18.h\n"
+    "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n" // Load from weights and bias
+    "addvl x7, x7, #16\n"
+    "fmin z31.h, p3/M, z31.h, z17.h\n"
+    "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n" // Load from weights and bias
+    "cmp x16, %x[n_channels]\n"
+    "fmax z29.h, p3/M, z29.h, z18.h\n"
+    "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x7, x7, #-6\n"
+    "fmax z28.h, p3/M, z28.h, z18.h\n"
+    "st1h { z31.h }, p0, [x22, x15, LSL #1]\n"
+    "mov z31.d, z16.d\n"
+    "fmin z30.h, p3/M, z30.h, z17.h\n"
+    "ldr x22, [x6, #0x20]\n"
+    "fmin z29.h, p3/M, z29.h, z17.h\n"
+    "st1h { z30.h }, p0, [x21, x15, LSL #1]\n"
+    "mov z30.d, z16.d\n"
+    "fmin z28.h, p3/M, z28.h, z17.h\n"
+    "st1h { z29.h }, p0, [x20, x15, LSL #1]\n"
+    "mov z29.d, z16.d\n"
+    "ldr x21, [x6, #0x28]\n"
+    "fmax z27.h, p3/M, z27.h, z18.h\n"
+    "ldr x20, [x6, #0x30]\n"
+    "fmax z26.h, p3/M, z26.h, z18.h\n"
+    "st1h { z28.h }, p0, [x19, x15, LSL #1]\n"
+    "mov z28.d, z16.d\n"
+    "ldr x19, [x6, #0x38]\n"
+    "fmax z25.h, p3/M, z25.h, z18.h\n"
+    "fmin z27.h, p3/M, z27.h, z17.h\n"
+    "st1h { z27.h }, p0, [x22, x15, LSL #1]\n"
+    "mov z27.d, z16.d\n"
+    "fmin z26.h, p3/M, z26.h, z17.h\n"
+    "ldr x22, [x6, #0x40]\n"
+    "fmin z25.h, p3/M, z25.h, z17.h\n"
+    "st1h { z26.h }, p0, [x21, x15, LSL #1]\n"
+    "mov z26.d, z16.d\n"
+    "fmax z24.h, p3/M, z24.h, z18.h\n"
+    "st1h { z25.h }, p0, [x20, x15, LSL #1]\n"
+    "mov z25.d, z16.d\n"
+    "fmax z23.h, p3/M, z23.h, z18.h\n"
+    "fmin z24.h, p3/M, z24.h, z17.h\n"
+    "st1h { z24.h }, p0, [x19, x15, LSL #1]\n"
+    "mov z24.d, z16.d\n"
+    "fmin z23.h, p3/M, z23.h, z17.h\n"
+    "st1h { z23.h }, p0, [x22, x15, LSL #1]\n"
+    "mov z23.d, z16.d\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "fmla z31.h, p3/M, z8.h, z9.h\n"
+    "ldr x9, [x8, #0x28]\n"
+    "inch x15\n"
+    "fmla z30.h, p3/M, z7.h, z9.h\n"
+    "ldr x28, [x8, #0x30]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ldr x27, [x8, #0x38]\n"
+    "fmla z28.h, p3/M, z5.h, z9.h\n"
+    "ldr x26, [x8, #0x40]\n"
+    "fmla z27.h, p3/M, z4.h, z9.h\n"
+    "ldr x22, [x8, #0x48]\n"
+    "fmla z26.h, p3/M, z3.h, z9.h\n"
+    "ldr x21, [x8, #0x50]\n"
+    "fmla z25.h, p3/M, z2.h, z9.h\n"
+    "ldr x20, [x8, #0x58]\n"
+    "fmla z24.h, p3/M, z1.h, z9.h\n"
+    "ldr x19, [x8, #0x60]\n"
+    "fmla z23.h, p3/M, z0.h, z9.h\n"
+    "ldr x25, [x8, #0x68]\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x22, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x28, x17, LSL #1]\n"
+    "fmla z25.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x9, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z13.h\n"
+    "ldr x24, [x8, #0x70]\n"
+    "fmla z31.h, p3/M, z5.h, z13.h\n"
+    "ldr x23, [x8, #0x78]\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "ldr x14, [x8, #0x80]\n"
+    "fmla z28.h, p3/M, z2.h, z13.h\n"
+    "ldr x13, [x8, #0x88]\n"
+    "fmla z27.h, p3/M, z1.h, z13.h\n"
+    "ldr x12, [x8, #0x90]\n"
+    "fmla z26.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x27, x17, LSL #1]\n"
+    "fmla z23.h, p3/M, z8.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x26, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z7.h, z11.h\n"
+    "ldr x11, [x8, #0x98]\n"
+    "fmla z30.h, p3/M, z6.h, z11.h\n"
+    "ldr x10, [x8, #0xa0]\n"
+    "fmla z28.h, p3/M, z4.h, z11.h\n"
+    "ldr x9, [x8, #0xa8]\n"
+    "fmla z27.h, p3/M, z3.h, z11.h\n"
+    "ldr x28, [x8, #0xb0]\n"
+    "fmla z25.h, p3/M, z1.h, z11.h\n"
+    "ldr x27, [x8, #0xb8]\n"
+    "fmla z24.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x21, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z13.h\n"
+    "ldr x26, [x8, #0xc0]\n"
+    "fmla z30.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x20, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ldr x22, [x6, #0x0]\n"
+    "fmla z27.h, p3/M, z5.h, z10.h\n"
+    "ldr x21, [x6, #0x8]\n"
+    "fmla z26.h, p3/M, z4.h, z10.h\n"
+    "ldr x20, [x6, #0x10]\n"
+    "fmla z30.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x19, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z7.h, z10.h\n"
+    "ldr x19, [x6, #0x18]\n"
+    "fmla z24.h, p3/M, z2.h, z10.h\n"
+    "fmla z23.h, p3/M, z1.h, z10.h\n"
+    "fmla z30.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z5.h, z13.h\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x23, x17, LSL #1]\n"
+    "fmla z25.h, p3/M, z3.h, z12.h\n"
+    "fmla z28.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z27.h, p3/M, z7.h, z10.h\n"
+    "fmla z26.h, p3/M, z6.h, z10.h\n"
+    "fmla z25.h, p3/M, z5.h, z10.h\n"
+    "fmla z28.h, p3/M, z8.h, z10.h\n"
+    "fmla z24.h, p3/M, z4.h, z10.h\n"
+    "fmla z23.h, p3/M, z3.h, z10.h\n"
+    "fmla z26.h, p3/M, z8.h, z11.h\n"
+    "fmla z25.h, p3/M, z7.h, z13.h\n"
+    "fmla z24.h, p3/M, z6.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x12, x17, LSL #1]\n"
+    "fmla z23.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z12.h\n"
+    "fmla z30.h, p3/M, z3.h, z12.h\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "fmla z27.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z5.h, z11.h\n"
+    "fmla z26.h, p3/M, z1.h, z11.h\n"
+    "fmla z27.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x10, x17, LSL #1]\n"
+    "fmla z24.h, p3/M, z8.h, z13.h\n"
+    "fmla z23.h, p3/M, z7.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x9, x17, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z12.h\n"
+    "fmla z27.h, p3/M, z6.h, z12.h\n"
+    "fmla z25.h, p3/M, z4.h, z12.h\n"
+    "fmla z24.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x28, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "fmla z30.h, p3/M, z1.h, z11.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x17, LSL #1]\n"
+    "fmla z27.h, p3/M, z8.h, z13.h\n"
+    "fmla z26.h, p3/M, z7.h, z13.h\n"
+    "fmla z24.h, p3/M, z5.h, z13.h\n"
+    "fmla z23.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x26, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z6.h, z12.h\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "fmla z25.h, p3/M, z0.h, z12.h\n"
+    "fmla z29.h, p3/M, z8.h, z11.h\n"
+    "fmla z26.h, p3/M, z5.h, z11.h\n"
+    "fmla z23.h, p3/M, z2.h, z11.h\n"
+    "fmla z25.h, p3/M, z8.h, z13.h\n"
+    "fmla z24.h, p3/M, z7.h, z13.h\n"
+    "fmax z31.h, p3/M, z31.h, z18.h\n"
+    "fmla z23.h, p3/M, z6.h, z13.h\n"
+    "fmax z30.h, p3/M, z30.h, z18.h\n"
+    "fmax z29.h, p3/M, z29.h, z18.h\n"
+    "fmin z31.h, p3/M, z31.h, z17.h\n"
+    "st1h { z31.h }, p0, [x22, x15, LSL #1]\n"
+    "fmin z30.h, p3/M, z30.h, z17.h\n"
+    "fmin z29.h, p3/M, z29.h, z17.h\n"
+    "ldr x22, [x6, #0x20]\n"
+    "fmax z28.h, p3/M, z28.h, z18.h\n"
+    "st1h { z30.h }, p0, [x21, x15, LSL #1]\n"
+    "fmax z27.h, p3/M, z27.h, z18.h\n"
+    "fmax z26.h, p3/M, z26.h, z18.h\n"
+    "st1h { z29.h }, p0, [x20, x15, LSL #1]\n"
+    "fmin z28.h, p3/M, z28.h, z17.h\n"
+    "ldr x21, [x6, #0x28]\n"
+    "fmax z25.h, p3/M, z25.h, z18.h\n"
+    "ldr x20, [x6, #0x30]\n"
+    "fmax z24.h, p3/M, z24.h, z18.h\n"
+    "st1h { z28.h }, p0, [x19, x15, LSL #1]\n"
+    "fmin z27.h, p3/M, z27.h, z17.h\n"
+    "fmin z26.h, p3/M, z26.h, z17.h\n"
+    "ldr x19, [x6, #0x38]\n"
+    "fmin z25.h, p3/M, z25.h, z17.h\n"
+    "st1h { z27.h }, p0, [x22, x15, LSL #1]\n"
+    "fmin z24.h, p3/M, z24.h, z17.h\n"
+    "fmax z23.h, p3/M, z23.h, z18.h\n"
+    "st1h { z26.h }, p0, [x21, x15, LSL #1]\n"
+    "st1h { z25.h }, p0, [x20, x15, LSL #1]\n"
+    "fmin z23.h, p3/M, z23.h, z17.h\n"
+    "st1h { z24.h }, p0, [x19, x15, LSL #1]\n"
+    "ldr x22, [x6, #0x40]\n"
+    "st1h { z23.h }, p0, [x22, x15, LSL #1]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000..f976842
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  indirect_kern_type indirect_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
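A note on the geometry above: input_rows and input_cols are not free
parameters; they follow from the kernel size, stride and output tile. A
minimal sketch of the relation (hypothetical helper, not part of this patch):

    // Input extent consumed per dimension by one output tile:
    //   stride * (output - 1) + kernel
    constexpr unsigned int input_extent(unsigned int out, unsigned int stride, unsigned int kernel)
    {
      return stride * (out - 1) + kernel;
    }
    static_assert(input_extent(4, 1, 3) == 6, "matches the 6x6 input tile declared above");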
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..8f0fce7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
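+  // The assembly below walks the grid of 4x4 output tiles (tile_i, tile_j).
+  // Within a tile, the channel dimension is processed in SVE vector-length
+  // strips under predication (whilelt); each strip accumulates the nine
+  // weight taps with fmla and clamps with the min/max activation bounds
+  // before each store.
+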
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x2, #0x0\n"
+    "mov x3, #0x0\n"
+    "1:"  // Tile loop
+    "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x24, #0x4\n"
+    "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x23, #0x4\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_params]]\n"
+    "mov x5, #0x0\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "cnth x6\n"
+    "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "sub x21, XZR, x6\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x2, x22\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x3, x7, x19\n" // offset += tile_j * ld_input_col
+    "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x24\n" // offset *= kernel_stride * output_size
+    "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x8, x8, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "add x15, x8, x22, LSL #1\n"
+    "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x14, x15, x22, LSL #1\n"
+    "ld1h { z13.h }, p3/Z, [x4]\n" // Load from weights and bias
+    "mov z31.d, z13.d\n"
+    "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n" // Load from weights and bias
+    "add x13, x14, x22, LSL #1\n"
+    "mov z30.d, z13.d\n"
+    "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n" // Load from weights and bias
+    "add x12, x13, x22, LSL #1\n"
+    "mov z29.d, z13.d\n"
+    "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n" // Load from weights and bias
+    "add x11, x12, x22, LSL #1\n"
+    "mov z28.d, z13.d\n"
+    "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n" // Load from weights and bias
+    "add x10, x7, x7\n"
+    "mov z27.d, z13.d\n"
+    "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n" // Load from weights and bias
+    "add x9, x10, x7\n"
+    "mov z26.d, z13.d\n"
+    "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n" // Load from weights and bias
+    "add x28, x9, x7\n"
+    "mov z25.d, z13.d\n"
+    "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n" // Load from weights and bias
+    "add x27, x28, x7\n"
+    "mov z24.d, z13.d\n"
+    "mul x19, x2, x20\n" // offset = tile_i * ld_output_row
+    "mov z23.d, z13.d\n"
+    "madd x19, x3, x17, x19\n" // offset += tile_j * ld_output_col
+    "mov z22.d, z13.d\n"
+    "mul x19, x19, x23\n" // offset *= output_tile_size
+    "mov z21.d, z13.d\n"
+    "add x16, x16, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "mov z20.d, z13.d\n"
+    "add x26, x16, x20, LSL #1\n"
+    "mov z19.d, z13.d\n"
+    "add x25, x26, x20, LSL #1\n"
+    "mov z18.d, z13.d\n"
+    "add x24, x25, x20, LSL #1\n"
+    "mov z17.d, z13.d\n"
+    "add x23, x17, x17\n"
+    "mov z16.d, z13.d\n"
+    "add x22, x23, x17\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z9.h }, p2/Z, [x14, x10, LSL #1]\n" // Load input point (2, 2)
+    "ld1h { z10.h }, p2/Z, [x8]\n" // Load input point (0, 0)
+    "addvl x4, x4, #16\n"
+    "ld1h { z11.h }, p2/Z, [x8, x27, LSL #1]\n" // Load input point (0, 5)
+    "cmp x6, %x[n_channels]\n"
+    "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n" // Load from weights and bias
+    "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x4, x4, #-6\n"
+    "ld1h { z12.h }, p2/Z, [x14, x9, LSL #1]\n" // Load input point (2, 3)
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "fmla z31.h, p3/M, z8.h, z9.h\n"
+    "ld1h { z13.h }, p3/Z, [x4]\n" // Load from weights and bias
+    "whilelt p1.h, x6, %x[n_channels]\n"
+    "fmla z30.h, p3/M, z7.h, z9.h\n"
+    "inch x21\n"
+    "fmla z29.h, p3/M, z6.h, z9.h\n"
+    "mov p0.b, p2.b\n"
+    "fmla z27.h, p3/M, z5.h, z9.h\n"
+    "inch x5\n"
+    "fmla z26.h, p3/M, z4.h, z9.h\n"
+    "inch x6\n"
+    "fmla z25.h, p3/M, z3.h, z9.h\n"
+    "fmla z23.h, p3/M, z2.h, z9.h\n"
+    "fmla z22.h, p3/M, z1.h, z9.h\n"
+    "fmla z21.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x13, x10, LSL #1]\n" // Load input point (3, 2)
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x11]\n" // Load input point (5, 0)
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n" // Load input point (5, 5)
+    "fmla z30.h, p3/M, z8.h, z12.h\n"
+    "fmla z29.h, p3/M, z7.h, z12.h\n"
+    "fmla z26.h, p3/M, z5.h, z12.h\n"
+    "fmla z28.h, p3/M, z6.h, z12.h\n"
+    "fmla z25.h, p3/M, z4.h, z12.h\n"
+    "fmla z24.h, p3/M, z3.h, z12.h\n"
+    "fmla z22.h, p3/M, z2.h, z12.h\n"
+    "fmla z21.h, p3/M, z1.h, z12.h\n"
+    "fmla z20.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x8, x7, LSL #1]\n" // Load input point (0, 1)
+    "fmla z19.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x9, LSL #1]\n" // Load input point (3, 3)
+    "fmla z16.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x8, x28, LSL #1]\n" // Load input point (0, 4)
+    "fmla z27.h, p3/M, z8.h, z9.h\n"
+    "fmla z26.h, p3/M, z7.h, z9.h\n"
+    "fmla z25.h, p3/M, z6.h, z9.h\n"
+    "fmla z23.h, p3/M, z5.h, z9.h\n"
+    "fmla z22.h, p3/M, z4.h, z9.h\n"
+    "fmla z21.h, p3/M, z3.h, z9.h\n"
+    "fmla z19.h, p3/M, z2.h, z9.h\n"
+    "fmla z18.h, p3/M, z1.h, z9.h\n"
+    "fmla z17.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x15]\n" // Load input point (1, 0)
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "fmla z30.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x15, x27, LSL #1]\n" // Load input point (1, 5)
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12]\n" // Load input point (4, 0)
+    "fmla z26.h, p3/M, z8.h, z10.h\n"
+    "fmla z25.h, p3/M, z7.h, z10.h\n"
+    "fmla z24.h, p3/M, z6.h, z10.h\n"
+    "fmla z22.h, p3/M, z5.h, z10.h\n"
+    "fmla z21.h, p3/M, z4.h, z10.h\n"
+    "fmla z20.h, p3/M, z3.h, z10.h\n"
+    "fmla z18.h, p3/M, z2.h, z10.h\n"
+    "fmla z17.h, p3/M, z1.h, z10.h\n"
+    "fmla z16.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x15, x10, LSL #1]\n" // Load input point (1, 2)
+    "fmla z31.h, p3/M, z3.h, z9.h\n"
+    "fmla z27.h, p3/M, z0.h, z9.h\n"
+    "fmla z28.h, p3/M, z5.h, z12.h\n"
+    "fmla z24.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x15, x9, LSL #1]\n" // Load input point (1, 3)
+    "fmla z23.h, p3/M, z6.h, z11.h\n"
+    "fmla z19.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n" // Load input point (4, 5)
+    "fmla z31.h, p3/M, z5.h, z10.h\n"
+    "fmla z30.h, p3/M, z4.h, z10.h\n"
+    "fmla z29.h, p3/M, z3.h, z10.h\n"
+    "fmla z27.h, p3/M, z2.h, z10.h\n"
+    "fmla z26.h, p3/M, z1.h, z10.h\n"
+    "fmla z25.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n" // Load input point (2, 1)
+    "fmla z20.h, p3/M, z8.h, z11.h\n"
+    "fmla z16.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n" // Load input point (5, 1)
+    "fmla z30.h, p3/M, z5.h, z12.h\n"
+    "fmla z29.h, p3/M, z4.h, z12.h\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "fmla z26.h, p3/M, z2.h, z12.h\n"
+    "fmla z25.h, p3/M, z1.h, z12.h\n"
+    "fmla z24.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x14, x28, LSL #1]\n" // Load input point (2, 4)
+    "fmla z19.h, p3/M, z7.h, z11.h\n"
+    "fmla z18.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x28, LSL #1]\n" // Load input point (5, 4)
+    "fmla z31.h, p3/M, z7.h, z10.h\n"
+    "fmla z30.h, p3/M, z6.h, z10.h\n"
+    "fmla z27.h, p3/M, z4.h, z10.h\n"
+    "fmla z26.h, p3/M, z3.h, z10.h\n"
+    "fmla z23.h, p3/M, z1.h, z10.h\n"
+    "fmla z22.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x8, x10, LSL #1]\n" // Load input point (0, 2)
+    "fmla z17.h, p3/M, z8.h, z11.h\n"
+    "fmla z16.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x7, LSL #1]\n" // Load input point (3, 1)
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "fmla z28.h, p3/M, z7.h, z12.h\n"
+    "fmla z25.h, p3/M, z5.h, z12.h\n"
+    "fmla z24.h, p3/M, z4.h, z12.h\n"
+    "fmla z21.h, p3/M, z2.h, z12.h\n"
+    "fmla z20.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x8, x9, LSL #1]\n" // Load input point (0, 3)
+    "addvl x8, x8, #1\n"
+    "fmla z31.h, p3/M, z2.h, z10.h\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z29.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x14]\n" // Load input point (2, 0)
+    "fmla z27.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z6.h, z11.h\n"
+    "fmla z23.h, p3/M, z4.h, z11.h\n"
+    "fmla z22.h, p3/M, z3.h, z11.h\n"
+    "fmla z19.h, p3/M, z1.h, z11.h\n"
+    "fmla z18.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x28, LSL #1]\n" // Load input point (3, 4)
+    "fmla z30.h, p3/M, z2.h, z12.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "fmla z28.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n" // Load input point (2, 5)
+    "addvl x14, x14, #1\n"
+    "fmla z31.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z9.h }, p1/Z, [x14, x10, LSL #1]\n" // Load input point (2, 2)
+    "fmla z27.h, p3/M, z3.h, z10.h\n"
+    "fmla z23.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13]\n" // Load input point (3, 0)
+    "fmla z25.h, p3/M, z8.h, z11.h\n"
+    "fmla z24.h, p3/M, z7.h, z11.h\n"
+    "fmla z21.h, p3/M, z5.h, z11.h\n"
+    "fmla z20.h, p3/M, z4.h, z11.h\n"
+    "fmla z17.h, p3/M, z2.h, z11.h\n"
+    "fmla z16.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x10, LSL #1]\n" // Load input point (4, 2)
+    "fmla z28.h, p3/M, z8.h, z12.h\n"
+    "fmla z24.h, p3/M, z5.h, z12.h\n"
+    "fmla z20.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n" // Load input point (3, 5)
+    "addvl x13, x13, #1\n"
+    "fmla z27.h, p3/M, z6.h, z10.h\n"
+    "fmla z23.h, p3/M, z3.h, z10.h\n"
+    "fmla z19.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x11, x10, LSL #1]\n" // Load input point (5, 2)
+    "fmla z22.h, p3/M, z7.h, z11.h\n"
+    "fmla z21.h, p3/M, z6.h, z11.h\n"
+    "fmla z23.h, p3/M, z8.h, z11.h\n"
+    "fmla z19.h, p3/M, z5.h, z11.h\n"
+    "fmla z18.h, p3/M, z4.h, z11.h\n"
+    "fmla z17.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x9, LSL #1]\n" // Load input point (4, 3)
+    "fmla z24.h, p3/M, z8.h, z12.h\n"
+    "fmla z20.h, p3/M, z5.h, z12.h\n"
+    "fmla z16.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x9, LSL #1]\n" // Load input point (5, 3)
+    "addvl x11, x11, #1\n"
+    "fmla z19.h, p3/M, z8.h, z10.h\n"
+    "fmla z18.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x15, x7, LSL #1]\n" // Load input point (1, 1)
+    "fmla z22.h, p3/M, z8.h, z11.h\n"
+    "fmla z21.h, p3/M, z7.h, z11.h\n"
+    "fmla z20.h, p3/M, z6.h, z11.h\n"
+    "fmla z18.h, p3/M, z5.h, z11.h\n"
+    "fmla z17.h, p3/M, z4.h, z11.h\n"
+    "fmla z16.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x15, x28, LSL #1]\n" // Load input point (1, 4)
+    "addvl x15, x15, #1\n"
+    "fmla z18.h, p3/M, z8.h, z12.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "fmla z17.h, p3/M, z7.h, z12.h\n"
+    "fmla z16.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x12, x7, LSL #1]\n" // Load input point (4, 1)
+    "fmla z30.h, p3/M, z3.h, z10.h\n"
+    "fmla z27.h, p3/M, z1.h, z10.h\n"
+    "fmla z26.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x28, LSL #1]\n" // Load input point (4, 4)
+    "whilelt p2.h, x5, %x[n_channels]\n"
+    "fmla z29.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n" // Load from weights and bias
+    "addvl x12, x12, #1\n"
+    "fmla z28.h, p3/M, z4.h, z11.h\n"
+    "cmp x6, %x[n_channels]\n"
+    "fmla z25.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z24.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x8, x27, LSL #1]\n" // Load input point (0, 5)
+    "fmla z23.h, p3/M, z7.h, z12.h\n"
+    "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z22.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n" // Load from weights and bias
+    "fmla z19.h, p3/M, z4.h, z12.h\n"
+    "fmla z18.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x14, x9, LSL #1]\n" // Load input point (2, 3)
+    "fmla z21.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n" // Load from weights and bias
+    "fmla z20.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z5.h, z10.h\n"
+    "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n" // Load from weights and bias
+    "fmla z16.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p1/Z, [x8]\n" // Load input point (0, 0)
+    "fmax z31.h, p3/M, z31.h, z15.h\n"
+    "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n" // Load from weights and bias
+    "addvl x4, x4, #16\n"
+    "fmax z30.h, p3/M, z30.h, z15.h\n"
+    "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n" // Load from weights and bias
+    "fmax z29.h, p3/M, z29.h, z15.h\n"
+    "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x4, x4, #-6\n"
+    "fmin z31.h, p3/M, z31.h, z14.h\n"
+    "st1h { z31.h }, p0, [x16]\n" // Store output point (0, 0)
+    "mov z31.d, z13.d\n"
+    "fmin z30.h, p3/M, z30.h, z14.h\n"
+    "st1h { z30.h }, p0, [x16, x17, LSL #1]\n" // Store output point (0, 1)
+    "mov z30.d, z13.d\n"
+    "fmin z29.h, p3/M, z29.h, z14.h\n"
+    "st1h { z29.h }, p0, [x16, x23, LSL #1]\n" // Store output point (0, 2)
+    "mov z29.d, z13.d\n"
+    "fmax z28.h, p3/M, z28.h, z15.h\n"
+    "fmax z27.h, p3/M, z27.h, z15.h\n"
+    "fmax z26.h, p3/M, z26.h, z15.h\n"
+    "fmax z25.h, p3/M, z25.h, z15.h\n"
+    "fmin z28.h, p3/M, z28.h, z14.h\n"
+    "st1h { z28.h }, p0, [x16, x22, LSL #1]\n" // Store output point (0, 3)
+    "mov z28.d, z13.d\n"
+    "addvl x16, x16, #1\n"
+    "fmin z27.h, p3/M, z27.h, z14.h\n"
+    "st1h { z27.h }, p0, [x26]\n" // Store output point (1, 0)
+    "mov z27.d, z13.d\n"
+    "fmin z26.h, p3/M, z26.h, z14.h\n"
+    "st1h { z26.h }, p0, [x26, x17, LSL #1]\n" // Store output point (1, 1)
+    "mov z26.d, z13.d\n"
+    "fmin z25.h, p3/M, z25.h, z14.h\n"
+    "st1h { z25.h }, p0, [x26, x23, LSL #1]\n" // Store output point (1, 2)
+    "mov z25.d, z13.d\n"
+    "fmax z24.h, p3/M, z24.h, z15.h\n"
+    "fmax z23.h, p3/M, z23.h, z15.h\n"
+    "fmax z22.h, p3/M, z22.h, z15.h\n"
+    "fmax z21.h, p3/M, z21.h, z15.h\n"
+    "fmin z24.h, p3/M, z24.h, z14.h\n"
+    "st1h { z24.h }, p0, [x26, x22, LSL #1]\n" // Store output point (1, 3)
+    "mov z24.d, z13.d\n"
+    "addvl x26, x26, #1\n"
+    "fmin z23.h, p3/M, z23.h, z14.h\n"
+    "st1h { z23.h }, p0, [x25]\n" // Store output point (2, 0)
+    "mov z23.d, z13.d\n"
+    "fmin z22.h, p3/M, z22.h, z14.h\n"
+    "st1h { z22.h }, p0, [x25, x17, LSL #1]\n" // Store output point (2, 1)
+    "mov z22.d, z13.d\n"
+    "fmin z21.h, p3/M, z21.h, z14.h\n"
+    "st1h { z21.h }, p0, [x25, x23, LSL #1]\n" // Store output point (2, 2)
+    "mov z21.d, z13.d\n"
+    "fmax z20.h, p3/M, z20.h, z15.h\n"
+    "fmax z19.h, p3/M, z19.h, z15.h\n"
+    "fmax z18.h, p3/M, z18.h, z15.h\n"
+    "fmax z17.h, p3/M, z17.h, z15.h\n"
+    "fmin z20.h, p3/M, z20.h, z14.h\n"
+    "st1h { z20.h }, p0, [x25, x22, LSL #1]\n" // Store output point (2, 3)
+    "mov z20.d, z13.d\n"
+    "addvl x25, x25, #1\n"
+    "fmin z19.h, p3/M, z19.h, z14.h\n"
+    "st1h { z19.h }, p0, [x24]\n" // Store output point (3, 0)
+    "mov z19.d, z13.d\n"
+    "fmin z18.h, p3/M, z18.h, z14.h\n"
+    "st1h { z18.h }, p0, [x24, x17, LSL #1]\n" // Store output point (3, 1)
+    "mov z18.d, z13.d\n"
+    "fmin z17.h, p3/M, z17.h, z14.h\n"
+    "st1h { z17.h }, p0, [x24, x23, LSL #1]\n" // Store output point (3, 2)
+    "mov z17.d, z13.d\n"
+    "fmax z16.h, p3/M, z16.h, z15.h\n"
+    "fmin z16.h, p3/M, z16.h, z14.h\n"
+    "st1h { z16.h }, p0, [x24, x22, LSL #1]\n" // Store output point (3, 3)
+    "mov z16.d, z13.d\n"
+    "addvl x24, x24, #1\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "fmla z31.h, p3/M, z8.h, z9.h\n"
+    "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z30.h, p3/M, z7.h, z9.h\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "add x21, x2, #0x1\n"
+    "fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z27.h, p3/M, z5.h, z9.h\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "add x3, x3, #0x1\n"
+    "fmla z26.h, p3/M, z4.h, z9.h\n"
+    "cmp x3, x19\n"
+    "fmla z25.h, p3/M, z3.h, z9.h\n"
+    "fmla z23.h, p3/M, z2.h, z9.h\n"
+    "csel x3, x3, XZR, LT\n"
+    "fmla z22.h, p3/M, z1.h, z9.h\n"
+    "csel x2, x2, x21, LT\n"
+    "fmla z21.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x13, x10, LSL #1]\n" // Load input point (3, 2)
+    "cmp x2, x20\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x11]\n" // Load input point (5, 0)
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n" // Load input point (5, 5)
+    "fmla z30.h, p3/M, z8.h, z12.h\n"
+    "fmla z29.h, p3/M, z7.h, z12.h\n"
+    "fmla z26.h, p3/M, z5.h, z12.h\n"
+    "fmla z28.h, p3/M, z6.h, z12.h\n"
+    "fmla z25.h, p3/M, z4.h, z12.h\n"
+    "fmla z24.h, p3/M, z3.h, z12.h\n"
+    "fmla z22.h, p3/M, z2.h, z12.h\n"
+    "fmla z21.h, p3/M, z1.h, z12.h\n"
+    "fmla z20.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x8, x7, LSL #1]\n" // Load input point (0, 1)
+    "fmla z19.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x9, LSL #1]\n" // Load input point (3, 3)
+    "fmla z16.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x8, x28, LSL #1]\n" // Load input point (0, 4)
+    "fmla z27.h, p3/M, z8.h, z9.h\n"
+    "fmla z26.h, p3/M, z7.h, z9.h\n"
+    "fmla z25.h, p3/M, z6.h, z9.h\n"
+    "fmla z23.h, p3/M, z5.h, z9.h\n"
+    "fmla z22.h, p3/M, z4.h, z9.h\n"
+    "fmla z21.h, p3/M, z3.h, z9.h\n"
+    "fmla z19.h, p3/M, z2.h, z9.h\n"
+    "fmla z18.h, p3/M, z1.h, z9.h\n"
+    "fmla z17.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x15]\n" // Load input point (1, 0)
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "fmla z30.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x15, x27, LSL #1]\n" // Load input point (1, 5)
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12]\n" // Load input point (4, 0)
+    "fmla z26.h, p3/M, z8.h, z10.h\n"
+    "fmla z25.h, p3/M, z7.h, z10.h\n"
+    "fmla z24.h, p3/M, z6.h, z10.h\n"
+    "fmla z22.h, p3/M, z5.h, z10.h\n"
+    "fmla z21.h, p3/M, z4.h, z10.h\n"
+    "fmla z20.h, p3/M, z3.h, z10.h\n"
+    "fmla z18.h, p3/M, z2.h, z10.h\n"
+    "fmla z17.h, p3/M, z1.h, z10.h\n"
+    "fmla z16.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x15, x10, LSL #1]\n" // Load input point (1, 2)
+    "fmla z31.h, p3/M, z3.h, z9.h\n"
+    "fmla z27.h, p3/M, z0.h, z9.h\n"
+    "fmla z28.h, p3/M, z5.h, z12.h\n"
+    "fmla z24.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x15, x9, LSL #1]\n" // Load input point (1, 3)
+    "fmla z23.h, p3/M, z6.h, z11.h\n"
+    "fmla z19.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n" // Load input point (4, 5)
+    "fmla z31.h, p3/M, z5.h, z10.h\n"
+    "fmla z30.h, p3/M, z4.h, z10.h\n"
+    "fmla z29.h, p3/M, z3.h, z10.h\n"
+    "fmla z27.h, p3/M, z2.h, z10.h\n"
+    "fmla z26.h, p3/M, z1.h, z10.h\n"
+    "fmla z25.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n" // Load input point (2, 1)
+    "fmla z20.h, p3/M, z8.h, z11.h\n"
+    "fmla z16.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n" // Load input point (5, 1)
+    "fmla z30.h, p3/M, z5.h, z12.h\n"
+    "fmla z29.h, p3/M, z4.h, z12.h\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "fmla z26.h, p3/M, z2.h, z12.h\n"
+    "fmla z25.h, p3/M, z1.h, z12.h\n"
+    "fmla z24.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x14, x28, LSL #1]\n" // Load input point (2, 4)
+    "fmla z19.h, p3/M, z7.h, z11.h\n"
+    "fmla z18.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x28, LSL #1]\n" // Load input point (5, 4)
+    "fmla z31.h, p3/M, z7.h, z10.h\n"
+    "fmla z30.h, p3/M, z6.h, z10.h\n"
+    "fmla z27.h, p3/M, z4.h, z10.h\n"
+    "fmla z26.h, p3/M, z3.h, z10.h\n"
+    "fmla z23.h, p3/M, z1.h, z10.h\n"
+    "fmla z22.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x8, x10, LSL #1]\n" // Load input point (0, 2)
+    "fmla z17.h, p3/M, z8.h, z11.h\n"
+    "fmla z16.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x7, LSL #1]\n" // Load input point (3, 1)
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "fmla z28.h, p3/M, z7.h, z12.h\n"
+    "fmla z25.h, p3/M, z5.h, z12.h\n"
+    "fmla z24.h, p3/M, z4.h, z12.h\n"
+    "fmla z21.h, p3/M, z2.h, z12.h\n"
+    "fmla z20.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x8, x9, LSL #1]\n" // Load input point (0, 3)
+    "fmla z31.h, p3/M, z2.h, z10.h\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z29.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x14]\n" // Load input point (2, 0)
+    "fmla z27.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z6.h, z11.h\n"
+    "fmla z23.h, p3/M, z4.h, z11.h\n"
+    "fmla z22.h, p3/M, z3.h, z11.h\n"
+    "fmla z19.h, p3/M, z1.h, z11.h\n"
+    "fmla z18.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x28, LSL #1]\n" // Load input point (3, 4)
+    "fmla z30.h, p3/M, z2.h, z12.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "fmla z28.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n" // Load input point (2, 5)
+    "fmla z31.h, p3/M, z6.h, z10.h\n"
+    "fmla z27.h, p3/M, z3.h, z10.h\n"
+    "fmla z23.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13]\n" // Load input point (3, 0)
+    "fmla z25.h, p3/M, z8.h, z11.h\n"
+    "fmla z24.h, p3/M, z7.h, z11.h\n"
+    "fmla z21.h, p3/M, z5.h, z11.h\n"
+    "fmla z20.h, p3/M, z4.h, z11.h\n"
+    "fmla z17.h, p3/M, z2.h, z11.h\n"
+    "fmla z16.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x10, LSL #1]\n" // Load input point (4, 2)
+    "fmla z28.h, p3/M, z8.h, z12.h\n"
+    "fmla z24.h, p3/M, z5.h, z12.h\n"
+    "fmla z20.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n" // Load input point (3, 5)
+    "fmla z27.h, p3/M, z6.h, z10.h\n"
+    "fmla z23.h, p3/M, z3.h, z10.h\n"
+    "fmla z19.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x11, x10, LSL #1]\n" // Load input point (5, 2)
+    "fmla z22.h, p3/M, z7.h, z11.h\n"
+    "fmla z21.h, p3/M, z6.h, z11.h\n"
+    "fmla z23.h, p3/M, z8.h, z11.h\n"
+    "fmla z19.h, p3/M, z5.h, z11.h\n"
+    "fmla z18.h, p3/M, z4.h, z11.h\n"
+    "fmla z17.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x9, LSL #1]\n" // Load input point (4, 3)
+    "fmla z24.h, p3/M, z8.h, z12.h\n"
+    "fmla z20.h, p3/M, z5.h, z12.h\n"
+    "fmla z16.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x9, LSL #1]\n" // Load input point (5, 3)
+    "fmla z19.h, p3/M, z8.h, z10.h\n"
+    "fmla z18.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x15, x7, LSL #1]\n" // Load input point (1, 1)
+    "fmla z22.h, p3/M, z8.h, z11.h\n"
+    "fmla z21.h, p3/M, z7.h, z11.h\n"
+    "fmla z20.h, p3/M, z6.h, z11.h\n"
+    "fmla z18.h, p3/M, z5.h, z11.h\n"
+    "fmla z17.h, p3/M, z4.h, z11.h\n"
+    "fmla z16.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x15, x28, LSL #1]\n" // Load input point (1, 4)
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "fmla z18.h, p3/M, z8.h, z12.h\n"
+    "fmla z17.h, p3/M, z7.h, z12.h\n"
+    "fmla z16.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x12, x7, LSL #1]\n" // Load input point (4, 1)
+    "fmla z30.h, p3/M, z3.h, z10.h\n"
+    "fmla z27.h, p3/M, z1.h, z10.h\n"
+    "fmla z26.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x28, LSL #1]\n" // Load input point (4, 4)
+    "fmla z29.h, p3/M, z5.h, z11.h\n"
+    "fmla z28.h, p3/M, z4.h, z11.h\n"
+    "fmla z25.h, p3/M, z2.h, z11.h\n"
+    "fmla z24.h, p3/M, z1.h, z11.h\n"
+    "fmla z23.h, p3/M, z7.h, z12.h\n"
+    "fmla z22.h, p3/M, z6.h, z12.h\n"
+    "fmla z19.h, p3/M, z4.h, z12.h\n"
+    "fmla z18.h, p3/M, z3.h, z12.h\n"
+    "fmla z21.h, p3/M, z8.h, z10.h\n"
+    "fmla z20.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z5.h, z10.h\n"
+    "fmla z16.h, p3/M, z4.h, z10.h\n"
+    "fmax z31.h, p3/M, z31.h, z15.h\n"
+    "fmax z30.h, p3/M, z30.h, z15.h\n"
+    "fmax z29.h, p3/M, z29.h, z15.h\n"
+    "fmax z28.h, p3/M, z28.h, z15.h\n"
+    "fmin z31.h, p3/M, z31.h, z14.h\n"
+    "st1h { z31.h }, p0, [x16]\n" // Store output point (0, 0)
+    "fmin z30.h, p3/M, z30.h, z14.h\n"
+    "fmin z29.h, p3/M, z29.h, z14.h\n"
+    "st1h { z30.h }, p0, [x16, x17, LSL #1]\n" // Store output point (0, 1)
+    "fmin z28.h, p3/M, z28.h, z14.h\n"
+    "fmax z27.h, p3/M, z27.h, z15.h\n"
+    "st1h { z29.h }, p0, [x16, x23, LSL #1]\n" // Store output point (0, 2)
+    "fmax z26.h, p3/M, z26.h, z15.h\n"
+    "st1h { z28.h }, p0, [x16, x22, LSL #1]\n" // Store output point (0, 3)
+    "fmin z27.h, p3/M, z27.h, z14.h\n"
+    "fmax z25.h, p3/M, z25.h, z15.h\n"
+    "st1h { z27.h }, p0, [x26]\n" // Store output point (1, 0)
+    "fmin z26.h, p3/M, z26.h, z14.h\n"
+    "fmin z25.h, p3/M, z25.h, z14.h\n"
+    "st1h { z26.h }, p0, [x26, x17, LSL #1]\n" // Store output point (1, 1)
+    "fmax z24.h, p3/M, z24.h, z15.h\n"
+    "fmax z23.h, p3/M, z23.h, z15.h\n"
+    "st1h { z25.h }, p0, [x26, x23, LSL #1]\n" // Store output point (1, 2)
+    "fmax z22.h, p3/M, z22.h, z15.h\n"
+    "fmax z21.h, p3/M, z21.h, z15.h\n"
+    "fmax z20.h, p3/M, z20.h, z15.h\n"
+    "fmin z24.h, p3/M, z24.h, z14.h\n"
+    "st1h { z24.h }, p0, [x26, x22, LSL #1]\n" // Store output point (1, 3)
+    "fmin z23.h, p3/M, z23.h, z14.h\n"
+    "fmin z22.h, p3/M, z22.h, z14.h\n"
+    "st1h { z23.h }, p0, [x25]\n" // Store output point (2, 0)
+    "fmin z21.h, p3/M, z21.h, z14.h\n"
+    "fmin z20.h, p3/M, z20.h, z14.h\n"
+    "st1h { z22.h }, p0, [x25, x17, LSL #1]\n" // Store output point (2, 1)
+    "fmax z19.h, p3/M, z19.h, z15.h\n"
+    "st1h { z21.h }, p0, [x25, x23, LSL #1]\n" // Store output point (2, 2)
+    "fmax z18.h, p3/M, z18.h, z15.h\n"
+    "fmax z17.h, p3/M, z17.h, z15.h\n"
+    "st1h { z20.h }, p0, [x25, x22, LSL #1]\n" // Store output point (2, 3)
+    "fmin z19.h, p3/M, z19.h, z14.h\n"
+    "st1h { z19.h }, p0, [x24]\n" // Store output point (3, 0)
+    "fmin z18.h, p3/M, z18.h, z14.h\n"
+    "fmin z17.h, p3/M, z17.h, z14.h\n"
+    "st1h { z18.h }, p0, [x24, x17, LSL #1]\n" // Store output point (3, 1)
+    "fmax z16.h, p3/M, z16.h, z15.h\n"
+    "st1h { z17.h }, p0, [x24, x23, LSL #1]\n" // Store output point (3, 2)
+    "fmin z16.h, p3/M, z16.h, z14.h\n"
+    "st1h { z16.h }, p0, [x24, x22, LSL #1]\n" // Store output point (3, 3)
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
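For reference, the address computation in the "Tile loop" prologue above (see
the "// offset" comments in the assembly) corresponds to the following C++.
This is an editor's sketch for readability, not code from the patch; the names
mirror the Args fields:

    #include <cstdint>

    // Strides are in elements; the assembly scales by sizeof(__fp16) via LSL #1.
    // The factor 4 is kernel_stride * output tile size (1 * 4) on the input
    // side, and the output tile size (4) on the output side.
    static const __fp16 *tile_input_ptr(const __fp16 *inptr, uint64_t tile_i, uint64_t tile_j,
                                        int64_t ld_input_row, int64_t ld_input_col)
    {
      return inptr + (tile_i * ld_input_row + tile_j * ld_input_col) * 4;
    }

    static __fp16 *tile_output_ptr(__fp16 *outptr, uint64_t tile_i, uint64_t tile_j,
                                   int64_t ld_output_row, int64_t ld_output_col)
    {
      return outptr + (tile_i * ld_output_row + tile_j * ld_output_col) * 4;
    }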
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..8148353
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,746 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[36];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[14];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[5];
+      inptrs[3] = input_ptrs[15];
+      inptrs[4] = input_ptrs[30];
+      inptrs[5] = input_ptrs[35];
+      inptrs[6] = input_ptrs[20];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[4];
+      inptrs[9] = input_ptrs[21];
+      inptrs[10] = input_ptrs[6];
+      inptrs[11] = input_ptrs[11];
+      inptrs[12] = input_ptrs[24];
+      inptrs[13] = input_ptrs[8];
+      inptrs[14] = input_ptrs[29];
+      inptrs[15] = input_ptrs[9];
+      inptrs[16] = input_ptrs[31];
+      inptrs[17] = input_ptrs[13];
+      inptrs[18] = input_ptrs[34];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[19];
+      inptrs[22] = input_ptrs[3];
+      inptrs[23] = input_ptrs[12];
+      inptrs[24] = input_ptrs[22];
+      inptrs[25] = input_ptrs[17];
+      inptrs[26] = input_ptrs[18];
+      inptrs[27] = input_ptrs[26];
+      inptrs[28] = input_ptrs[23];
+      inptrs[29] = input_ptrs[32];
+      inptrs[30] = input_ptrs[27];
+      inptrs[31] = input_ptrs[33];
+      inptrs[32] = input_ptrs[7];
+      inptrs[33] = input_ptrs[10];
+      inptrs[34] = input_ptrs[25];
+      inptrs[35] = input_ptrs[28];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
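+  // The Args constructor above gathers the 36 input pointers (one per element
+  // of the 6x6 input patch read by this 4x4-output, 3x3, stride-1 kernel) and
+  // reorders them into the sequence in which the assembly below consumes them.
+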
+  __asm__ __volatile__(
+    "ldr x2, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ptrue p3.b\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x4, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "mov x5, #0x0\n"
+    "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "cnth x6\n"
+    "ld1h { z13.h }, p3/Z, [x3]\n" // Load from weights and bias
+    "mov z31.d, z13.d\n"
+    "ld1h { z0.h }, p3/Z, [x3, #1, MUL VL]\n" // Load from weights and bias
+    "sub x7, XZR, x6\n"
+    "mov z30.d, z13.d\n"
+    "ld1h { z1.h }, p3/Z, [x3, #2, MUL VL]\n" // Load from weights and bias
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "mov z29.d, z13.d\n"
+    "ld1h { z2.h }, p3/Z, [x3, #3, MUL VL]\n" // Load from weights and bias
+    "cmp x6, %x[n_channels]\n"
+    "mov z28.d, z13.d\n"
+    "ld1h { z3.h }, p3/Z, [x3, #4, MUL VL]\n" // Load from weights and bias
+    "mov z27.d, z13.d\n"
+    "ld1h { z4.h }, p3/Z, [x3, #5, MUL VL]\n" // Load from weights and bias
+    "mov z26.d, z13.d\n"
+    "ld1h { z5.h }, p3/Z, [x3, #6, MUL VL]\n" // Load from weights and bias
+    "mov z25.d, z13.d\n"
+    "ld1h { z6.h }, p3/Z, [x3, #7, MUL VL]\n" // Load from weights and bias
+    "addvl x3, x3, #16\n"
+    "mov z24.d, z13.d\n"
+    "ld1h { z7.h }, p3/Z, [x3, #-8, MUL VL]\n" // Load from weights and bias
+    "mov z23.d, z13.d\n"
+    "ld1h { z8.h }, p3/Z, [x3, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x3, x3, #-6\n"
+    "mov z22.d, z13.d\n"
+    "ldp x8, x17, [x4, #0x0]\n"
+    "mov z21.d, z13.d\n"
+    "ldp x16, x15, [x4, #0x10]\n"
+    "mov z20.d, z13.d\n"
+    "ld1h { z9.h }, p2/Z, [x8, x5, LSL #1]\n"
+    "mov z19.d, z13.d\n"
+    "mov z18.d, z13.d\n"
+    "ld1h { z10.h }, p2/Z, [x17, x5, LSL #1]\n"
+    "mov z17.d, z13.d\n"
+    "ld1h { z11.h }, p2/Z, [x16, x5, LSL #1]\n"
+    "mov z16.d, z13.d\n"
+    "ld1h { z12.h }, p2/Z, [x15, x5, LSL #1]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "fmla z31.h, p3/M, z8.h, z9.h\n"
+    "ldr x14, [x4, #0x20]\n"
+    "whilelt p1.h, x6, %x[n_channels]\n"
+    "fmla z30.h, p3/M, z7.h, z9.h\n"
+    "ldr x13, [x4, #0x28]\n"
+    "inch x7\n"
+    "fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ldr x12, [x4, #0x30]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z27.h, p3/M, z5.h, z9.h\n"
+    "ldr x11, [x4, #0x38]\n"
+    "fmla z26.h, p3/M, z4.h, z9.h\n"
+    "ldr x10, [x4, #0x40]\n"
+    "fmla z25.h, p3/M, z3.h, z9.h\n"
+    "ldr x9, [x4, #0x48]\n"
+    "fmla z23.h, p3/M, z2.h, z9.h\n"
+    "ldr x28, [x4, #0x50]\n"
+    "fmla z22.h, p3/M, z1.h, z9.h\n"
+    "ldr x27, [x4, #0x58]\n"
+    "fmla z21.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x12, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x5, LSL #1]\n"
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x5, LSL #1]\n"
+    "fmla z30.h, p3/M, z8.h, z12.h\n"
+    "ldr x26, [x4, #0x60]\n"
+    "fmla z29.h, p3/M, z7.h, z12.h\n"
+    "ldr x25, [x4, #0x68]\n"
+    "fmla z26.h, p3/M, z5.h, z12.h\n"
+    "ldr x24, [x4, #0x70]\n"
+    "fmla z28.h, p3/M, z6.h, z12.h\n"
+    "ldr x23, [x4, #0x78]\n"
+    "fmla z25.h, p3/M, z4.h, z12.h\n"
+    "ldr x8, [x4, #0x80]\n"
+    "fmla z24.h, p3/M, z3.h, z12.h\n"
+    "ldr x17, [x4, #0x88]\n"
+    "fmla z22.h, p3/M, z2.h, z12.h\n"
+    "ldr x16, [x4, #0x90]\n"
+    "fmla z21.h, p3/M, z1.h, z12.h\n"
+    "ldr x15, [x4, #0x98]\n"
+    "fmla z20.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+    "fmla z19.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x9, x5, LSL #1]\n"
+    "fmla z16.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x10, x5, LSL #1]\n"
+    "fmla z27.h, p3/M, z8.h, z9.h\n"
+    "ldr x14, [x4, #0xa0]\n"
+    "fmla z26.h, p3/M, z7.h, z9.h\n"
+    "ldr x13, [x4, #0xa8]\n"
+    "fmla z25.h, p3/M, z6.h, z9.h\n"
+    "ldr x12, [x4, #0xb0]\n"
+    "fmla z23.h, p3/M, z5.h, z9.h\n"
+    "ldr x11, [x4, #0xb8]\n"
+    "fmla z22.h, p3/M, z4.h, z9.h\n"
+    "ldr x10, [x4, #0xc0]\n"
+    "fmla z21.h, p3/M, z3.h, z9.h\n"
+    "ldr x9, [x4, #0xc8]\n"
+    "fmla z19.h, p3/M, z2.h, z9.h\n"
+    "ldr x22, [x2, #0x0]\n"
+    "fmla z18.h, p3/M, z1.h, z9.h\n"
+    "ldr x21, [x2, #0x8]\n"
+    "fmla z17.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x28, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "ldr x28, [x4, #0xd0]\n"
+    "fmla z30.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "ldr x27, [x4, #0xd8]\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x26, x5, LSL #1]\n"
+    "fmla z26.h, p3/M, z8.h, z10.h\n"
+    "ldr x26, [x4, #0xe0]\n"
+    "fmla z25.h, p3/M, z7.h, z10.h\n"
+    "ldr x20, [x2, #0x10]\n"
+    "fmla z24.h, p3/M, z6.h, z10.h\n"
+    "ldr x19, [x2, #0x18]\n"
+    "fmla z22.h, p3/M, z5.h, z10.h\n"
+    "ld1h { z13.h }, p3/Z, [x3]\n" // Load from weights and bias
+    "fmla z21.h, p3/M, z4.h, z10.h\n"
+    "fmla z20.h, p3/M, z3.h, z10.h\n"
+    "fmla z18.h, p3/M, z2.h, z10.h\n"
+    "fmla z17.h, p3/M, z1.h, z10.h\n"
+    "fmla z16.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z3.h, z9.h\n"
+    "ldr x25, [x4, #0xe8]\n"
+    "fmla z27.h, p3/M, z0.h, z9.h\n"
+    "fmla z28.h, p3/M, z5.h, z12.h\n"
+    "fmla z24.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x23, x5, LSL #1]\n"
+    "fmla z23.h, p3/M, z6.h, z11.h\n"
+    "ldr x23, [x4, #0xf8]\n"
+    "fmla z19.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z5.h, z10.h\n"
+    "ldr x24, [x4, #0xf0]\n"
+    "fmla z30.h, p3/M, z4.h, z10.h\n"
+    "fmla z29.h, p3/M, z3.h, z10.h\n"
+    "fmla z27.h, p3/M, z2.h, z10.h\n"
+    "fmla z26.h, p3/M, z1.h, z10.h\n"
+    "fmla z25.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x17, x5, LSL #1]\n"
+    "fmla z20.h, p3/M, z8.h, z11.h\n"
+    "ldr x17, [x4, #0x108]\n"
+    "fmla z16.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x8, x5, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z12.h\n"
+    "ldr x8, [x4, #0x100]\n"
+    "fmla z29.h, p3/M, z4.h, z12.h\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "fmla z26.h, p3/M, z2.h, z12.h\n"
+    "fmla z25.h, p3/M, z1.h, z12.h\n"
+    "fmla z24.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x15, x5, LSL #1]\n"
+    "fmla z19.h, p3/M, z7.h, z11.h\n"
+    "ldr x15, [x4, #0x118]\n"
+    "fmla z18.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x16, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z7.h, z10.h\n"
+    "ldr x16, [x4, #0x110]\n"
+    "fmla z30.h, p3/M, z6.h, z10.h\n"
+    "fmla z27.h, p3/M, z4.h, z10.h\n"
+    "fmla z26.h, p3/M, z3.h, z10.h\n"
+    "fmla z23.h, p3/M, z1.h, z10.h\n"
+    "fmla z22.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x5, LSL #1]\n"
+    "fmla z17.h, p3/M, z8.h, z11.h\n"
+    "fmla z16.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "fmla z28.h, p3/M, z7.h, z12.h\n"
+    "fmla z25.h, p3/M, z5.h, z12.h\n"
+    "fmla z24.h, p3/M, z4.h, z12.h\n"
+    "fmla z21.h, p3/M, z2.h, z12.h\n"
+    "fmla z20.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x12, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z2.h, z10.h\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z29.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x11, x5, LSL #1]\n"
+    "fmla z27.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z6.h, z11.h\n"
+    "fmla z23.h, p3/M, z4.h, z11.h\n"
+    "fmla z22.h, p3/M, z3.h, z11.h\n"
+    "fmla z19.h, p3/M, z1.h, z11.h\n"
+    "fmla z18.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x10, x5, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z12.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "fmla z28.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x9, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z6.h, z10.h\n"
+    "fmla z27.h, p3/M, z3.h, z10.h\n"
+    "fmla z23.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x28, x5, LSL #1]\n"
+    "fmla z25.h, p3/M, z8.h, z11.h\n"
+    "fmla z24.h, p3/M, z7.h, z11.h\n"
+    "fmla z21.h, p3/M, z5.h, z11.h\n"
+    "fmla z20.h, p3/M, z4.h, z11.h\n"
+    "fmla z17.h, p3/M, z2.h, z11.h\n"
+    "fmla z16.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x5, LSL #1]\n"
+    "fmla z28.h, p3/M, z8.h, z12.h\n"
+    "fmla z24.h, p3/M, z5.h, z12.h\n"
+    "fmla z20.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x26, x5, LSL #1]\n"
+    "fmla z27.h, p3/M, z6.h, z10.h\n"
+    "fmla z23.h, p3/M, z3.h, z10.h\n"
+    "fmla z19.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x5, LSL #1]\n"
+    "fmla z22.h, p3/M, z7.h, z11.h\n"
+    "fmla z21.h, p3/M, z6.h, z11.h\n"
+    "fmla z23.h, p3/M, z8.h, z11.h\n"
+    "fmla z19.h, p3/M, z5.h, z11.h\n"
+    "fmla z18.h, p3/M, z4.h, z11.h\n"
+    "fmla z17.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
+    "fmla z24.h, p3/M, z8.h, z12.h\n"
+    "fmla z20.h, p3/M, z5.h, z12.h\n"
+    "fmla z16.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x23, x5, LSL #1]\n"
+    "fmla z19.h, p3/M, z8.h, z10.h\n"
+    "fmla z18.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x8, x5, LSL #1]\n"
+    "fmla z22.h, p3/M, z8.h, z11.h\n"
+    "fmla z21.h, p3/M, z7.h, z11.h\n"
+    "fmla z20.h, p3/M, z6.h, z11.h\n"
+    "fmla z18.h, p3/M, z5.h, z11.h\n"
+    "fmla z17.h, p3/M, z4.h, z11.h\n"
+    "fmla z16.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x17, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "ldp x8, x17, [x4, #0x0]\n"
+    "fmla z18.h, p3/M, z8.h, z12.h\n"
+    "ld1h { z9.h }, p1/Z, [x8, x6, LSL #1]\n"
+    "fmla z17.h, p3/M, z7.h, z12.h\n"
+    "fmla z16.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x16, x5, LSL #1]\n"
+    "fmla z30.h, p3/M, z3.h, z10.h\n"
+    "fmla z27.h, p3/M, z1.h, z10.h\n"
+    "fmla z26.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x15, x5, LSL #1]\n"
+    "inch x5\n"
+    "fmla z29.h, p3/M, z5.h, z11.h\n"
+    "ldp x16, x15, [x4, #0x10]\n"
+    "whilelt p2.h, x5, %x[n_channels]\n"
+    "fmla z28.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x3, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z25.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z2.h }, p3/Z, [x3, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z24.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x16, x6, LSL #1]\n"
+    "fmla z23.h, p3/M, z7.h, z12.h\n"
+    "ld1h { z1.h }, p3/Z, [x3, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z22.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z6.h }, p3/Z, [x3, #7, MUL VL]\n" // Load from weights and bias
+    "fmla z19.h, p3/M, z4.h, z12.h\n"
+    "fmla z18.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x15, x6, LSL #1]\n"
+    "fmla z21.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z3.h }, p3/Z, [x3, #4, MUL VL]\n" // Load from weights and bias
+    "fmla z20.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z5.h, z10.h\n"
+    "ld1h { z5.h }, p3/Z, [x3, #6, MUL VL]\n" // Load from weights and bias
+    "fmla z16.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p1/Z, [x17, x6, LSL #1]\n"
+    "inch x6\n"
+    "fmax z31.h, p3/M, z31.h, z15.h\n"
+    "ld1h { z4.h }, p3/Z, [x3, #5, MUL VL]\n" // Load from weights and bias
+    "addvl x3, x3, #16\n"
+    "fmax z30.h, p3/M, z30.h, z15.h\n"
+    "ld1h { z7.h }, p3/Z, [x3, #-8, MUL VL]\n" // Load from weights and bias
+    "cmp x6, %x[n_channels]\n"
+    "fmax z29.h, p3/M, z29.h, z15.h\n"
+    "ld1h { z8.h }, p3/Z, [x3, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x3, x3, #-6\n"
+    "fmax z28.h, p3/M, z28.h, z15.h\n"
+    "fmax z27.h, p3/M, z27.h, z15.h\n"
+    "fmin z31.h, p3/M, z31.h, z14.h\n"
+    "st1h { z31.h }, p0, [x22, x7, LSL #1]\n"
+    "mov z31.d, z13.d\n"
+    "fmin z30.h, p3/M, z30.h, z14.h\n"
+    "ldr x22, [x2, #0x20]\n"
+    "fmin z29.h, p3/M, z29.h, z14.h\n"
+    "st1h { z30.h }, p0, [x21, x7, LSL #1]\n"
+    "mov z30.d, z13.d\n"
+    "fmin z28.h, p3/M, z28.h, z14.h\n"
+    "st1h { z29.h }, p0, [x20, x7, LSL #1]\n"
+    "mov z29.d, z13.d\n"
+    "ldr x21, [x2, #0x28]\n"
+    "fmin z27.h, p3/M, z27.h, z14.h\n"
+    "ldr x20, [x2, #0x30]\n"
+    "fmax z26.h, p3/M, z26.h, z15.h\n"
+    "st1h { z28.h }, p0, [x19, x7, LSL #1]\n"
+    "mov z28.d, z13.d\n"
+    "ldr x19, [x2, #0x38]\n"
+    "fmax z25.h, p3/M, z25.h, z15.h\n"
+    "st1h { z27.h }, p0, [x22, x7, LSL #1]\n"
+    "mov z27.d, z13.d\n"
+    "ldr x22, [x2, #0x40]\n"
+    "fmin z26.h, p3/M, z26.h, z14.h\n"
+    "st1h { z26.h }, p0, [x21, x7, LSL #1]\n"
+    "mov z26.d, z13.d\n"
+    "fmin z25.h, p3/M, z25.h, z14.h\n"
+    "ldr x21, [x2, #0x48]\n"
+    "fmax z24.h, p3/M, z24.h, z15.h\n"
+    "st1h { z25.h }, p0, [x20, x7, LSL #1]\n"
+    "mov z25.d, z13.d\n"
+    "fmax z23.h, p3/M, z23.h, z15.h\n"
+    "ldr x20, [x2, #0x50]\n"
+    "fmin z24.h, p3/M, z24.h, z14.h\n"
+    "st1h { z24.h }, p0, [x19, x7, LSL #1]\n"
+    "mov z24.d, z13.d\n"
+    "fmin z23.h, p3/M, z23.h, z14.h\n"
+    "ldr x19, [x2, #0x58]\n"
+    "fmax z22.h, p3/M, z22.h, z15.h\n"
+    "st1h { z23.h }, p0, [x22, x7, LSL #1]\n"
+    "mov z23.d, z13.d\n"
+    "fmax z21.h, p3/M, z21.h, z15.h\n"
+    "ldr x22, [x2, #0x60]\n"
+    "fmin z22.h, p3/M, z22.h, z14.h\n"
+    "st1h { z22.h }, p0, [x21, x7, LSL #1]\n"
+    "mov z22.d, z13.d\n"
+    "fmin z21.h, p3/M, z21.h, z14.h\n"
+    "ldr x21, [x2, #0x68]\n"
+    "fmax z20.h, p3/M, z20.h, z15.h\n"
+    "st1h { z21.h }, p0, [x20, x7, LSL #1]\n"
+    "mov z21.d, z13.d\n"
+    "fmax z19.h, p3/M, z19.h, z15.h\n"
+    "ldr x20, [x2, #0x70]\n"
+    "fmin z20.h, p3/M, z20.h, z14.h\n"
+    "st1h { z20.h }, p0, [x19, x7, LSL #1]\n"
+    "mov z20.d, z13.d\n"
+    "fmin z19.h, p3/M, z19.h, z14.h\n"
+    "ldr x19, [x2, #0x78]\n"
+    "fmax z18.h, p3/M, z18.h, z15.h\n"
+    "st1h { z19.h }, p0, [x22, x7, LSL #1]\n"
+    "mov z19.d, z13.d\n"
+    "fmax z17.h, p3/M, z17.h, z15.h\n"
+    "fmin z18.h, p3/M, z18.h, z14.h\n"
+    "st1h { z18.h }, p0, [x21, x7, LSL #1]\n"
+    "mov z18.d, z13.d\n"
+    "fmin z17.h, p3/M, z17.h, z14.h\n"
+    "st1h { z17.h }, p0, [x20, x7, LSL #1]\n"
+    "mov z17.d, z13.d\n"
+    "fmax z16.h, p3/M, z16.h, z15.h\n"
+    "fmin z16.h, p3/M, z16.h, z14.h\n"
+    "st1h { z16.h }, p0, [x19, x7, LSL #1]\n"
+    "mov z16.d, z13.d\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "fmla z31.h, p3/M, z8.h, z9.h\n"
+    "ldr x14, [x4, #0x20]\n"
+    "inch x7\n"
+    "fmla z30.h, p3/M, z7.h, z9.h\n"
+    "ldr x13, [x4, #0x28]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ldr x12, [x4, #0x30]\n"
+    "fmla z27.h, p3/M, z5.h, z9.h\n"
+    "ldr x11, [x4, #0x38]\n"
+    "fmla z26.h, p3/M, z4.h, z9.h\n"
+    "ldr x10, [x4, #0x40]\n"
+    "fmla z25.h, p3/M, z3.h, z9.h\n"
+    "ldr x9, [x4, #0x48]\n"
+    "fmla z23.h, p3/M, z2.h, z9.h\n"
+    "ldr x28, [x4, #0x50]\n"
+    "fmla z22.h, p3/M, z1.h, z9.h\n"
+    "ldr x27, [x4, #0x58]\n"
+    "fmla z21.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x12, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x5, LSL #1]\n"
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x5, LSL #1]\n"
+    "fmla z30.h, p3/M, z8.h, z12.h\n"
+    "ldr x26, [x4, #0x60]\n"
+    "fmla z29.h, p3/M, z7.h, z12.h\n"
+    "ldr x25, [x4, #0x68]\n"
+    "fmla z26.h, p3/M, z5.h, z12.h\n"
+    "ldr x24, [x4, #0x70]\n"
+    "fmla z28.h, p3/M, z6.h, z12.h\n"
+    "ldr x23, [x4, #0x78]\n"
+    "fmla z25.h, p3/M, z4.h, z12.h\n"
+    "ldr x8, [x4, #0x80]\n"
+    "fmla z24.h, p3/M, z3.h, z12.h\n"
+    "ldr x17, [x4, #0x88]\n"
+    "fmla z22.h, p3/M, z2.h, z12.h\n"
+    "ldr x16, [x4, #0x90]\n"
+    "fmla z21.h, p3/M, z1.h, z12.h\n"
+    "ldr x15, [x4, #0x98]\n"
+    "fmla z20.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+    "fmla z19.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x9, x5, LSL #1]\n"
+    "fmla z16.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x10, x5, LSL #1]\n"
+    "fmla z27.h, p3/M, z8.h, z9.h\n"
+    "ldr x14, [x4, #0xa0]\n"
+    "fmla z26.h, p3/M, z7.h, z9.h\n"
+    "ldr x13, [x4, #0xa8]\n"
+    "fmla z25.h, p3/M, z6.h, z9.h\n"
+    "ldr x12, [x4, #0xb0]\n"
+    "fmla z23.h, p3/M, z5.h, z9.h\n"
+    "ldr x11, [x4, #0xb8]\n"
+    "fmla z22.h, p3/M, z4.h, z9.h\n"
+    "ldr x10, [x4, #0xc0]\n"
+    "fmla z21.h, p3/M, z3.h, z9.h\n"
+    "ldr x9, [x4, #0xc8]\n"
+    "fmla z19.h, p3/M, z2.h, z9.h\n"
+    "ldr x22, [x2, #0x0]\n"
+    "fmla z18.h, p3/M, z1.h, z9.h\n"
+    "ldr x21, [x2, #0x8]\n"
+    "fmla z17.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x28, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "ldr x28, [x4, #0xd0]\n"
+    "fmla z30.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "ldr x27, [x4, #0xd8]\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x26, x5, LSL #1]\n"
+    "fmla z26.h, p3/M, z8.h, z10.h\n"
+    "ldr x26, [x4, #0xe0]\n"
+    "fmla z25.h, p3/M, z7.h, z10.h\n"
+    "ldr x20, [x2, #0x10]\n"
+    "fmla z24.h, p3/M, z6.h, z10.h\n"
+    "ldr x19, [x2, #0x18]\n"
+    "fmla z22.h, p3/M, z5.h, z10.h\n"
+    "fmla z21.h, p3/M, z4.h, z10.h\n"
+    "fmla z20.h, p3/M, z3.h, z10.h\n"
+    "fmla z18.h, p3/M, z2.h, z10.h\n"
+    "fmla z17.h, p3/M, z1.h, z10.h\n"
+    "fmla z16.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z3.h, z9.h\n"
+    "ldr x25, [x4, #0xe8]\n"
+    "fmla z27.h, p3/M, z0.h, z9.h\n"
+    "fmla z28.h, p3/M, z5.h, z12.h\n"
+    "fmla z24.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x23, x5, LSL #1]\n"
+    "fmla z23.h, p3/M, z6.h, z11.h\n"
+    "ldr x23, [x4, #0xf8]\n"
+    "fmla z19.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z5.h, z10.h\n"
+    "ldr x24, [x4, #0xf0]\n"
+    "fmla z30.h, p3/M, z4.h, z10.h\n"
+    "fmla z29.h, p3/M, z3.h, z10.h\n"
+    "fmla z27.h, p3/M, z2.h, z10.h\n"
+    "fmla z26.h, p3/M, z1.h, z10.h\n"
+    "fmla z25.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x17, x5, LSL #1]\n"
+    "fmla z20.h, p3/M, z8.h, z11.h\n"
+    "ldr x17, [x4, #0x108]\n"
+    "fmla z16.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x8, x5, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z12.h\n"
+    "ldr x8, [x4, #0x100]\n"
+    "fmla z29.h, p3/M, z4.h, z12.h\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "fmla z26.h, p3/M, z2.h, z12.h\n"
+    "fmla z25.h, p3/M, z1.h, z12.h\n"
+    "fmla z24.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x15, x5, LSL #1]\n"
+    "fmla z19.h, p3/M, z7.h, z11.h\n"
+    "ldr x15, [x4, #0x118]\n"
+    "fmla z18.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x16, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z7.h, z10.h\n"
+    "ldr x16, [x4, #0x110]\n"
+    "fmla z30.h, p3/M, z6.h, z10.h\n"
+    "fmla z27.h, p3/M, z4.h, z10.h\n"
+    "fmla z26.h, p3/M, z3.h, z10.h\n"
+    "fmla z23.h, p3/M, z1.h, z10.h\n"
+    "fmla z22.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x5, LSL #1]\n"
+    "fmla z17.h, p3/M, z8.h, z11.h\n"
+    "fmla z16.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "fmla z28.h, p3/M, z7.h, z12.h\n"
+    "fmla z25.h, p3/M, z5.h, z12.h\n"
+    "fmla z24.h, p3/M, z4.h, z12.h\n"
+    "fmla z21.h, p3/M, z2.h, z12.h\n"
+    "fmla z20.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x12, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z2.h, z10.h\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z29.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x11, x5, LSL #1]\n"
+    "fmla z27.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z6.h, z11.h\n"
+    "fmla z23.h, p3/M, z4.h, z11.h\n"
+    "fmla z22.h, p3/M, z3.h, z11.h\n"
+    "fmla z19.h, p3/M, z1.h, z11.h\n"
+    "fmla z18.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x10, x5, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z12.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "fmla z28.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x9, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z6.h, z10.h\n"
+    "fmla z27.h, p3/M, z3.h, z10.h\n"
+    "fmla z23.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x28, x5, LSL #1]\n"
+    "fmla z25.h, p3/M, z8.h, z11.h\n"
+    "fmla z24.h, p3/M, z7.h, z11.h\n"
+    "fmla z21.h, p3/M, z5.h, z11.h\n"
+    "fmla z20.h, p3/M, z4.h, z11.h\n"
+    "fmla z17.h, p3/M, z2.h, z11.h\n"
+    "fmla z16.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x5, LSL #1]\n"
+    "fmla z28.h, p3/M, z8.h, z12.h\n"
+    "fmla z24.h, p3/M, z5.h, z12.h\n"
+    "fmla z20.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x26, x5, LSL #1]\n"
+    "fmla z27.h, p3/M, z6.h, z10.h\n"
+    "fmla z23.h, p3/M, z3.h, z10.h\n"
+    "fmla z19.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x5, LSL #1]\n"
+    "fmla z22.h, p3/M, z7.h, z11.h\n"
+    "fmla z21.h, p3/M, z6.h, z11.h\n"
+    "fmla z23.h, p3/M, z8.h, z11.h\n"
+    "fmla z19.h, p3/M, z5.h, z11.h\n"
+    "fmla z18.h, p3/M, z4.h, z11.h\n"
+    "fmla z17.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n"
+    "fmla z24.h, p3/M, z8.h, z12.h\n"
+    "fmla z20.h, p3/M, z5.h, z12.h\n"
+    "fmla z16.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x23, x5, LSL #1]\n"
+    "fmla z19.h, p3/M, z8.h, z10.h\n"
+    "fmla z18.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x8, x5, LSL #1]\n"
+    "fmla z22.h, p3/M, z8.h, z11.h\n"
+    "fmla z21.h, p3/M, z7.h, z11.h\n"
+    "fmla z20.h, p3/M, z6.h, z11.h\n"
+    "fmla z18.h, p3/M, z5.h, z11.h\n"
+    "fmla z17.h, p3/M, z4.h, z11.h\n"
+    "fmla z16.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x17, x5, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "fmla z18.h, p3/M, z8.h, z12.h\n"
+    "fmla z17.h, p3/M, z7.h, z12.h\n"
+    "fmla z16.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x16, x5, LSL #1]\n"
+    "fmla z30.h, p3/M, z3.h, z10.h\n"
+    "fmla z27.h, p3/M, z1.h, z10.h\n"
+    "fmla z26.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x15, x5, LSL #1]\n"
+    "fmla z29.h, p3/M, z5.h, z11.h\n"
+    "fmla z28.h, p3/M, z4.h, z11.h\n"
+    "fmla z25.h, p3/M, z2.h, z11.h\n"
+    "fmla z24.h, p3/M, z1.h, z11.h\n"
+    "fmla z23.h, p3/M, z7.h, z12.h\n"
+    "fmla z22.h, p3/M, z6.h, z12.h\n"
+    "fmla z19.h, p3/M, z4.h, z12.h\n"
+    "fmla z18.h, p3/M, z3.h, z12.h\n"
+    "fmla z21.h, p3/M, z8.h, z10.h\n"
+    "fmla z20.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z5.h, z10.h\n"
+    "fmla z16.h, p3/M, z4.h, z10.h\n"
+    "fmax z31.h, p3/M, z31.h, z15.h\n"
+    "fmax z30.h, p3/M, z30.h, z15.h\n"
+    "fmax z29.h, p3/M, z29.h, z15.h\n"
+    "fmax z28.h, p3/M, z28.h, z15.h\n"
+    "fmin z31.h, p3/M, z31.h, z14.h\n"
+    "st1h { z31.h }, p0, [x22, x7, LSL #1]\n"
+    "fmin z30.h, p3/M, z30.h, z14.h\n"
+    "fmin z29.h, p3/M, z29.h, z14.h\n"
+    "ldr x22, [x2, #0x20]\n"
+    "fmin z28.h, p3/M, z28.h, z14.h\n"
+    "st1h { z30.h }, p0, [x21, x7, LSL #1]\n"
+    "fmax z27.h, p3/M, z27.h, z15.h\n"
+    "fmax z26.h, p3/M, z26.h, z15.h\n"
+    "st1h { z29.h }, p0, [x20, x7, LSL #1]\n"
+    "fmax z25.h, p3/M, z25.h, z15.h\n"
+    "st1h { z28.h }, p0, [x19, x7, LSL #1]\n"
+    "fmax z24.h, p3/M, z24.h, z15.h\n"
+    "ldr x21, [x2, #0x28]\n"
+    "fmax z23.h, p3/M, z23.h, z15.h\n"
+    "ldr x20, [x2, #0x30]\n"
+    "fmin z27.h, p3/M, z27.h, z14.h\n"
+    "ldr x19, [x2, #0x38]\n"
+    "fmin z26.h, p3/M, z26.h, z14.h\n"
+    "st1h { z27.h }, p0, [x22, x7, LSL #1]\n"
+    "fmin z25.h, p3/M, z25.h, z14.h\n"
+    "fmin z24.h, p3/M, z24.h, z14.h\n"
+    "st1h { z26.h }, p0, [x21, x7, LSL #1]\n"
+    "fmin z23.h, p3/M, z23.h, z14.h\n"
+    "ldr x22, [x2, #0x40]\n"
+    "fmax z22.h, p3/M, z22.h, z15.h\n"
+    "ldr x21, [x2, #0x48]\n"
+    "fmax z21.h, p3/M, z21.h, z15.h\n"
+    "st1h { z25.h }, p0, [x20, x7, LSL #1]\n"
+    "fmax z20.h, p3/M, z20.h, z15.h\n"
+    "st1h { z24.h }, p0, [x19, x7, LSL #1]\n"
+    "fmax z19.h, p3/M, z19.h, z15.h\n"
+    "st1h { z23.h }, p0, [x22, x7, LSL #1]\n"
+    "fmin z22.h, p3/M, z22.h, z14.h\n"
+    "ldr x20, [x2, #0x50]\n"
+    "fmin z21.h, p3/M, z21.h, z14.h\n"
+    "ldr x19, [x2, #0x58]\n"
+    "fmin z20.h, p3/M, z20.h, z14.h\n"
+    "ldr x22, [x2, #0x60]\n"
+    "fmin z19.h, p3/M, z19.h, z14.h\n"
+    "st1h { z22.h }, p0, [x21, x7, LSL #1]\n"
+    "fmax z18.h, p3/M, z18.h, z15.h\n"
+    "st1h { z21.h }, p0, [x20, x7, LSL #1]\n"
+    "fmax z17.h, p3/M, z17.h, z15.h\n"
+    "st1h { z20.h }, p0, [x19, x7, LSL #1]\n"
+    "fmax z16.h, p3/M, z16.h, z15.h\n"
+    "st1h { z19.h }, p0, [x22, x7, LSL #1]\n"
+    "ldr x21, [x2, #0x68]\n"
+    "fmin z18.h, p3/M, z18.h, z14.h\n"
+    "ldr x20, [x2, #0x70]\n"
+    "fmin z17.h, p3/M, z17.h, z14.h\n"
+    "ldr x19, [x2, #0x78]\n"
+    "fmin z16.h, p3/M, z16.h, z14.h\n"
+    "st1h { z18.h }, p0, [x21, x7, LSL #1]\n"
+    "st1h { z17.h }, p0, [x20, x7, LSL #1]\n"
+    "st1h { z16.h }, p0, [x19, x7, LSL #1]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..98f50f8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
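+  // The input tile follows input = kernel + (output - 1) * stride,
+  // i.e. 3 + (2 - 1) * 2 = 5 rows and columns.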
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
+  indirect_kern_type indirect_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+  sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..e620604
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
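+  // Args mirrors the kernel parameters in a single struct so that the inline
+  // assembly below can reach every field through offsetof() operands.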
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
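+  // Label 1 iterates over output tiles; label 2 is the channel loop, which
+  // processes one full SVE vector of channels per pass; label 3 is the
+  // channel tail for the remainder, guarded by the whilelt predicate p2.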
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x7, #0x0\n"
+    "mov x8, #0x0\n"
+    "1:"  // Tile loop
+    "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x23, #0x4\n"
+    "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x17, #0x2\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+    "mov x15, #0x0\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "cnth x14\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "sub x12, XZR, x14\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x7, x22\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x8, x13, x19\n" // offset += tile_j * ld_input_col
+    "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x23\n" // offset *= kernel_stride * output_size
+    "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x21, x21, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "add x9, x21, x22, LSL #1\n"
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x28, x9, x22, LSL #1\n"
+    "ld1h { z17.h }, p3/Z, [x16]\n" // Load from weights and bias
+    "mov z31.d, z17.d\n"
+    "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n" // Load from weights and bias
+    "add x27, x28, x22, LSL #1\n"
+    "mov z30.d, z17.d\n"
+    "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n" // Load from weights and bias
+    "add x26, x27, x22, LSL #1\n"
+    "mov z29.d, z17.d\n"
+    "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n" // Load from weights and bias
+    "add x25, x13, x13\n"
+    "mov z28.d, z17.d\n"
+    "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n" // Load from weights and bias
+    "add x24, x25, x13\n"
+    "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n" // Load from weights and bias
+    "add x23, x24, x13\n"
+    "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n" // Load from weights and bias
+    "mul x19, x7, x20\n" // offset = tile_i * ld_output_row
+    "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n" // Load from weights and bias
+    "madd x19, x8, x11, x19\n" // offset += tile_j * ld_output_col
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z9.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (2, 2)
+    "ld1h { z10.h }, p2/Z, [x21]\n" // Load input point (0, 0)
+    "mul x19, x19, x17\n" // offset *= output_tile_size
+    "ld1h { z11.h }, p2/Z, [x21, x13, LSL #1]\n" // Load input point (0, 1)
+    "add x10, x10, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "ld1h { z12.h }, p2/Z, [x21, x24, LSL #1]\n" // Load input point (0, 3)
+    "add x22, x10, x20, LSL #1\n"
+    "ld1h { z13.h }, p2/Z, [x21, x23, LSL #1]\n" // Load input point (0, 4)
+    "addvl x16, x16, #16\n"
+    "ld1h { z14.h }, p2/Z, [x9]\n" // Load input point (1, 0)
+    "cmp x14, %x[n_channels]\n"
+    "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n" // Load from weights and bias
+    "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x16, x16, #-6\n"
+    "ld1h { z15.h }, p2/Z, [x9, x13, LSL #1]\n" // Load input point (1, 1)
+    "ld1h { z16.h }, p2/Z, [x21, x25, LSL #1]\n" // Load input point (0, 2)
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "fmla z31.h, p3/M, z8.h, z9.h\n"
+    "ld1h { z17.h }, p3/Z, [x16]\n" // Load from weights and bias
+    "whilelt p1.h, x14, %x[n_channels]\n"
+    "fmla z30.h, p3/M, z6.h, z9.h\n"
+    "inch x12\n"
+    "fmla z29.h, p3/M, z2.h, z9.h\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.h, p3/M, z0.h, z9.h\n"
+    "inch x15\n"
+    "addvl x21, x21, #1\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p1/Z, [x21]\n" // Load input point (0, 0)
+    "fmla z30.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x9, x23, LSL #1]\n" // Load input point (1, 4)
+    "inch x14\n"
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x9, x24, LSL #1]\n" // Load input point (1, 3)
+    "fmla z30.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (1, 2)
+    "addvl x9, x9, #1\n"
+    "fmla z31.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x27]\n" // Load input point (3, 0)
+    "fmla z30.h, p3/M, z0.h, z16.h\n"
+    "fmla z29.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (3, 4)
+    "fmla z31.h, p3/M, z4.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x28]\n" // Load input point (2, 0)
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x13, LSL #1]\n" // Load input point (3, 1)
+    "fmla z29.h, p3/M, z0.h, z15.h\n"
+    "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x28, x13, LSL #1]\n" // Load input point (2, 1)
+    "fmla z30.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x28, x24, LSL #1]\n" // Load input point (2, 3)
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x28, x23, LSL #1]\n" // Load input point (2, 4)
+    "addvl x28, x28, #1\n"
+    "fmla z31.h, p3/M, z5.h, z13.h\n"
+    "ld1h { z9.h }, p1/Z, [x28, x25, LSL #1]\n" // Load input point (2, 2)
+    "fmla z30.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (3, 3)
+    "fmla z29.h, p3/M, z1.h, z16.h\n"
+    "fmla z31.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x26]\n" // Load input point (4, 0)
+    "fmla z28.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x26, x13, LSL #1]\n" // Load input point (4, 1)
+    "fmla z30.h, p3/M, z7.h, z12.h\n"
+    "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n" // Load from weights and bias
+    "fmla z29.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x26, x25, LSL #1]\n" // Load input point (4, 2)
+    "fmla z31.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x27, x25, LSL #1]\n" // Load input point (3, 2)
+    "addvl x27, x27, #1\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x21, x24, LSL #1]\n" // Load input point (0, 3)
+    "fmla z30.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z29.h, p3/M, z7.h, z13.h\n"
+    "ld1h { z13.h }, p1/Z, [x21, x23, LSL #1]\n" // Load input point (0, 4)
+    "fmax z31.h, p3/M, z31.h, z19.h\n"
+    "fmla z28.h, p3/M, z5.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x26, x24, LSL #1]\n" // Load input point (4, 3)
+    "fmax z30.h, p3/M, z30.h, z19.h\n"
+    "fmla z29.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n" // Load from weights and bias
+    "fmin z31.h, p3/M, z31.h, z18.h\n"
+    "st1h { z31.h }, p0, [x10]\n" // Store output point (0, 0)
+    "mov z31.d, z17.d\n"
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x26, x23, LSL #1]\n" // Load input point (4, 4)
+    "whilelt p2.h, x15, %x[n_channels]\n"
+    "fmla z29.h, p3/M, z8.h, z15.h\n"
+    "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n" // Load from weights and bias
+    "addvl x26, x26, #1\n"
+    "fmin z30.h, p3/M, z30.h, z18.h\n"
+    "st1h { z30.h }, p0, [x10, x11, LSL #1]\n" // Store output point (0, 1)
+    "mov z30.d, z17.d\n"
+    "addvl x10, x10, #1\n"
+    "fmla z28.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z16.h }, p1/Z, [x21, x25, LSL #1]\n" // Load input point (0, 2)
+    "cmp x14, %x[n_channels]\n"
+    "fmax z29.h, p3/M, z29.h, z19.h\n"
+    "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n" // Load from weights and bias
+    "fmla z28.h, p3/M, z7.h, z14.h\n"
+    "ld1h { z14.h }, p1/Z, [x9]\n" // Load input point (1, 0)
+    "fmin z29.h, p3/M, z29.h, z18.h\n"
+    "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+    "mov z29.d, z17.d\n"
+    "fmla z28.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p1/Z, [x9, x13, LSL #1]\n" // Load input point (1, 1)
+    "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n" // Load from weights and bias
+    "fmla z28.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x21, x13, LSL #1]\n" // Load input point (0, 1)
+    "addvl x16, x16, #16\n"
+    "fmax z28.h, p3/M, z28.h, z19.h\n"
+    "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n" // Load from weights and bias
+    "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n" // Load from weights and bias
+    "fmin z28.h, p3/M, z28.h, z18.h\n"
+    "st1h { z28.h }, p0, [x22, x11, LSL #1]\n" // Store output point (1, 1)
+    "mov z28.d, z17.d\n"
+    "addvl x22, x22, #1\n"
+    "addvl x16, x16, #-6\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "fmla z31.h, p3/M, z8.h, z9.h\n"
+    "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z30.h, p3/M, z6.h, z9.h\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "add x21, x7, #0x1\n"
+    "fmla z29.h, p3/M, z2.h, z9.h\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z28.h, p3/M, z0.h, z9.h\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "add x8, x8, #0x1\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "cmp x8, x19\n"
+    "fmla z30.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x9, x23, LSL #1]\n" // Load input point (1, 4)
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x9, x24, LSL #1]\n" // Load input point (1, 3)
+    "csel x8, x8, XZR, LT\n"
+    "fmla z30.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (1, 2)
+    "csel x7, x7, x21, LT\n"
+    "fmla z31.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x27]\n" // Load input point (3, 0)
+    "cmp x7, x20\n"
+    "fmla z30.h, p3/M, z0.h, z16.h\n"
+    "fmla z29.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (3, 4)
+    "fmla z31.h, p3/M, z4.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x28]\n" // Load input point (2, 0)
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x13, LSL #1]\n" // Load input point (3, 1)
+    "fmla z29.h, p3/M, z0.h, z15.h\n"
+    "fmla z31.h, p3/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x28, x13, LSL #1]\n" // Load input point (2, 1)
+    "fmla z30.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x28, x24, LSL #1]\n" // Load input point (2, 3)
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x28, x23, LSL #1]\n" // Load input point (2, 4)
+    "fmla z31.h, p3/M, z5.h, z13.h\n"
+    "fmla z30.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (3, 3)
+    "fmla z29.h, p3/M, z1.h, z16.h\n"
+    "fmla z31.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x26]\n" // Load input point (4, 0)
+    "fmla z28.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x26, x13, LSL #1]\n" // Load input point (4, 1)
+    "fmla z30.h, p3/M, z7.h, z12.h\n"
+    "fmla z29.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x26, x25, LSL #1]\n" // Load input point (4, 2)
+    "fmla z31.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x27, x25, LSL #1]\n" // Load input point (3, 2)
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "fmla z30.h, p3/M, z8.h, z11.h\n"
+    "fmla z29.h, p3/M, z7.h, z13.h\n"
+    "fmax z31.h, p3/M, z31.h, z19.h\n"
+    "fmla z28.h, p3/M, z5.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x26, x24, LSL #1]\n" // Load input point (4, 3)
+    "fmax z30.h, p3/M, z30.h, z19.h\n"
+    "fmla z29.h, p3/M, z5.h, z16.h\n"
+    "fmin z31.h, p3/M, z31.h, z18.h\n"
+    "st1h { z31.h }, p0, [x10]\n" // Store output point (0, 0)
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "fmla z29.h, p3/M, z8.h, z15.h\n"
+    "ld1h { z11.h }, p2/Z, [x26, x23, LSL #1]\n" // Load input point (4, 4)
+    "fmin z30.h, p3/M, z30.h, z18.h\n"
+    "st1h { z30.h }, p0, [x10, x11, LSL #1]\n" // Store output point (0, 1)
+    "fmla z28.h, p3/M, z3.h, z16.h\n"
+    "fmax z29.h, p3/M, z29.h, z19.h\n"
+    "fmla z28.h, p3/M, z7.h, z14.h\n"
+    "fmin z29.h, p3/M, z29.h, z18.h\n"
+    "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+    "fmla z28.h, p3/M, z6.h, z15.h\n"
+    "fmla z28.h, p3/M, z8.h, z11.h\n"
+    "fmax z28.h, p3/M, z28.h, z19.h\n"
+    "fmin z28.h, p3/M, z28.h, z18.h\n"
+    "st1h { z28.h }, p0, [x22, x11, LSL #1]\n" // Store output point (1, 1)
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..3ed743e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[25];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
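+      // Re-order the 25 pointers of the row-major 5x5 input patch into the
+      // sequence in which the assembly consumes them, so they can be loaded
+      // with simple incrementing offsets from this array.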
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[1];
+      inptrs[3] = input_ptrs[3];
+      inptrs[4] = input_ptrs[4];
+      inptrs[5] = input_ptrs[5];
+      inptrs[6] = input_ptrs[6];
+      inptrs[7] = input_ptrs[2];
+      inptrs[8] = input_ptrs[8];
+      inptrs[9] = input_ptrs[9];
+      inptrs[10] = input_ptrs[7];
+      inptrs[11] = input_ptrs[15];
+      inptrs[12] = input_ptrs[10];
+      inptrs[13] = input_ptrs[16];
+      inptrs[14] = input_ptrs[11];
+      inptrs[15] = input_ptrs[18];
+      inptrs[16] = input_ptrs[13];
+      inptrs[17] = input_ptrs[19];
+      inptrs[18] = input_ptrs[20];
+      inptrs[19] = input_ptrs[14];
+      inptrs[20] = input_ptrs[21];
+      inptrs[21] = input_ptrs[17];
+      inptrs[22] = input_ptrs[23];
+      inptrs[23] = input_ptrs[22];
+      inptrs[24] = input_ptrs[24];
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
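+  // Same loop structure as the direct variant, but every input address is
+  // taken indirectly from the inptrs table instead of being derived from
+  // row/column strides.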
+  __asm__ __volatile__(
+    "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ptrue p3.b\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "mov x14, #0x0\n"
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "cnth x13\n"
+    "ldp x12, x11, [x19, #0x0]\n"
+    "sub x10, XZR, x13\n"
+    "ldp x9, x28, [x19, #0x10]\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z17.h }, p3/Z, [x16]\n" // Load from weights and bias
+    "mov z31.d, z17.d\n"
+    "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n" // Load from weights and bias
+    "cmp x13, %x[n_channels]\n"
+    "mov z30.d, z17.d\n"
+    "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n" // Load from weights and bias
+    "mov z29.d, z17.d\n"
+    "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n" // Load from weights and bias
+    "mov z28.d, z17.d\n"
+    "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n" // Load from weights and bias
+    "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n" // Load from weights and bias
+    "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n" // Load from weights and bias
+    "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n" // Load from weights and bias
+    "addvl x16, x16, #16\n"
+    "ldp x27, x26, [x15, #0x0]\n"
+    "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n" // Load from weights and bias
+    "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x16, x16, #-6\n"
+    "ld1h { z9.h }, p2/Z, [x27, x14, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x26, x14, LSL #1]\n"
+    "ldp x25, x23, [x15, #0x10]\n"
+    "ldp x22, x21, [x15, #0x20]\n"
+    "ldp x20, x19, [x15, #0x30]\n"
+    "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x23, x14, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x22, x14, LSL #1]\n"
+    "ld1h { z14.h }, p2/Z, [x21, x14, LSL #1]\n"
+    "ld1h { z15.h }, p2/Z, [x20, x14, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x19, x14, LSL #1]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "fmla z31.h, p3/M, z8.h, z9.h\n"
+    "ldr x24, [x15, #0x40]\n"
+    "whilelt p1.h, x13, %x[n_channels]\n"
+    "fmla z30.h, p3/M, z6.h, z9.h\n"
+    "ldr x20, [x15, #0x48]\n"
+    "inch x10\n"
+    "fmla z29.h, p3/M, z2.h, z9.h\n"
+    "ldr x23, [x15, #0x50]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.h, p3/M, z0.h, z9.h\n"
+    "ldr x19, [x15, #0x58]\n"
+    "ldr x22, [x15, #0x60]\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ldr x21, [x15, #0x68]\n"
+    "fmla z30.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x20, x14, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
+    "ldr x20, [x15, #0x70]\n"
+    "fmla z30.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x23, x14, LSL #1]\n"
+    "fmla z31.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x19, x14, LSL #1]\n"
+    "ldr x19, [x15, #0x78]\n"
+    "fmla z30.h, p3/M, z0.h, z16.h\n"
+    "ldr x27, [x15, #0x80]\n"
+    "fmla z29.h, p3/M, z3.h, z14.h\n"
+    "ldr x26, [x15, #0x88]\n"
+    "ldr x25, [x15, #0x90]\n"
+    "fmla z31.h, p3/M, z4.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x22, x14, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x21, x14, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z15.h\n"
+    "ld1h { z14.h }, p2/Z, [x26, x14, LSL #1]\n"
+    "ldr x23, [x15, #0x98]\n"
+    "fmla z31.h, p3/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x20, x14, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x23, x14, LSL #1]\n"
+    "ldr x22, [x15, #0xa0]\n"
+    "fmla z31.h, p3/M, z5.h, z13.h\n"
+    "ldr x21, [x15, #0xa8]\n"
+    "fmla z30.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x19, x14, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z16.h\n"
+    "ldr x20, [x15, #0xb0]\n"
+    "ldr x19, [x15, #0xb8]\n"
+    "fmla z31.h, p3/M, z6.h, z15.h\n"
+    "fmla z28.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z15.h }, p2/Z, [x25, x14, LSL #1]\n"
+    "fmla z30.h, p3/M, z7.h, z12.h\n"
+    "ld1h { z13.h }, p2/Z, [x22, x14, LSL #1]\n"
+    "ldr x24, [x15, #0xc0]\n"
+    "fmla z31.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x14, LSL #1]\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "ldp x27, x26, [x15, #0x0]\n"
+    "fmla z29.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x19, x14, LSL #1]\n"
+    "fmla z30.h, p3/M, z8.h, z11.h\n"
+    "ldp x25, x23, [x15, #0x10]\n"
+    "ldp x22, x21, [x15, #0x20]\n"
+    "fmla z28.h, p3/M, z5.h, z14.h\n"
+    "fmax z31.h, p3/M, z31.h, z19.h\n"
+    "ld1h { z14.h }, p2/Z, [x20, x14, LSL #1]\n"
+    "fmla z29.h, p3/M, z7.h, z13.h\n"
+    "ld1h { z9.h }, p1/Z, [x27, x13, LSL #1]\n"
+    "fmax z30.h, p3/M, z30.h, z19.h\n"
+    "ld1h { z10.h }, p1/Z, [x26, x13, LSL #1]\n"
+    "ld1h { z12.h }, p1/Z, [x23, x13, LSL #1]\n"
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "fmin z31.h, p3/M, z31.h, z18.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
+    "inch x14\n"
+    "fmla z29.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z13.h }, p1/Z, [x22, x13, LSL #1]\n"
+    "whilelt p2.h, x14, %x[n_channels]\n"
+    "fmin z30.h, p3/M, z30.h, z18.h\n"
+    "ldp x20, x19, [x15, #0x30]\n"
+    "ld1h { z17.h }, p3/Z, [x16]\n" // Load from weights and bias
+    "fmla z28.h, p3/M, z3.h, z16.h\n"
+    "st1h { z31.h }, p0, [x12, x10, LSL #1]\n"
+    "mov z31.d, z17.d\n"
+    "ld1h { z16.h }, p1/Z, [x19, x13, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z15.h\n"
+    "st1h { z30.h }, p0, [x11, x10, LSL #1]\n"
+    "mov z30.d, z17.d\n"
+    "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z28.h, p3/M, z7.h, z14.h\n"
+    "ld1h { z14.h }, p1/Z, [x21, x13, LSL #1]\n"
+    "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n" // Load from weights and bias
+    "fmax z29.h, p3/M, z29.h, z19.h\n"
+    "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z28.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p1/Z, [x20, x13, LSL #1]\n"
+    "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n" // Load from weights and bias
+    "fmin z29.h, p3/M, z29.h, z18.h\n"
+    "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n" // Load from weights and bias
+    "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n" // Load from weights and bias
+    "fmla z28.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x25, x13, LSL #1]\n"
+    "inch x13\n"
+    "fmax z28.h, p3/M, z28.h, z19.h\n"
+    "st1h { z29.h }, p0, [x9, x10, LSL #1]\n"
+    "cmp x13, %x[n_channels]\n"
+    "mov z29.d, z17.d\n"
+    "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n" // Load from weights and bias
+    "addvl x16, x16, #16\n"
+    "fmin z28.h, p3/M, z28.h, z18.h\n"
+    "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n" // Load from weights and bias
+    "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x16, x16, #-6\n"
+    "st1h { z28.h }, p0, [x28, x10, LSL #1]\n"
+    "mov z28.d, z17.d\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "fmla z31.h, p3/M, z8.h, z9.h\n"
+    "ldr x24, [x15, #0x40]\n"
+    "inch x10\n"
+    "fmla z30.h, p3/M, z6.h, z9.h\n"
+    "ldr x20, [x15, #0x48]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z29.h, p3/M, z2.h, z9.h\n"
+    "ldr x23, [x15, #0x50]\n"
+    "fmla z28.h, p3/M, z0.h, z9.h\n"
+    "ldr x19, [x15, #0x58]\n"
+    "ldr x22, [x15, #0x60]\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ldr x21, [x15, #0x68]\n"
+    "fmla z30.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x20, x14, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
+    "ldr x20, [x15, #0x70]\n"
+    "fmla z30.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x23, x14, LSL #1]\n"
+    "fmla z31.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x19, x14, LSL #1]\n"
+    "ldr x19, [x15, #0x78]\n"
+    "fmla z30.h, p3/M, z0.h, z16.h\n"
+    "ldr x27, [x15, #0x80]\n"
+    "fmla z29.h, p3/M, z3.h, z14.h\n"
+    "ldr x26, [x15, #0x88]\n"
+    "ldr x25, [x15, #0x90]\n"
+    "fmla z31.h, p3/M, z4.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x22, x14, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x21, x14, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z15.h\n"
+    "ld1h { z14.h }, p2/Z, [x26, x14, LSL #1]\n"
+    "ldr x23, [x15, #0x98]\n"
+    "fmla z31.h, p3/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x20, x14, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x23, x14, LSL #1]\n"
+    "ldr x22, [x15, #0xa0]\n"
+    "fmla z31.h, p3/M, z5.h, z13.h\n"
+    "ldr x21, [x15, #0xa8]\n"
+    "fmla z30.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x19, x14, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z16.h\n"
+    "ldr x20, [x15, #0xb0]\n"
+    "ldr x19, [x15, #0xb8]\n"
+    "fmla z31.h, p3/M, z6.h, z15.h\n"
+    "fmla z28.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z15.h }, p2/Z, [x25, x14, LSL #1]\n"
+    "fmla z30.h, p3/M, z7.h, z12.h\n"
+    "ld1h { z13.h }, p2/Z, [x22, x14, LSL #1]\n"
+    "ldr x24, [x15, #0xc0]\n"
+    "fmla z31.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x14, LSL #1]\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "fmla z29.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x19, x14, LSL #1]\n"
+    "fmla z30.h, p3/M, z8.h, z11.h\n"
+    "fmla z28.h, p3/M, z5.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x20, x14, LSL #1]\n"
+    "fmax z31.h, p3/M, z31.h, z19.h\n"
+    "fmla z29.h, p3/M, z7.h, z13.h\n"
+    "fmax z30.h, p3/M, z30.h, z19.h\n"
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
+    "fmin z31.h, p3/M, z31.h, z18.h\n"
+    "st1h { z31.h }, p0, [x12, x10, LSL #1]\n"
+    "fmla z29.h, p3/M, z5.h, z16.h\n"
+    "fmla z28.h, p3/M, z3.h, z16.h\n"
+    "fmin z30.h, p3/M, z30.h, z18.h\n"
+    "st1h { z30.h }, p0, [x11, x10, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z14.h\n"
+    "fmla z29.h, p3/M, z8.h, z15.h\n"
+    "fmla z28.h, p3/M, z6.h, z15.h\n"
+    "fmax z29.h, p3/M, z29.h, z19.h\n"
+    "fmla z28.h, p3/M, z8.h, z11.h\n"
+    "fmin z29.h, p3/M, z29.h, z18.h\n"
+    "st1h { z29.h }, p0, [x9, x10, LSL #1]\n"
+    "fmax z28.h, p3/M, z28.h, z19.h\n"
+    "fmin z28.h, p3/M, z28.h, z18.h\n"
+    "st1h { z28.h }, p0, [x28, x10, LSL #1]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..20f3ee0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
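+  // The input tile follows input = kernel + (output - 1) * stride,
+  // i.e. 5 + (2 - 1) * 1 = 6 rows and columns.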
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  indirect_kern_type indirect_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+  sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..f1ee5c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,531 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
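+  // The assembly walks the 2x2 output tiles row-major ("Tile loop"); within a
+  // tile, the channel loop consumes one SVE vector of channels per pass, with
+  // whilelt predication handling the final partial vector ("Channel tail").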
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x5, #0x0\n"
+    "mov x6, #0x0\n"
+    "1:"  // Tile loop
+    "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x20, #0x2\n"
+    "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x7, #0x2\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+    "mov x17, #0x0\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "cnth x16\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "sub x14, XZR, x16\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x5, x22\n" // offset = tile_i * ld_input_row
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x6, x15, x19\n" // offset += tile_j * ld_input_col
+    "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x20\n" // offset *= kernel_stride * output_size
+    "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x13, x13, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "add x20, x13, x22, LSL #1\n"
+    "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x10, x20, x22, LSL #1\n"
+    "ld1h { z16.h }, p3/Z, [x8]\n" // Load from weights and bias
+    "mov z31.d, z16.d\n"
+    "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+    "add x9, x10, x22, LSL #1\n"
+    "mov z30.d, z16.d\n"
+    "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+    "add x28, x9, x22, LSL #1\n"
+    "mov z29.d, z16.d\n"
+    "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+    "add x27, x28, x22, LSL #1\n"
+    "mov z28.d, z16.d\n"
+    "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias
+    "add x26, x15, x15\n"
+    "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias
+    "add x25, x26, x15\n"
+    "mul x19, x5, x21\n" // offset = tile_i * ld_output_row
+    "add x24, x25, x15\n"
+    "add x23, x24, x15\n"
+    "madd x19, x6, x12, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x7\n" // offset *= output_tile_size
+    "add x11, x11, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "add x22, x11, x21, LSL #1\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z5.h }, p2/Z, [x13]\n" // Load input point (0, 0)
+    "ld1h { z6.h }, p2/Z, [x13, x15, LSL #1]\n" // Load input point (0, 1)
+    "cmp x16, %x[n_channels]\n"
+    "ld1h { z7.h }, p2/Z, [x20]\n" // Load input point (1, 0)
+    "addvl x8, x8, #6\n"
+    "ld1h { z8.h }, p2/Z, [x20, x15, LSL #1]\n" // Load input point (1, 1)
+    "ld1h { z9.h }, p2/Z, [x13, x26, LSL #1]\n" // Load input point (0, 2)
+    "ld1h { z13.h }, p2/Z, [x20, x26, LSL #1]\n" // Load input point (1, 2)
+    "ld1h { z11.h }, p2/Z, [x13, x25, LSL #1]\n" // Load input point (0, 3)
+    "ld1h { z12.h }, p2/Z, [x13, x24, LSL #1]\n" // Load input point (0, 4)
+    "ld1h { z10.h }, p2/Z, [x20, x23, LSL #1]\n" // Load input point (1, 5)
+    "ld1h { z14.h }, p2/Z, [x10]\n" // Load input point (2, 0)
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "fmla z31.h, p3/M, z0.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x20, x25, LSL #1]\n" // Load input point (1, 3)
+    "whilelt p1.h, x16, %x[n_channels]\n"
+    "fmla z30.h, p3/M, z0.h, z6.h\n"
+    "inch x14\n"
+    "fmla z29.h, p3/M, z0.h, z7.h\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.h, p3/M, z0.h, z8.h\n"
+    "ld1h { z0.h }, p3/Z, [x8]\n" // Load from weights and bias
+    "inch x17\n"
+    "fmla z31.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x20, x24, LSL #1]\n" // Load input point (1, 4)
+    "addvl x20, x20, #1\n"
+    "fmla z30.h, p3/M, z1.h, z9.h\n"
+    "inch x16\n"
+    "fmla z29.h, p3/M, z1.h, z8.h\n"
+    "fmla z28.h, p3/M, z1.h, z13.h\n"
+    "ld1h { z1.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x13, x23, LSL #1]\n" // Load input point (0, 5)
+    "addvl x13, x13, #1\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z29.h, p3/M, z2.h, z13.h\n"
+    "fmla z28.h, p3/M, z2.h, z5.h\n"
+    "ld1h { z2.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n" // Load input point (2, 1)
+    "fmla z30.h, p3/M, z3.h, z12.h\n"
+    "fmla z29.h, p3/M, z3.h, z5.h\n"
+    "fmla z28.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z3.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x10, x26, LSL #1]\n" // Load input point (2, 2)
+    "fmla z30.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x10, x25, LSL #1]\n" // Load input point (2, 3)
+    "fmla z29.h, p3/M, z4.h, z6.h\n"
+    "fmla z28.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z7.h\n"
+    "ld1h { z7.h }, p1/Z, [x20]\n" // Load input point (1, 0)
+    "fmla z30.h, p3/M, z0.h, z8.h\n"
+    "fmla z29.h, p3/M, z0.h, z14.h\n"
+    "fmla z28.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x10, x23, LSL #1]\n" // Load input point (2, 5)
+    "fmla z30.h, p3/M, z1.h, z13.h\n"
+    "fmla z29.h, p3/M, z1.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z1.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x10, x24, LSL #1]\n" // Load input point (2, 4)
+    "addvl x10, x10, #1\n"
+    "fmla z30.h, p3/M, z2.h, z5.h\n"
+    "fmla z29.h, p3/M, z2.h, z12.h\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z2.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias
+    "addvl x8, x8, #16\n"
+    "fmla z31.h, p3/M, z3.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x9]\n" // Load input point (3, 0)
+    "ld1h { z16.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias
+    "fmla z30.h, p3/M, z3.h, z6.h\n"
+    "fmla z29.h, p3/M, z3.h, z9.h\n"
+    "fmla z28.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z3.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x9, x15, LSL #1]\n" // Load input point (3, 1)
+    "fmla z30.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x9, x26, LSL #1]\n" // Load input point (3, 2)
+    "fmla z29.h, p3/M, z4.h, z13.h\n"
+    "fmla z28.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z4.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x9, x23, LSL #1]\n" // Load input point (3, 5)
+    "fmla z30.h, p3/M, z0.h, z11.h\n"
+    "fmla z29.h, p3/M, z0.h, z5.h\n"
+    "fmla z28.h, p3/M, z0.h, z6.h\n"
+    "ld1h { z0.h }, p3/Z, [x8, #-6, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (3, 3)
+    "fmla z30.h, p3/M, z1.h, z12.h\n"
+    "fmla z29.h, p3/M, z1.h, z6.h\n"
+    "fmla z28.h, p3/M, z1.h, z10.h\n"
+    "ld1h { z1.h }, p3/Z, [x8, #-5, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x9, x24, LSL #1]\n" // Load input point (3, 4)
+    "addvl x9, x9, #1\n"
+    "fmla z30.h, p3/M, z2.h, z9.h\n"
+    "fmla z29.h, p3/M, z2.h, z10.h\n"
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z2.h }, p3/Z, [x8, #-4, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z3.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x28]\n" // Load input point (4, 0)
+    "fmla z30.h, p3/M, z3.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z3.h }, p3/Z, [x8, #-3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n" // Load input point (4, 1)
+    "fmla z30.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x28, x24, LSL #1]\n" // Load input point (4, 4)
+    "fmla z29.h, p3/M, z4.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z14.h\n"
+    "ld1h { z4.h }, p3/Z, [x8, #-2, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x28, x26, LSL #1]\n" // Load input point (4, 2)
+    "fmla z30.h, p3/M, z0.h, z6.h\n"
+    "fmla z29.h, p3/M, z0.h, z9.h\n"
+    "fmla z28.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z0.h }, p3/Z, [x8, #-1, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (4, 3)
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z29.h, p3/M, z1.h, z13.h\n"
+    "fmla z28.h, p3/M, z1.h, z5.h\n"
+    "ld1h { z1.h }, p3/Z, [x8]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x28, x23, LSL #1]\n" // Load input point (4, 5)
+    "addvl x28, x28, #1\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z29.h, p3/M, z2.h, z5.h\n"
+    "fmla z28.h, p3/M, z2.h, z6.h\n"
+    "ld1h { z2.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27]\n" // Load input point (5, 0)
+    "fmla z30.h, p3/M, z3.h, z12.h\n"
+    "fmla z29.h, p3/M, z3.h, z6.h\n"
+    "fmla z28.h, p3/M, z3.h, z8.h\n"
+    "ld1h { z3.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n" // Load input point (5, 1)
+    "fmla z30.h, p3/M, z4.h, z14.h\n"
+    "ld1h { z14.h }, p1/Z, [x10]\n" // Load input point (2, 0)
+    "fmla z29.h, p3/M, z4.h, z8.h\n"
+    "fmla z28.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x27, x26, LSL #1]\n" // Load input point (5, 2)
+    "fmla z30.h, p3/M, z0.h, z13.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x25, LSL #1]\n" // Load input point (5, 3)
+    "fmla z28.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z0.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z13.h\n"
+    "ld1h { z13.h }, p1/Z, [x20, x26, LSL #1]\n" // Load input point (1, 2)
+    "fmla z30.h, p3/M, z1.h, z5.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (5, 4)
+    "fmla z28.h, p3/M, z1.h, z9.h\n"
+    "ld1h { z1.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z5.h\n"
+    "ld1h { z5.h }, p1/Z, [x13]\n" // Load input point (0, 0)
+    "fmla z30.h, p3/M, z2.h, z6.h\n"
+    "fmla z29.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (5, 5)
+    "whilelt p2.h, x17, %x[n_channels]\n"
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z2.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias
+    "addvl x27, x27, #1\n"
+    "fmla z31.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z6.h }, p1/Z, [x13, x15, LSL #1]\n" // Load input point (0, 1)
+    "addvl x8, x8, #16\n"
+    "fmla z30.h, p3/M, z3.h, z8.h\n"
+    "cmp x16, %x[n_channels]\n"
+    "fmla z29.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x13, x25, LSL #1]\n" // Load input point (0, 3)
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z3.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z8.h }, p1/Z, [x20, x15, LSL #1]\n" // Load input point (1, 1)
+    "fmla z30.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p1/Z, [x20, x23, LSL #1]\n" // Load input point (1, 5)
+    "fmla z29.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x13, x24, LSL #1]\n" // Load input point (0, 4)
+    "fmla z28.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z9.h }, p1/Z, [x13, x26, LSL #1]\n" // Load input point (0, 2)
+    "ld1h { z4.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias
+    "fmax z31.h, p3/M, z31.h, z18.h\n"
+    "addvl x8, x8, #-6\n"
+    "fmax z30.h, p3/M, z30.h, z18.h\n"
+    "fmax z29.h, p3/M, z29.h, z18.h\n"
+    "fmax z28.h, p3/M, z28.h, z18.h\n"
+    "fmin z31.h, p3/M, z31.h, z17.h\n"
+    "st1h { z31.h }, p0, [x11]\n" // Store output point (0, 0)
+    "mov z31.d, z16.d\n"
+    "fmin z30.h, p3/M, z30.h, z17.h\n"
+    "st1h { z30.h }, p0, [x11, x12, LSL #1]\n" // Store output point (0, 1)
+    "mov z30.d, z16.d\n"
+    "addvl x11, x11, #1\n"
+    "fmin z29.h, p3/M, z29.h, z17.h\n"
+    "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+    "mov z29.d, z16.d\n"
+    "fmin z28.h, p3/M, z28.h, z17.h\n"
+    "st1h { z28.h }, p0, [x22, x12, LSL #1]\n" // Store output point (1, 1)
+    "mov z28.d, z16.d\n"
+    "addvl x22, x22, #1\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "fmla z31.h, p3/M, z0.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x20, x25, LSL #1]\n" // Load input point (1, 3)
+    "mov p0.b, p2.b\n"
+    "fmla z30.h, p3/M, z0.h, z6.h\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x5, #0x1\n"
+    "fmla z29.h, p3/M, z0.h, z7.h\n"
+    "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "fmla z28.h, p3/M, z0.h, z8.h\n"
+    "ld1h { z0.h }, p3/Z, [x8]\n" // Load from weights and bias
+    "add x6, x6, #0x1\n"
+    "fmla z31.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x20, x24, LSL #1]\n" // Load input point (1, 4)
+    "fmla z30.h, p3/M, z1.h, z9.h\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z29.h, p3/M, z1.h, z8.h\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x6, x19\n"
+    "fmla z28.h, p3/M, z1.h, z13.h\n"
+    "ld1h { z1.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x13, x23, LSL #1]\n" // Load input point (0, 5)
+    "csel x6, x6, XZR, LT\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "csel x5, x5, x21, LT\n"
+    "fmla z29.h, p3/M, z2.h, z13.h\n"
+    "cmp x5, x20\n"
+    "fmla z28.h, p3/M, z2.h, z5.h\n"
+    "ld1h { z2.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n" // Load input point (2, 1)
+    "fmla z30.h, p3/M, z3.h, z12.h\n"
+    "fmla z29.h, p3/M, z3.h, z5.h\n"
+    "fmla z28.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z3.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x10, x26, LSL #1]\n" // Load input point (2, 2)
+    "fmla z30.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x10, x25, LSL #1]\n" // Load input point (2, 3)
+    "fmla z29.h, p3/M, z4.h, z6.h\n"
+    "fmla z28.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z7.h\n"
+    "fmla z30.h, p3/M, z0.h, z8.h\n"
+    "fmla z29.h, p3/M, z0.h, z14.h\n"
+    "fmla z28.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x10, x23, LSL #1]\n" // Load input point (2, 5)
+    "fmla z30.h, p3/M, z1.h, z13.h\n"
+    "fmla z29.h, p3/M, z1.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z1.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x10, x24, LSL #1]\n" // Load input point (2, 4)
+    "fmla z30.h, p3/M, z2.h, z5.h\n"
+    "fmla z29.h, p3/M, z2.h, z12.h\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z2.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias
+    "addvl x8, x8, #16\n"
+    "fmla z31.h, p3/M, z3.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x9]\n" // Load input point (3, 0)
+    "fmla z30.h, p3/M, z3.h, z6.h\n"
+    "fmla z29.h, p3/M, z3.h, z9.h\n"
+    "fmla z28.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z3.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x9, x15, LSL #1]\n" // Load input point (3, 1)
+    "fmla z30.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x9, x26, LSL #1]\n" // Load input point (3, 2)
+    "fmla z29.h, p3/M, z4.h, z13.h\n"
+    "fmla z28.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z4.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x9, x23, LSL #1]\n" // Load input point (3, 5)
+    "fmla z30.h, p3/M, z0.h, z11.h\n"
+    "fmla z29.h, p3/M, z0.h, z5.h\n"
+    "fmla z28.h, p3/M, z0.h, z6.h\n"
+    "ld1h { z0.h }, p3/Z, [x8, #-6, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (3, 3)
+    "fmla z30.h, p3/M, z1.h, z12.h\n"
+    "fmla z29.h, p3/M, z1.h, z6.h\n"
+    "fmla z28.h, p3/M, z1.h, z10.h\n"
+    "ld1h { z1.h }, p3/Z, [x8, #-5, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x9, x24, LSL #1]\n" // Load input point (3, 4)
+    "fmla z30.h, p3/M, z2.h, z9.h\n"
+    "fmla z29.h, p3/M, z2.h, z10.h\n"
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z2.h }, p3/Z, [x8, #-4, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z3.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x28]\n" // Load input point (4, 0)
+    "fmla z30.h, p3/M, z3.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z3.h }, p3/Z, [x8, #-3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n" // Load input point (4, 1)
+    "fmla z30.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x28, x24, LSL #1]\n" // Load input point (4, 4)
+    "fmla z29.h, p3/M, z4.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z14.h\n"
+    "ld1h { z4.h }, p3/Z, [x8, #-2, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x28, x26, LSL #1]\n" // Load input point (4, 2)
+    "fmla z30.h, p3/M, z0.h, z6.h\n"
+    "fmla z29.h, p3/M, z0.h, z9.h\n"
+    "fmla z28.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z0.h }, p3/Z, [x8, #-1, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (4, 3)
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z29.h, p3/M, z1.h, z13.h\n"
+    "fmla z28.h, p3/M, z1.h, z5.h\n"
+    "ld1h { z1.h }, p3/Z, [x8]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x28, x23, LSL #1]\n" // Load input point (4, 5)
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z29.h, p3/M, z2.h, z5.h\n"
+    "fmla z28.h, p3/M, z2.h, z6.h\n"
+    "ld1h { z2.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27]\n" // Load input point (5, 0)
+    "fmla z30.h, p3/M, z3.h, z12.h\n"
+    "fmla z29.h, p3/M, z3.h, z6.h\n"
+    "fmla z28.h, p3/M, z3.h, z8.h\n"
+    "ld1h { z3.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n" // Load input point (5, 1)
+    "fmla z30.h, p3/M, z4.h, z14.h\n"
+    "fmla z29.h, p3/M, z4.h, z8.h\n"
+    "fmla z28.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x27, x26, LSL #1]\n" // Load input point (5, 2)
+    "fmla z30.h, p3/M, z0.h, z13.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x25, LSL #1]\n" // Load input point (5, 3)
+    "fmla z28.h, p3/M, z0.h, z12.h\n"
+    "fmla z31.h, p3/M, z1.h, z13.h\n"
+    "fmla z30.h, p3/M, z1.h, z5.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (5, 4)
+    "fmla z28.h, p3/M, z1.h, z9.h\n"
+    "fmla z31.h, p3/M, z2.h, z5.h\n"
+    "fmla z30.h, p3/M, z2.h, z6.h\n"
+    "fmla z29.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (5, 5)
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z6.h\n"
+    "fmla z30.h, p3/M, z3.h, z8.h\n"
+    "fmla z29.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "fmla z31.h, p3/M, z4.h, z8.h\n"
+    "fmla z30.h, p3/M, z4.h, z10.h\n"
+    "fmla z29.h, p3/M, z4.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z9.h\n"
+    "fmax z31.h, p3/M, z31.h, z18.h\n"
+    "fmax z30.h, p3/M, z30.h, z18.h\n"
+    "fmax z29.h, p3/M, z29.h, z18.h\n"
+    "fmax z28.h, p3/M, z28.h, z18.h\n"
+    "fmin z31.h, p3/M, z31.h, z17.h\n"
+    "st1h { z31.h }, p0, [x11]\n" // Store output point (0, 0)
+    "fmin z30.h, p3/M, z30.h, z17.h\n"
+    "fmin z29.h, p3/M, z29.h, z17.h\n"
+    "st1h { z30.h }, p0, [x11, x12, LSL #1]\n" // Store output point (0, 1)
+    "fmin z28.h, p3/M, z28.h, z17.h\n"
+    "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+    "st1h { z28.h }, p0, [x22, x12, LSL #1]\n" // Store output point (1, 1)
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
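
For reference, the "Tile loop" bookkeeping above — tile_i and tile_j stored
to and reloaded from the Args struct on every iteration — amounts to the
following row-major walk (a plain C++ sketch; process_tile is hypothetical):

    for (unsigned int tile_i = 0, tile_j = 0;;)
    {
      process_tile(tile_i, tile_j);  // one 2x2 output tile, all channels
      if (++tile_j >= n_tile_cols) { tile_j = 0; ++tile_i; }
      if (tile_i >= n_tile_rows) break;
    }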
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..caa15a9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,559 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[36];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
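+      // The leading entries are permuted out of their row-major (6x6 window)
+      // order so the assembly preamble can load its initial working set in
+      // consumption order; the remaining pointers keep their natural positions.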
+      inptrs[0] = input_ptrs[0];
+      inptrs[1] = input_ptrs[1];
+      inptrs[2] = input_ptrs[6];
+      inptrs[3] = input_ptrs[7];
+      inptrs[4] = input_ptrs[2];
+      inptrs[5] = input_ptrs[8];
+      inptrs[6] = input_ptrs[3];
+      inptrs[7] = input_ptrs[4];
+      inptrs[8] = input_ptrs[11];
+      inptrs[9] = input_ptrs[12];
+      inptrs[10] = input_ptrs[9];
+      inptrs[11] = input_ptrs[10];
+      inptrs[12] = input_ptrs[5];
+      inptrs[13] = input_ptrs[13];
+      inptrs[14] = input_ptrs[14];
+      inptrs[15] = input_ptrs[15];
+      inptrs[16] = input_ptrs[16];
+      inptrs[17] = input_ptrs[17];
+      inptrs[18] = input_ptrs[18];
+      inptrs[19] = input_ptrs[19];
+      inptrs[20] = input_ptrs[20];
+      inptrs[21] = input_ptrs[21];
+      inptrs[22] = input_ptrs[22];
+      inptrs[23] = input_ptrs[23];
+      inptrs[24] = input_ptrs[24];
+      inptrs[25] = input_ptrs[25];
+      inptrs[26] = input_ptrs[26];
+      inptrs[27] = input_ptrs[27];
+      inptrs[28] = input_ptrs[28];
+      inptrs[29] = input_ptrs[29];
+      inptrs[30] = input_ptrs[30];
+      inptrs[31] = input_ptrs[31];
+      inptrs[32] = input_ptrs[32];
+      inptrs[33] = input_ptrs[33];
+      inptrs[34] = input_ptrs[34];
+      inptrs[35] = input_ptrs[35];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ptrue p3.b\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x6, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "mov x7, #0x0\n"
+    "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "cnth x8\n"
+    "ldp x17, x16, [x19, #0x0]\n"
+    "sub x15, XZR, x8\n"
+    "ldp x14, x13, [x19, #0x10]\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z16.h }, p3/Z, [x5]\n" // Load from weights and bias
+    "mov z31.d, z16.d\n"
+    "ld1h { z0.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias
+    "cmp x8, %x[n_channels]\n"
+    "mov z30.d, z16.d\n"
+    "ld1h { z1.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias
+    "mov z29.d, z16.d\n"
+    "ld1h { z2.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias
+    "mov z28.d, z16.d\n"
+    "ld1h { z3.h }, p3/Z, [x5, #4, MUL VL]\n" // Load from weights and bias
+    "ld1h { z4.h }, p3/Z, [x5, #5, MUL VL]\n" // Load from weights and bias
+    "addvl x5, x5, #6\n"
+    "ldp x12, x11, [x6, #0x0]\n"
+    "ldp x10, x9, [x6, #0x10]\n"
+    "ldp x20, x28, [x6, #0x20]\n"
+    "ld1h { z5.h }, p2/Z, [x12, x7, LSL #1]\n"
+    "ld1h { z6.h }, p2/Z, [x11, x7, LSL #1]\n"
+    "ld1h { z7.h }, p2/Z, [x10, x7, LSL #1]\n"
+    "ld1h { z8.h }, p2/Z, [x9, x7, LSL #1]\n"
+    "ld1h { z9.h }, p2/Z, [x20, x7, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x28, x7, LSL #1]\n"
+    "ldp x27, x19, [x6, #0x30]\n"
+    "ldp x26, x25, [x6, #0x40]\n"
+    "ld1h { z11.h }, p2/Z, [x27, x7, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x19, x7, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x26, x7, LSL #1]\n"
+    "ld1h { z14.h }, p2/Z, [x25, x7, LSL #1]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "fmla z31.h, p3/M, z0.h, z5.h\n"
+    "ldr x24, [x6, #0x50]\n"
+    "whilelt p1.h, x8, %x[n_channels]\n"
+    "fmla z30.h, p3/M, z0.h, z6.h\n"
+    "ldr x23, [x6, #0x58]\n"
+    "inch x15\n"
+    "fmla z29.h, p3/M, z0.h, z7.h\n"
+    "ldr x22, [x6, #0x60]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.h, p3/M, z0.h, z8.h\n"
+    "ld1h { z5.h }, p2/Z, [x24, x7, LSL #1]\n"
+    "ld1h { z0.h }, p3/Z, [x5]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x23, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z1.h, z9.h\n"
+    "ldr x21, [x6, #0x68]\n"
+    "fmla z29.h, p3/M, z1.h, z8.h\n"
+    "ldr x20, [x6, #0x70]\n"
+    "fmla z28.h, p3/M, z1.h, z13.h\n"
+    "ld1h { z1.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x22, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "ldr x19, [x6, #0x78]\n"
+    "fmla z29.h, p3/M, z2.h, z13.h\n"
+    "ldr x12, [x6, #0x80]\n"
+    "fmla z28.h, p3/M, z2.h, z5.h\n"
+    "ld1h { z2.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x21, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z3.h, z12.h\n"
+    "ldr x11, [x6, #0x88]\n"
+    "fmla z29.h, p3/M, z3.h, z5.h\n"
+    "ldr x10, [x6, #0x90]\n"
+    "fmla z28.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z3.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x20, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x19, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z6.h\n"
+    "ldr x9, [x6, #0x98]\n"
+    "fmla z28.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x5, #4, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z7.h\n"
+    "ldr x20, [x6, #0xa0]\n"
+    "fmla z30.h, p3/M, z0.h, z8.h\n"
+    "ldr x28, [x6, #0xa8]\n"
+    "fmla z29.h, p3/M, z0.h, z14.h\n"
+    "ldr x27, [x6, #0xb0]\n"
+    "fmla z28.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x5, #5, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x11, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z1.h, z13.h\n"
+    "ldr x19, [x6, #0xb8]\n"
+    "fmla z29.h, p3/M, z1.h, z11.h\n"
+    "ldr x26, [x6, #0xc0]\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z1.h }, p3/Z, [x5, #6, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x12, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z5.h\n"
+    "ldr x25, [x6, #0xc8]\n"
+    "fmla z29.h, p3/M, z2.h, z12.h\n"
+    "ldr x24, [x6, #0xd0]\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z2.h }, p3/Z, [x5, #7, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z3.h, z5.h\n"
+    "addvl x5, x5, #16\n"
+    "fmla z30.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z5.h }, p2/Z, [x10, x7, LSL #1]\n"
+    "ldr x23, [x6, #0xd8]\n"
+    "fmla z29.h, p3/M, z3.h, z9.h\n"
+    "ldr x22, [x6, #0xe0]\n"
+    "fmla z28.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z3.h }, p3/Z, [x5, #-8, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x9, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x20, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z13.h\n"
+    "ldr x21, [x6, #0xe8]\n"
+    "fmla z28.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z4.h }, p3/Z, [x5, #-7, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x19, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z0.h, z11.h\n"
+    "ldr x20, [x6, #0xf0]\n"
+    "fmla z29.h, p3/M, z0.h, z5.h\n"
+    "ldr x19, [x6, #0xf8]\n"
+    "fmla z28.h, p3/M, z0.h, z6.h\n"
+    "ld1h { z0.h }, p3/Z, [x5, #-6, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x28, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z1.h, z12.h\n"
+    "ldr x12, [x6, #0x100]\n"
+    "fmla z29.h, p3/M, z1.h, z6.h\n"
+    "ldr x11, [x6, #0x108]\n"
+    "fmla z28.h, p3/M, z1.h, z10.h\n"
+    "ld1h { z1.h }, p3/Z, [x5, #-5, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z9.h\n"
+    "ldr x10, [x6, #0x110]\n"
+    "fmla z29.h, p3/M, z2.h, z10.h\n"
+    "ldr x9, [x6, #0x118]\n"
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z2.h }, p3/Z, [x5, #-4, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z3.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x26, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z16.h }, p3/Z, [x5, #4, MUL VL]\n" // Load from weights and bias
+    "fmla z29.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z3.h }, p3/Z, [x5, #-3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x25, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x22, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z14.h\n"
+    "ld1h { z4.h }, p3/Z, [x5, #-2, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x24, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z0.h, z6.h\n"
+    "fmla z29.h, p3/M, z0.h, z9.h\n"
+    "fmla z28.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z0.h }, p3/Z, [x5, #-1, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x23, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z29.h, p3/M, z1.h, z13.h\n"
+    "fmla z28.h, p3/M, z1.h, z5.h\n"
+    "ld1h { z1.h }, p3/Z, [x5]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x21, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z29.h, p3/M, z2.h, z5.h\n"
+    "fmla z28.h, p3/M, z2.h, z6.h\n"
+    "ld1h { z2.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x20, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z3.h, z12.h\n"
+    "fmla z29.h, p3/M, z3.h, z6.h\n"
+    "fmla z28.h, p3/M, z3.h, z8.h\n"
+    "ld1h { z3.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x19, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z14.h\n"
+    "fmla z29.h, p3/M, z4.h, z8.h\n"
+    "fmla z28.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x12, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z0.h, z13.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n"
+    "ldp x12, x11, [x6, #0x0]\n"
+    "fmla z28.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z0.h }, p3/Z, [x5, #5, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z13.h\n"
+    "fmla z30.h, p3/M, z1.h, z5.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x10, x7, LSL #1]\n"
+    "fmla z28.h, p3/M, z1.h, z9.h\n"
+    "ld1h { z1.h }, p3/Z, [x5, #6, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z5.h\n"
+    "ld1h { z5.h }, p1/Z, [x12, x8, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z6.h\n"
+    "fmla z29.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x9, x7, LSL #1]\n"
+    "inch x7\n"
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "ldp x10, x9, [x6, #0x10]\n"
+    "whilelt p2.h, x7, %x[n_channels]\n"
+    "fmla z31.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z6.h }, p1/Z, [x11, x8, LSL #1]\n"
+    "ldp x20, x28, [x6, #0x20]\n"
+    "fmla z30.h, p3/M, z3.h, z8.h\n"
+    "ldp x27, x19, [x6, #0x30]\n"
+    "fmla z29.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z7.h }, p1/Z, [x10, x8, LSL #1]\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z13.h }, p1/Z, [x28, x8, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z8.h }, p1/Z, [x9, x8, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z11.h }, p1/Z, [x27, x8, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x19, x8, LSL #1]\n"
+    "fmla z28.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z9.h }, p1/Z, [x20, x8, LSL #1]\n"
+    "fmax z31.h, p3/M, z31.h, z18.h\n"
+    "ldp x26, x25, [x6, #0x40]\n"
+    "fmax z30.h, p3/M, z30.h, z18.h\n"
+    "ld1h { z2.h }, p3/Z, [x5, #7, MUL VL]\n" // Load from weights and bias
+    "fmax z29.h, p3/M, z29.h, z18.h\n"
+    "addvl x5, x5, #16\n"
+    "fmax z28.h, p3/M, z28.h, z18.h\n"
+    "ld1h { z10.h }, p1/Z, [x26, x8, LSL #1]\n"
+    "ld1h { z14.h }, p1/Z, [x25, x8, LSL #1]\n"
+    "fmin z31.h, p3/M, z31.h, z17.h\n"
+    "inch x8\n"
+    "fmin z30.h, p3/M, z30.h, z17.h\n"
+    "ld1h { z3.h }, p3/Z, [x5, #-8, MUL VL]\n" // Load from weights and bias
+    "cmp x8, %x[n_channels]\n"
+    "fmin z29.h, p3/M, z29.h, z17.h\n"
+    "ld1h { z4.h }, p3/Z, [x5, #-7, MUL VL]\n" // Load from weights and bias
+    "addvl x5, x5, #-6\n"
+    "fmin z28.h, p3/M, z28.h, z17.h\n"
+    "st1h { z31.h }, p0, [x17, x15, LSL #1]\n"
+    "mov z31.d, z16.d\n"
+    "st1h { z30.h }, p0, [x16, x15, LSL #1]\n"
+    "mov z30.d, z16.d\n"
+    "st1h { z29.h }, p0, [x14, x15, LSL #1]\n"
+    "mov z29.d, z16.d\n"
+    "st1h { z28.h }, p0, [x13, x15, LSL #1]\n"
+    "mov z28.d, z16.d\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "fmla z31.h, p3/M, z0.h, z5.h\n"
+    "ldr x24, [x6, #0x50]\n"
+    "inch x15\n"
+    "fmla z30.h, p3/M, z0.h, z6.h\n"
+    "ldr x23, [x6, #0x58]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z29.h, p3/M, z0.h, z7.h\n"
+    "ldr x22, [x6, #0x60]\n"
+    "fmla z28.h, p3/M, z0.h, z8.h\n"
+    "ld1h { z5.h }, p2/Z, [x24, x7, LSL #1]\n"
+    "ld1h { z0.h }, p3/Z, [x5]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x23, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z1.h, z9.h\n"
+    "ldr x21, [x6, #0x68]\n"
+    "fmla z29.h, p3/M, z1.h, z8.h\n"
+    "fmla z28.h, p3/M, z1.h, z13.h\n"
+    "ld1h { z1.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias
+    "ldr x20, [x6, #0x70]\n"
+    "fmla z31.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x22, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "ldr x19, [x6, #0x78]\n"
+    "fmla z29.h, p3/M, z2.h, z13.h\n"
+    "fmla z28.h, p3/M, z2.h, z5.h\n"
+    "ld1h { z2.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias
+    "ldr x12, [x6, #0x80]\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x21, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z3.h, z12.h\n"
+    "ldr x11, [x6, #0x88]\n"
+    "fmla z29.h, p3/M, z3.h, z5.h\n"
+    "fmla z28.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z3.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias
+    "ldr x10, [x6, #0x90]\n"
+    "fmla z31.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x20, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x19, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z6.h\n"
+    "fmla z28.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x5, #4, MUL VL]\n" // Load from weights and bias
+    "ldr x9, [x6, #0x98]\n"
+    "fmla z31.h, p3/M, z0.h, z7.h\n"
+    "ldr x20, [x6, #0xa0]\n"
+    "fmla z30.h, p3/M, z0.h, z8.h\n"
+    "ldr x28, [x6, #0xa8]\n"
+    "fmla z29.h, p3/M, z0.h, z14.h\n"
+    "fmla z28.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x5, #5, MUL VL]\n" // Load from weights and bias
+    "ldr x27, [x6, #0xb0]\n"
+    "fmla z31.h, p3/M, z1.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x11, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z1.h, z13.h\n"
+    "ldr x19, [x6, #0xb8]\n"
+    "fmla z29.h, p3/M, z1.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z1.h }, p3/Z, [x5, #6, MUL VL]\n" // Load from weights and bias
+    "ldr x26, [x6, #0xc0]\n"
+    "fmla z31.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x12, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z5.h\n"
+    "ldr x25, [x6, #0xc8]\n"
+    "fmla z29.h, p3/M, z2.h, z12.h\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z2.h }, p3/Z, [x5, #7, MUL VL]\n" // Load from weights and bias
+    "addvl x5, x5, #16\n"
+    "fmla z31.h, p3/M, z3.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x10, x7, LSL #1]\n"
+    "ldr x24, [x6, #0xd0]\n"
+    "fmla z30.h, p3/M, z3.h, z6.h\n"
+    "ldr x23, [x6, #0xd8]\n"
+    "fmla z29.h, p3/M, z3.h, z9.h\n"
+    "fmla z28.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z3.h }, p3/Z, [x5, #-8, MUL VL]\n" // Load from weights and bias
+    "ldr x22, [x6, #0xe0]\n"
+    "fmla z31.h, p3/M, z4.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x9, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x20, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z13.h\n"
+    "fmla z28.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z4.h }, p3/Z, [x5, #-7, MUL VL]\n" // Load from weights and bias
+    "ldr x21, [x6, #0xe8]\n"
+    "fmla z31.h, p3/M, z0.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x19, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z0.h, z11.h\n"
+    "ldr x20, [x6, #0xf0]\n"
+    "fmla z29.h, p3/M, z0.h, z5.h\n"
+    "fmla z28.h, p3/M, z0.h, z6.h\n"
+    "ld1h { z0.h }, p3/Z, [x5, #-6, MUL VL]\n" // Load from weights and bias
+    "ldr x19, [x6, #0xf8]\n"
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x28, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z1.h, z12.h\n"
+    "ldr x12, [x6, #0x100]\n"
+    "fmla z29.h, p3/M, z1.h, z6.h\n"
+    "fmla z28.h, p3/M, z1.h, z10.h\n"
+    "ld1h { z1.h }, p3/Z, [x5, #-5, MUL VL]\n" // Load from weights and bias
+    "ldr x11, [x6, #0x108]\n"
+    "fmla z31.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z9.h\n"
+    "ldr x10, [x6, #0x110]\n"
+    "fmla z29.h, p3/M, z2.h, z10.h\n"
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z2.h }, p3/Z, [x5, #-4, MUL VL]\n" // Load from weights and bias
+    "ldr x9, [x6, #0x118]\n"
+    "fmla z31.h, p3/M, z3.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x26, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z3.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z3.h }, p3/Z, [x5, #-3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x25, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x22, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z14.h\n"
+    "ld1h { z4.h }, p3/Z, [x5, #-2, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x24, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z0.h, z6.h\n"
+    "fmla z29.h, p3/M, z0.h, z9.h\n"
+    "fmla z28.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z0.h }, p3/Z, [x5, #-1, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x23, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z29.h, p3/M, z1.h, z13.h\n"
+    "fmla z28.h, p3/M, z1.h, z5.h\n"
+    "ld1h { z1.h }, p3/Z, [x5]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x21, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z29.h, p3/M, z2.h, z5.h\n"
+    "fmla z28.h, p3/M, z2.h, z6.h\n"
+    "ld1h { z2.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x20, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z3.h, z12.h\n"
+    "fmla z29.h, p3/M, z3.h, z6.h\n"
+    "fmla z28.h, p3/M, z3.h, z8.h\n"
+    "ld1h { z3.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x19, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z14.h\n"
+    "fmla z29.h, p3/M, z4.h, z8.h\n"
+    "fmla z28.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias
+    "fmla z31.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x12, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z0.h, z13.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n"
+    "fmla z28.h, p3/M, z0.h, z12.h\n"
+    "fmla z31.h, p3/M, z1.h, z13.h\n"
+    "fmla z30.h, p3/M, z1.h, z5.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x10, x7, LSL #1]\n"
+    "fmla z28.h, p3/M, z1.h, z9.h\n"
+    "fmla z31.h, p3/M, z2.h, z5.h\n"
+    "fmla z30.h, p3/M, z2.h, z6.h\n"
+    "fmla z29.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x9, x7, LSL #1]\n"
+    "fmla z28.h, p3/M, z2.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z6.h\n"
+    "fmla z30.h, p3/M, z3.h, z8.h\n"
+    "fmla z29.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z12.h\n"
+    "fmla z31.h, p3/M, z4.h, z8.h\n"
+    "fmla z30.h, p3/M, z4.h, z10.h\n"
+    "fmla z29.h, p3/M, z4.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z9.h\n"
+    "fmax z31.h, p3/M, z31.h, z18.h\n"
+    "fmax z30.h, p3/M, z30.h, z18.h\n"
+    "fmax z29.h, p3/M, z29.h, z18.h\n"
+    "fmax z28.h, p3/M, z28.h, z18.h\n"
+    "fmin z31.h, p3/M, z31.h, z17.h\n"
+    "st1h { z31.h }, p0, [x17, x15, LSL #1]\n"
+    "fmin z30.h, p3/M, z30.h, z17.h\n"
+    "fmin z29.h, p3/M, z29.h, z17.h\n"
+    "st1h { z30.h }, p0, [x16, x15, LSL #1]\n"
+    "fmin z28.h, p3/M, z28.h, z17.h\n"
+    "st1h { z29.h }, p0, [x14, x15, LSL #1]\n"
+    "st1h { z28.h }, p0, [x13, x15, LSL #1]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
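
Unlike the direct variant, the indirect kernel above receives a flat array of
36 (= 6x6) per-point input pointers rather than row/column strides, so a
caller can point any out-of-range window position at a shared zero buffer
instead of relying on implicit padding. Caller-side setup would look roughly
like this (a sketch; all names hypothetical):

    // Build the 36-entry pointer array for one output tile at (base_i, base_j).
    const __fp16 *input_ptrs[36];
    for (int i = 0; i < 6; i++)
      for (int j = 0; j < 6; j++)
      {
        const int ii = base_i + i, jj = base_j + j;  // position in the input
        input_ptrs[i * 6 + j] =
          (ii < 0 || ii >= input_rows || jj < 0 || jj >= input_cols)
            ? zero_buffer  // padded point reads zeros
            : input + ii * ld_input_row + jj * ld_input_col;
      }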
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..74716dd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  indirect_kern_type indirect_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+  sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
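
The hand-written loops in these kernels share one shape: a bias-seeded
accumulator per output point, one multiply-accumulate per kernel tap, then an
fmax/fmin clamp against the activation bounds. The same logic, written with
SVE ACLE intrinsics instead of assembly (a simplified sketch — tap-major
weight layout for clarity, whereas the generated kernels interleave bias and
weights in the packed parameter buffer; the function name is hypothetical):

    #include <arm_sve.h>

    void accumulate_one_output_point(const float *bias, const float *const *win,
                                     const float *weights, float *out,
                                     unsigned int n_channels,
                                     float act_min, float act_max)
    {
      // One SVE vector of channels per step; whilelt yields a partial
      // predicate for the tail, so no scalar epilogue is needed.
      for (unsigned int c = 0; c < n_channels; c += svcntw())
      {
        svbool_t pg = svwhilelt_b32(c, n_channels);
        svfloat32_t acc = svld1_f32(pg, bias + c);        // seed with bias
        for (unsigned int k = 0; k < 9; k++)              // 3x3 kernel taps
        {
          svfloat32_t x = svld1_f32(pg, win[k] + c);      // input at tap k
          svfloat32_t w = svld1_f32(pg, weights + k * n_channels + c);
          acc = svmla_f32_m(pg, acc, x, w);               // acc += x * w
        }
        acc = svmax_n_f32_m(pg, acc, act_min);            // clamp to the
        acc = svmin_n_f32_m(pg, acc, act_max);            // activation range
        svst1_f32(pg, out + c, acc);
      }
    }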
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..d443855
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  const float *const inptrs[16] = {
+    input_ptrs[0], input_ptrs[1], input_ptrs[4], input_ptrs[5],
+    input_ptrs[2], input_ptrs[6], input_ptrs[3], input_ptrs[7],
+    input_ptrs[8], input_ptrs[9], input_ptrs[10], input_ptrs[11],
+    input_ptrs[12], input_ptrs[13], input_ptrs[14], input_ptrs[15],
+  };
+  const float minmax_vals[2] = { activation_min, activation_max };
+
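+  // Four accumulators (z14, z12, z10, z8), one per point of the 2x2 output
+  // tile, are seeded from the bias vector z15 and re-seeded after each store,
+  // so the channel loop carries no state across iterations.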
+  __asm__ __volatile__(
+    "ldp x26, x23, [%x[inptrs], #0x0]\n"
+    "ptrue p2.b\n"
+    "ldp x25, x16, [%x[inptrs], #0x10]\n"
+    "mov x15, #0x0\n"
+    "ld1w { z15.s }, p2/Z, [%x[params]]\n"
+    "mov z14.d, z15.d\n"
+    "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "cntw x14\n"
+    "mov z12.d, z15.d\n"
+    "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "sub x13, XZR, x14\n"
+    "mov z10.d, z15.d\n"
+    "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "whilelt p1.s, XZR, %x[n_channels]\n"
+    "mov z8.d, z15.d\n"
+    "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "cmp x14, %x[n_channels]\n"
+    "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "ld1w { z3.s }, p1/Z, [x26, x15, LSL #2]\n"
+    "ld1w { z2.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+    "addvl %x[params], %x[params], #-6\n"
+    "ld1w { z0.s }, p1/Z, [x23, x15, LSL #2]\n"
+    "ld1w { z31.s }, p1/Z, [x25, x15, LSL #2]\n"
+    "ld1w { z30.s }, p1/Z, [x16, x15, LSL #2]\n"
+    "ldp x24, x12, [%x[inptrs], #0x20]\n"
+    "ldp x23, x11, [%x[inptrs], #0x30]\n"
+    "ldp x10, x9, [%x[inptrs], #0x40]\n"
+    "ld1w { z29.s }, p1/Z, [x24, x15, LSL #2]\n"
+    "ld1w { z28.s }, p1/Z, [x12, x15, LSL #2]\n"
+    "ld1w { z27.s }, p1/Z, [x23, x15, LSL #2]\n"
+    "ld1w { z26.s }, p1/Z, [x11, x15, LSL #2]\n"
+    "ld1w { z25.s }, p1/Z, [x10, x15, LSL #2]\n"
+    "ld1w { z24.s }, p1/Z, [x9, x15, LSL #2]\n"
+    "ldp x28, x27, [%x[inptrs], #0x50]\n"
+    "ldp x26, x25, [%x[inptrs], #0x60]\n"
+    "ldp x24, x23, [%x[inptrs], #0x70]\n"
+    "ld1w { z23.s }, p1/Z, [x28, x15, LSL #2]\n"
+    "ld1w { z22.s }, p1/Z, [x27, x15, LSL #2]\n"
+    "ld1w { z21.s }, p1/Z, [x26, x15, LSL #2]\n"
+    "ld1w { z20.s }, p1/Z, [x25, x15, LSL #2]\n"
+    "ld1w { z19.s }, p1/Z, [x24, x15, LSL #2]\n"
+    "ld1w { z18.s }, p1/Z, [x23, x15, LSL #2]\n"
+    "ldp x22, x21, [%x[outptrs], #0x0]\n"
+    "ldp x20, x19, [%x[outptrs], #0x10]\n"
+    "ld1rw { z17.s }, p2/Z, [%x[minmax_vals]]\n"
+    "ld1rw { z16.s }, p2/Z, [%x[minmax_vals], #4]\n"
+    "bge 1f\n"
+    "1:"  // Loop
+    "fmla z14.s, p2/M, z13.s, z3.s\n"
+    "ld1w { z15.s }, p2/Z, [%x[params]]\n"
+    "incw x13\n"
+    "fmla z12.s, p2/M, z13.s, z0.s\n"
+    "ldp x26, x23, [%x[inptrs], #0x0]\n"
+    "mov p0.b, p1.b\n"
+    "fmla z10.s, p2/M, z13.s, z31.s\n"
+    "ldp x25, x16, [%x[inptrs], #0x10]\n"
+    "mov x15, x14\n"
+    "fmla z8.s, p2/M, z13.s, z30.s\n"
+    "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "incw x14\n"
+    "fmla z14.s, p2/M, z11.s, z0.s\n"
+    "ldp x24, x12, [%x[inptrs], #0x20]\n"
+    "whilelt p1.s, x15, %x[n_channels]\n"
+    "fmla z12.s, p2/M, z11.s, z29.s\n"
+    "ld1w { z3.s }, p1/Z, [x26, x15, LSL #2]\n"
+    "cmp x14, %x[n_channels]\n"
+    "fmla z10.s, p2/M, z11.s, z30.s\n"
+    "ld1w { z0.s }, p1/Z, [x23, x15, LSL #2]\n"
+    "ldp x23, x11, [%x[inptrs], #0x30]\n"
+    "fmla z8.s, p2/M, z11.s, z28.s\n"
+    "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "fmla z14.s, p2/M, z9.s, z29.s\n"
+    "ld1w { z29.s }, p1/Z, [x24, x15, LSL #2]\n"
+    "fmla z12.s, p2/M, z9.s, z27.s\n"
+    "ld1w { z27.s }, p1/Z, [x23, x15, LSL #2]\n"
+    "fmla z10.s, p2/M, z9.s, z28.s\n"
+    "ldp x10, x9, [%x[inptrs], #0x40]\n"
+    "fmla z8.s, p2/M, z9.s, z26.s\n"
+    "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "fmla z14.s, p2/M, z7.s, z31.s\n"
+    "ld1w { z31.s }, p1/Z, [x25, x15, LSL #2]\n"
+    "fmla z12.s, p2/M, z7.s, z30.s\n"
+    "ldp x28, x27, [%x[inptrs], #0x50]\n"
+    "fmla z10.s, p2/M, z7.s, z25.s\n"
+    "ldp x26, x25, [%x[inptrs], #0x60]\n"
+    "fmla z8.s, p2/M, z7.s, z24.s\n"
+    "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "fmla z14.s, p2/M, z6.s, z30.s\n"
+    "ld1w { z30.s }, p1/Z, [x16, x15, LSL #2]\n"
+    "fmla z12.s, p2/M, z6.s, z28.s\n"
+    "ldp x24, x23, [%x[inptrs], #0x70]\n"
+    "fmla z10.s, p2/M, z6.s, z24.s\n"
+    "fmla z8.s, p2/M, z6.s, z23.s\n"
+    "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "fmla z14.s, p2/M, z5.s, z28.s\n"
+    "ld1w { z28.s }, p1/Z, [x12, x15, LSL #2]\n"
+    "fmla z12.s, p2/M, z5.s, z26.s\n"
+    "ld1w { z26.s }, p1/Z, [x11, x15, LSL #2]\n"
+    "fmla z10.s, p2/M, z5.s, z23.s\n"
+    "fmla z8.s, p2/M, z5.s, z22.s\n"
+    "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "fmla z14.s, p2/M, z4.s, z25.s\n"
+    "ld1w { z25.s }, p1/Z, [x10, x15, LSL #2]\n"
+    "fmla z12.s, p2/M, z4.s, z24.s\n"
+    "fmla z10.s, p2/M, z4.s, z21.s\n"
+    "ld1w { z21.s }, p1/Z, [x26, x15, LSL #2]\n"
+    "fmla z8.s, p2/M, z4.s, z20.s\n"
+    "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "fmla z14.s, p2/M, z2.s, z24.s\n"
+    "ld1w { z24.s }, p1/Z, [x9, x15, LSL #2]\n"
+    "fmla z12.s, p2/M, z2.s, z23.s\n"
+    "fmla z10.s, p2/M, z2.s, z20.s\n"
+    "ld1w { z20.s }, p1/Z, [x25, x15, LSL #2]\n"
+    "fmla z8.s, p2/M, z2.s, z19.s\n"
+    "ld1w { z2.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+    "fmla z14.s, p2/M, z1.s, z23.s\n"
+    "ld1w { z23.s }, p1/Z, [x28, x15, LSL #2]\n"
+    "fmla z12.s, p2/M, z1.s, z22.s\n"
+    "ld1w { z22.s }, p1/Z, [x27, x15, LSL #2]\n"
+    "fmla z10.s, p2/M, z1.s, z19.s\n"
+    "ld1w { z19.s }, p1/Z, [x24, x15, LSL #2]\n"
+    "fmla z8.s, p2/M, z1.s, z18.s\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+    "addvl %x[params], %x[params], #-6\n"
+    "fmax z14.s, p2/M, z14.s, z17.s\n"
+    "ld1w { z18.s }, p1/Z, [x23, x15, LSL #2]\n"
+    "fmax z12.s, p2/M, z12.s, z17.s\n"
+    "fmax z10.s, p2/M, z10.s, z17.s\n"
+    "fmax z8.s, p2/M, z8.s, z17.s\n"
+    "fmin z14.s, p2/M, z14.s, z16.s\n"
+    "st1w { z14.s }, p0, [x22, x13, LSL #2]\n"
+    "mov z14.d, z15.d\n"
+    "fmin z12.s, p2/M, z12.s, z16.s\n"
+    "st1w { z12.s }, p0, [x21, x13, LSL #2]\n"
+    "mov z12.d, z15.d\n"
+    "fmin z10.s, p2/M, z10.s, z16.s\n"
+    "st1w { z10.s }, p0, [x20, x13, LSL #2]\n"
+    "mov z10.d, z15.d\n"
+    "fmin z8.s, p2/M, z8.s, z16.s\n"
+    "st1w { z8.s }, p0, [x19, x13, LSL #2]\n"
+    "mov z8.d, z15.d\n"
+    "blt 1b\n"
+    "2:"  // Tail
+    "fmla z14.s, p2/M, z13.s, z3.s\n"
+    "incw x13\n"
+    "fmla z12.s, p2/M, z13.s, z0.s\n"
+    "mov p0.b, p1.b\n"
+    "fmla z10.s, p2/M, z13.s, z31.s\n"
+    "fmla z8.s, p2/M, z13.s, z30.s\n"
+    "fmla z14.s, p2/M, z11.s, z0.s\n"
+    "fmla z12.s, p2/M, z11.s, z29.s\n"
+    "fmla z10.s, p2/M, z11.s, z30.s\n"
+    "fmla z8.s, p2/M, z11.s, z28.s\n"
+    "fmla z14.s, p2/M, z9.s, z29.s\n"
+    "fmla z12.s, p2/M, z9.s, z27.s\n"
+    "fmla z10.s, p2/M, z9.s, z28.s\n"
+    "fmla z8.s, p2/M, z9.s, z26.s\n"
+    "fmla z14.s, p2/M, z7.s, z31.s\n"
+    "fmla z12.s, p2/M, z7.s, z30.s\n"
+    "fmla z10.s, p2/M, z7.s, z25.s\n"
+    "fmla z8.s, p2/M, z7.s, z24.s\n"
+    "fmla z14.s, p2/M, z6.s, z30.s\n"
+    "fmla z12.s, p2/M, z6.s, z28.s\n"
+    "fmla z10.s, p2/M, z6.s, z24.s\n"
+    "fmla z8.s, p2/M, z6.s, z23.s\n"
+    "fmla z14.s, p2/M, z5.s, z28.s\n"
+    "fmla z12.s, p2/M, z5.s, z26.s\n"
+    "fmla z10.s, p2/M, z5.s, z23.s\n"
+    "fmla z8.s, p2/M, z5.s, z22.s\n"
+    "fmla z14.s, p2/M, z4.s, z25.s\n"
+    "fmla z12.s, p2/M, z4.s, z24.s\n"
+    "fmla z10.s, p2/M, z4.s, z21.s\n"
+    "fmla z8.s, p2/M, z4.s, z20.s\n"
+    "fmla z14.s, p2/M, z2.s, z24.s\n"
+    "fmla z12.s, p2/M, z2.s, z23.s\n"
+    "fmla z10.s, p2/M, z2.s, z20.s\n"
+    "fmla z8.s, p2/M, z2.s, z19.s\n"
+    "fmla z14.s, p2/M, z1.s, z23.s\n"
+    "fmla z12.s, p2/M, z1.s, z22.s\n"
+    "fmla z10.s, p2/M, z1.s, z19.s\n"
+    "fmla z8.s, p2/M, z1.s, z18.s\n"
+    "fmax z14.s, p2/M, z14.s, z17.s\n"
+    "fmax z12.s, p2/M, z12.s, z17.s\n"
+    "fmax z10.s, p2/M, z10.s, z17.s\n"
+    "fmax z8.s, p2/M, z8.s, z17.s\n"
+    "fmin z14.s, p2/M, z14.s, z16.s\n"
+    "st1w { z14.s }, p0, [x22, x13, LSL #2]\n"
+    "fmin z12.s, p2/M, z12.s, z16.s\n"
+    "fmin z10.s, p2/M, z10.s, z16.s\n"
+    "st1w { z12.s }, p0, [x21, x13, LSL #2]\n"
+    "fmin z8.s, p2/M, z8.s, z16.s\n"
+    "st1w { z10.s }, p0, [x20, x13, LSL #2]\n"
+    "st1w { z8.s }, p0, [x19, x13, LSL #2]\n"
+    : [params] "+r" (params)
+    : [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((unsigned long) n_channels), [outptrs] "r" (outptrs)
+    : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
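
The kernel above follows the standard SVE predicated-loop shape: a `whilelt` predicate covers the channels still to be processed, the main loop ("Loop") handles one full vector of channels per iteration while preloading the next inputs, and the "Tail" block finishes the final, possibly partial, vector; `bge 2f` skips straight to the tail when a single vector suffices. A minimal sketch of that structure in ACLE SVE intrinsics, assuming a planar layout of nine per-channel weight vectors plus a bias (names here are illustrative, not the library's API):

    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>

    // One output point of a 3x3 depthwise pass, vectorised over channels
    // with whilelt predication, mirroring the loop/tail logic above.
    void dwconv_point(const float *const in[9], const float *w /* [9][n] */,
                      const float *bias, float *out, uint64_t n,
                      float act_min, float act_max)
    {
      for (uint64_t c = 0; c < n; c += svcntw())
      {
        svbool_t p = svwhilelt_b32_u64(c, n);          // active lanes this step
        svfloat32_t acc = svld1_f32(p, bias + c);      // start from the bias
        for (int k = 0; k < 9; k++)                    // nine kernel taps
        {
          svfloat32_t x  = svld1_f32(p, in[k] + c);
          svfloat32_t wk = svld1_f32(p, w + k * n + c);
          acc = svmla_f32_m(p, acc, wk, x);            // acc += w * x (the fmla's)
        }
        acc = svmax_f32_m(p, acc, svdup_n_f32(act_min)); // activation clamp
        acc = svmin_f32_m(p, acc, svdup_n_f32(act_max)); // (fmax/fmin above)
        svst1_f32(p, out + c, acc);
      }
    }
    #endif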
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..d899255
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x8, #0x0\n"
+    "mov x17, #0x0\n"
+    "1:"  // Tile loop
+    "str x8, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x21, #0x2\n"
+    "str x17, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "cntb x16\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x16, x16, XZR, LSL #4\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "cntb x14\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "cntb x12\n"
+    "ldr x11, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x8, x20\n" // offset = tile_i * ld_input_row
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x17, x13, x19\n" // offset += tile_j * ld_input_col
+    "ldr x10, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x21\n" // offset *= kernel_stride * output_size
+    "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x11, x11, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "add x28, x11, x20, LSL #2\n"
+    "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x27, x28, x20, LSL #2\n"
+    "ld1w { z16.s }, p3/Z, [x15]\n"
+    "mov z31.d, z16.d\n"
+    "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
+    "add x26, x27, x20, LSL #2\n"
+    "mov z30.d, z16.d\n"
+    "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
+    "add x25, x13, x13\n"
+    "mov z29.d, z16.d\n"
+    "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
+    "add x24, x25, x13\n"
+    "mov z28.d, z16.d\n"
+    "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
+    "add x14, x14, x13, LSL #4\n"
+    "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
+    "add x12, x12, x25, LSL #4\n"
+    "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n"
+    "cntb x23\n"
+    "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n"
+    "add x23, x23, x24, LSL #4\n"
+    "prfm pldl1keep, [x28, x14]\n"
+    "mov x20, #0x2\n"
+    "prfm pldl1keep, [x11, x16]\n"
+    "mul x19, x8, x22\n" // offset = tile_i * ld_output_row
+    "prfm pldl1keep, [x11, x23]\n"
+    "madd x19, x17, x10, x19\n" // offset += tile_j * ld_output_col
+    "prfm pldl1keep, [x28, x12]\n"
+    "mul x19, x19, x20\n" // offset *= output_tile_size
+    "prfm pldl1keep, [x27, x14]\n"
+    "add x9, x9, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "mov x21, #0x0\n"
+    "add x22, x9, x22, LSL #2\n"
+    "cntw x20\n"
+    "sub x19, XZR, x20\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "ld1w { z9.s }, p2/Z, [x28, x13, LSL #2]\n"
+    "ld1w { z10.s }, p2/Z, [x11]\n"
+    "addvl x15, x15, #16\n"
+    "ld1w { z11.s }, p2/Z, [x11, x24, LSL #2]\n"
+    "cmp x20, %x[n_channels]\n"
+    "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n"
+    "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n"
+    "addvl x15, x15, #-6\n"
+    "ld1w { z12.s }, p2/Z, [x28, x25, LSL #2]\n"
+    "ld1w { z13.s }, p2/Z, [x27, x13, LSL #2]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "fmla z31.s, p3/M, z4.s, z9.s\n"
+    "prfm pldl1keep, [x26, x16]\n"
+    "whilelt p1.s, x20, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x26, x23]\n"
+    "incw x19\n"
+    "fmla z29.s, p3/M, z1.s, z9.s\n"
+    "prfm pldl1keep, [x11, x14]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x26]\n"
+    "incw x21\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "prfm pldl1keep, [x11, x12]\n"
+    "incw x20\n"
+    "fmla z30.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x26, x24, LSL #2]\n"
+    "ld1w { z10.s }, p2/Z, [x27, x25, LSL #2]\n"
+    "fmla z29.s, p3/M, z2.s, z12.s\n"
+    "prfm pldl1keep, [x27, x12]\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x28, x16]\n"
+    "fmla z31.s, p3/M, z5.s, z12.s\n"
+    "prfm pldl1keep, [x28, x23]\n"
+    "fmla z30.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x11, x25, LSL #2]\n"
+    "fmla z28.s, p3/M, z3.s, z13.s\n"
+    "addvl x11, x11, #1\n"
+    "fmla z31.s, p3/M, z7.s, z13.s\n"
+    "prfm pldl1keep, [x27, x16]\n"
+    "prfm pldl1keep, [x27, x23]\n"
+    "fmla z30.s, p3/M, z6.s, z13.s\n"
+    "prfm pldl1keep, [x26, x14]\n"
+    "fmla z29.s, p3/M, z4.s, z13.s\n"
+    "prfm pldl1keep, [x26, x12]\n"
+    "fmla z28.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x28]\n"
+    "fmla z31.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z16.s }, p3/Z, [x15]\n"
+    "fmla z30.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x28, x24, LSL #2]\n"
+    "fmla z29.s, p3/M, z5.s, z10.s\n"
+    "addvl x28, x28, #1\n"
+    "fmla z28.s, p3/M, z4.s, z10.s\n"
+    "prfm pldl1keep, [x28, x14]\n"
+    "prfm pldl1keep, [x11, x16]\n"
+    "fmla z31.s, p3/M, z2.s, z9.s\n"
+    "prfm pldl1keep, [x11, x23]\n"
+    "fmla z30.s, p3/M, z1.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x27]\n"
+    "fmla z29.s, p3/M, z0.s, z11.s\n"
+    "prfm pldl1keep, [x28, x12]\n"
+    "fmla z28.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n"
+    "fmla z31.s, p3/M, z8.s, z10.s\n"
+    "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n"
+    "fmla z30.s, p3/M, z7.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x27, x24, LSL #2]\n"
+    "fmla z29.s, p3/M, z3.s, z9.s\n"
+    "addvl x27, x27, #1\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z5.s, z10.s\n"
+    "ld1w { z13.s }, p1/Z, [x27, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z5.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x26, x25, LSL #2]\n"
+    "whilelt p2.s, x21, %x[n_channels]\n"
+    "fmla z29.s, p3/M, z7.s, z11.s\n"
+    "prfm pldl1keep, [x27, x14]\n"
+    "addvl x26, x26, #1\n"
+    "fmla z31.s, p3/M, z6.s, z9.s\n"
+    "ld1w { z9.s }, p1/Z, [x28, x13, LSL #2]\n"
+    "cmp x20, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z8.s, z10.s\n"
+    "ld1w { z10.s }, p1/Z, [x11]\n"
+    "fmla z28.s, p3/M, z6.s, z11.s\n"
+    "ld1w { z11.s }, p1/Z, [x11, x24, LSL #2]\n"
+    "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n"
+    "fmla z29.s, p3/M, z8.s, z12.s\n"
+    "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n"
+    "fmax z31.s, p3/M, z31.s, z18.s\n"
+    "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n"
+    "fmla z28.s, p3/M, z7.s, z12.s\n"
+    "fmax z30.s, p3/M, z30.s, z18.s\n"
+    "ld1w { z12.s }, p1/Z, [x28, x25, LSL #2]\n"
+    "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n"
+    "fmax z29.s, p3/M, z29.s, z18.s\n"
+    "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n"
+    "fmin z31.s, p3/M, z31.s, z17.s\n"
+    "addvl x15, x15, #16\n"
+    "fmin z30.s, p3/M, z30.s, z17.s\n"
+    "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n"
+    "fmax z28.s, p3/M, z28.s, z18.s\n"
+    "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n"
+    "addvl x15, x15, #-6\n"
+    "fmin z29.s, p3/M, z29.s, z17.s\n"
+    "st1w { z31.s }, p0, [x9]\n"
+    "mov z31.d, z16.d\n"
+    "fmin z28.s, p3/M, z28.s, z17.s\n"
+    "st1w { z30.s }, p0, [x9, x10, LSL #2]\n"
+    "addvl x9, x9, #1\n"
+    "mov z30.d, z16.d\n"
+    "st1w { z29.s }, p0, [x22]\n"
+    "mov z29.d, z16.d\n"
+    "st1w { z28.s }, p0, [x22, x10, LSL #2]\n"
+    "mov z28.d, z16.d\n"
+    "addvl x22, x22, #1\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "fmla z31.s, p3/M, z4.s, z9.s\n"
+    "prfm pldl1keep, [x26, x16]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z30.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x26, x23]\n"
+    "fmla z29.s, p3/M, z1.s, z9.s\n"
+    "prfm pldl1keep, [x11, x14]\n"
+    "fmla z28.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x26]\n"
+    "prfm pldl1keep, [x11, x12]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x27, x25, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z11.s\n"
+    "fmla z29.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z11.s }, p2/Z, [x26, x24, LSL #2]\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x27, x12]\n"
+    "prfm pldl1keep, [x28, x16]\n"
+    "fmla z31.s, p3/M, z5.s, z12.s\n"
+    "prfm pldl1keep, [x28, x23]\n"
+    "fmla z30.s, p3/M, z4.s, z12.s\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "ld1w { z12.s }, p2/Z, [x11, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z9.s }, p2/Z, [x11, x25, LSL #2]\n"
+    "prfm pldl1keep, [x27, x16]\n"
+    "fmla z31.s, p3/M, z7.s, z13.s\n"
+    "prfm pldl1keep, [x27, x23]\n"
+    "fmla z30.s, p3/M, z6.s, z13.s\n"
+    "fmla z29.s, p3/M, z4.s, z13.s\n"
+    "prfm pldl1keep, [x26, x14]\n"
+    "fmla z28.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x28]\n"
+    "prfm pldl1keep, [x26, x12]\n"
+    "fmla z31.s, p3/M, z1.s, z12.s\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "fmla z30.s, p3/M, z0.s, z12.s\n"
+    "add x21, x8, #0x1\n"
+    "fmla z29.s, p3/M, z5.s, z10.s\n"
+    "ld1w { z12.s }, p2/Z, [x28, x24, LSL #2]\n"
+    "fmla z31.s, p3/M, z2.s, z9.s\n"
+    "ldr x17, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "add x17, x17, #0x1\n"
+    "fmla z30.s, p3/M, z1.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x27]\n"
+    "fmla z31.s, p3/M, z8.s, z10.s\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z28.s, p3/M, z4.s, z10.s\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x17, x19\n"
+    "fmla z30.s, p3/M, z7.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x27, x24, LSL #2]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "csel x17, x17, XZR, LT\n"
+    "fmla z29.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "csel x8, x8, x21, LT\n"
+    "fmla z28.s, p3/M, z2.s, z12.s\n"
+    "cmp x8, x20\n"
+    "fmla z30.s, p3/M, z5.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x26, x25, LSL #2]\n"
+    "fmla z31.s, p3/M, z6.s, z9.s\n"
+    "fmla z29.s, p3/M, z3.s, z9.s\n"
+    "fmla z28.s, p3/M, z5.s, z10.s\n"
+    "fmla z30.s, p3/M, z8.s, z10.s\n"
+    "fmla z29.s, p3/M, z7.s, z11.s\n"
+    "fmax z31.s, p3/M, z31.s, z18.s\n"
+    "fmla z28.s, p3/M, z6.s, z11.s\n"
+    "fmax z30.s, p3/M, z30.s, z18.s\n"
+    "fmla z29.s, p3/M, z8.s, z12.s\n"
+    "fmin z31.s, p3/M, z31.s, z17.s\n"
+    "st1w { z31.s }, p0, [x9]\n"
+    "fmla z28.s, p3/M, z7.s, z12.s\n"
+    "fmin z30.s, p3/M, z30.s, z17.s\n"
+    "st1w { z30.s }, p0, [x9, x10, LSL #2]\n"
+    "fmax z29.s, p3/M, z29.s, z18.s\n"
+    "fmax z28.s, p3/M, z28.s, z18.s\n"
+    "fmin z29.s, p3/M, z29.s, z17.s\n"
+    "st1w { z29.s }, p0, [x22]\n"
+    "fmin z28.s, p3/M, z28.s, z17.s\n"
+    "st1w { z28.s }, p0, [x22, x10, LSL #2]\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
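
The "direct" variant walks the output-tile grid itself: the "Channel tail" block reloads tile_i/tile_j, advances them with the add/cmp/csel sequence, and `blt 1b` restarts the tile loop. The scalar equivalent of that bookkeeping, with process_tile as a hypothetical stand-in for the vector channel loop:

    #include <cstdint>

    void process_tile(const float *in, float *out);  // the vector loop over channels

    // Row-major traversal encoded by the csel logic above; the factor 2 is the
    // output tile size with stride 1 (mov x21/x20, #0x2 in the assembly).
    void walk_tiles(uint64_t n_tile_rows, uint64_t n_tile_cols,
                    const float *inptr, uint64_t ld_in_row, uint64_t ld_in_col,
                    float *outptr, uint64_t ld_out_row, uint64_t ld_out_col)
    {
      for (uint64_t tile_i = 0, tile_j = 0; tile_i < n_tile_rows; )
      {
        const float *in  = inptr  + (tile_i * ld_in_row  + tile_j * ld_in_col)  * 2;
        float       *out = outptr + (tile_i * ld_out_row + tile_j * ld_out_col) * 2;
        process_tile(in, out);
        if (++tile_j == n_tile_cols)  // csel x17, x17, XZR, LT resets the column
        {
          tile_j = 0;
          ++tile_i;                   // csel x8, x8, x21, LT advances the row
        }
      }                               // cmp x8, x20; blt 1b
    }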
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..e8a1539
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[16];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[0];
+      inptrs[1] = input_ptrs[1];
+      inptrs[2] = input_ptrs[2];
+      inptrs[3] = input_ptrs[3];
+      inptrs[4] = input_ptrs[4];
+      inptrs[5] = input_ptrs[5];
+      inptrs[6] = input_ptrs[6];
+      inptrs[7] = input_ptrs[7];
+      inptrs[8] = input_ptrs[8];
+      inptrs[9] = input_ptrs[9];
+      inptrs[10] = input_ptrs[10];
+      inptrs[11] = input_ptrs[11];
+      inptrs[12] = input_ptrs[12];
+      inptrs[13] = input_ptrs[13];
+      inptrs[14] = input_ptrs[14];
+      inptrs[15] = input_ptrs[15];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x2, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ptrue p3.b\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x19, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "cntb x4, ALL, MUL #2\n"
+    "ldp x5, x6, [x19, #0x0]\n"
+    "mov x7, #0x0\n"
+    "ldp x8, x17, [x19, #0x10]\n"
+    "cntw x16\n"
+    "ldp x15, x14, [x19, #0x20]\n"
+    "sub x13, XZR, x16\n"
+    "ldp x12, x11, [x19, #0x30]\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "ldp x10, x9, [x19, #0x40]\n"
+    "cmp x16, %x[n_channels]\n"
+    "ldp x28, x27, [x19, #0x50]\n"
+    "ldp x26, x25, [x19, #0x60]\n"
+    "ldp x24, x23, [x19, #0x70]\n"
+    "ldp x22, x21, [x2, #0x0]\n"
+    "ldp x20, x19, [x2, #0x10]\n"
+    "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "ld1w { z16.s }, p3/Z, [x3]\n"
+    "mov z31.d, z16.d\n"
+    "ld1w { z0.s }, p3/Z, [x3, #1, MUL VL]\n"
+    "mov z30.d, z16.d\n"
+    "ld1w { z1.s }, p3/Z, [x3, #2, MUL VL]\n"
+    "mov z29.d, z16.d\n"
+    "ld1w { z2.s }, p3/Z, [x3, #3, MUL VL]\n"
+    "mov z28.d, z16.d\n"
+    "ld1w { z3.s }, p3/Z, [x3, #4, MUL VL]\n"
+    "ld1w { z4.s }, p3/Z, [x3, #5, MUL VL]\n"
+    "ld1w { z5.s }, p3/Z, [x3, #6, MUL VL]\n"
+    "ld1w { z6.s }, p3/Z, [x3, #7, MUL VL]\n"
+    "addvl x3, x3, #16\n"
+    "ld1w { z9.s }, p2/Z, [x14, x7, LSL #2]\n"
+    "ld1w { z7.s }, p3/Z, [x3, #-8, MUL VL]\n"
+    "ld1w { z8.s }, p3/Z, [x3, #-7, MUL VL]\n"
+    "addvl x3, x3, #-6\n"
+    "prfm pldl1keep, [x14, x4]\n"
+    "ld1w { z10.s }, p2/Z, [x5, x7, LSL #2]\n"
+    "prfm pldl1keep, [x5, x4]\n"
+    "ld1w { z11.s }, p2/Z, [x17, x7, LSL #2]\n"
+    "prfm pldl1keep, [x17, x4]\n"
+    "ld1w { z12.s }, p2/Z, [x12, x7, LSL #2]\n"
+    "prfm pldl1keep, [x12, x4]\n"
+    "ld1w { z13.s }, p2/Z, [x9, x7, LSL #2]\n"
+    "prfm pldl1keep, [x9, x4]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "fmla z31.s, p3/M, z4.s, z9.s\n"
+    "prfm pldl1keep, [x26, x4]\n"
+    "whilelt p1.s, x16, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x23, x4]\n"
+    "incw x13\n"
+    "fmla z29.s, p3/M, z1.s, z9.s\n"
+    "prfm pldl1keep, [x6, x4]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x26, x7, LSL #2]\n"
+    "prfm pldl1keep, [x8, x4]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x28, x7, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x23, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z2.s, z12.s\n"
+    "prfm pldl1keep, [x28, x4]\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x15, x4]\n"
+    "fmla z31.s, p3/M, z5.s, z12.s\n"
+    "prfm pldl1keep, [x11, x4]\n"
+    "fmla z30.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x6, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x8, x7, LSL #2]\n"
+    "fmla z28.s, p3/M, z3.s, z13.s\n"
+    "prfm pldl1keep, [x10, x4]\n"
+    "fmla z31.s, p3/M, z7.s, z13.s\n"
+    "prfm pldl1keep, [x27, x4]\n"
+    "fmla z30.s, p3/M, z6.s, z13.s\n"
+    "prfm pldl1keep, [x25, x4]\n"
+    "fmla z29.s, p3/M, z4.s, z13.s\n"
+    "prfm pldl1keep, [x24, x4]\n"
+    "fmla z28.s, p3/M, z8.s, z11.s\n"
+    "addvl x4, x4, #1\n"
+    "fmla z31.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z11.s }, p2/Z, [x15, x7, LSL #2]\n"
+    "prfm pldl1keep, [x14, x4]\n"
+    "fmla z30.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z5.s, z10.s\n"
+    "prfm pldl1keep, [x5, x4]\n"
+    "fmla z28.s, p3/M, z4.s, z10.s\n"
+    "prfm pldl1keep, [x17, x4]\n"
+    "fmla z31.s, p3/M, z2.s, z9.s\n"
+    "prfm pldl1keep, [x12, x4]\n"
+    "fmla z30.s, p3/M, z1.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x10, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z13.s }, p1/Z, [x9, x16, LSL #2]\n"
+    "fmla z28.s, p3/M, z2.s, z12.s\n"
+    "prfm pldl1keep, [x9, x4]\n"
+    "fmla z31.s, p3/M, z8.s, z10.s\n"
+    "ld1w { z16.s }, p3/Z, [x3]\n"
+    "fmla z30.s, p3/M, z7.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x27, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z3.s, z9.s\n"
+    "ld1w { z0.s }, p3/Z, [x3, #1, MUL VL]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x25, x7, LSL #2]\n"
+    "fmla z28.s, p3/M, z5.s, z10.s\n"
+    "ld1w { z1.s }, p3/Z, [x3, #2, MUL VL]\n"
+    "fmla z30.s, p3/M, z5.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x24, x7, LSL #2]\n"
+    "incw x7\n"
+    "fmla z29.s, p3/M, z7.s, z11.s\n"
+    "ld1w { z2.s }, p3/Z, [x3, #3, MUL VL]\n"
+    "whilelt p2.s, x7, %x[n_channels]\n"
+    "fmla z31.s, p3/M, z6.s, z9.s\n"
+    "ld1w { z9.s }, p1/Z, [x14, x16, LSL #2]\n"
+    "fmla z28.s, p3/M, z6.s, z11.s\n"
+    "ld1w { z11.s }, p1/Z, [x17, x16, LSL #2]\n"
+    "fmla z30.s, p3/M, z8.s, z10.s\n"
+    "ld1w { z10.s }, p1/Z, [x5, x16, LSL #2]\n"
+    "ld1w { z3.s }, p3/Z, [x3, #4, MUL VL]\n"
+    "fmla z29.s, p3/M, z8.s, z12.s\n"
+    "ld1w { z4.s }, p3/Z, [x3, #5, MUL VL]\n"
+    "fmla z28.s, p3/M, z7.s, z12.s\n"
+    "fmax z31.s, p3/M, z31.s, z18.s\n"
+    "ld1w { z12.s }, p1/Z, [x12, x16, LSL #2]\n"
+    "incw x16\n"
+    "fmax z30.s, p3/M, z30.s, z18.s\n"
+    "ld1w { z5.s }, p3/Z, [x3, #6, MUL VL]\n"
+    "cmp x16, %x[n_channels]\n"
+    "fmax z29.s, p3/M, z29.s, z18.s\n"
+    "ld1w { z6.s }, p3/Z, [x3, #7, MUL VL]\n"
+    "addvl x3, x3, #16\n"
+    "fmin z31.s, p3/M, z31.s, z17.s\n"
+    "ld1w { z7.s }, p3/Z, [x3, #-8, MUL VL]\n"
+    "fmax z28.s, p3/M, z28.s, z18.s\n"
+    "ld1w { z8.s }, p3/Z, [x3, #-7, MUL VL]\n"
+    "addvl x3, x3, #-6\n"
+    "fmin z30.s, p3/M, z30.s, z17.s\n"
+    "st1w { z31.s }, p0, [x22, x13, LSL #2]\n"
+    "mov z31.d, z16.d\n"
+    "fmin z29.s, p3/M, z29.s, z17.s\n"
+    "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+    "mov z30.d, z16.d\n"
+    "fmin z28.s, p3/M, z28.s, z17.s\n"
+    "st1w { z29.s }, p0, [x20, x13, LSL #2]\n"
+    "mov z29.d, z16.d\n"
+    "st1w { z28.s }, p0, [x19, x13, LSL #2]\n"
+    "mov z28.d, z16.d\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "fmla z31.s, p3/M, z4.s, z9.s\n"
+    "prfm pldl1keep, [x26, x4]\n"
+    "incw x13\n"
+    "fmla z30.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x23, x4]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z29.s, p3/M, z1.s, z9.s\n"
+    "prfm pldl1keep, [x6, x4]\n"
+    "fmla z28.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x26, x7, LSL #2]\n"
+    "prfm pldl1keep, [x8, x4]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x28, x7, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x23, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z2.s, z12.s\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x28, x4]\n"
+    "prfm pldl1keep, [x15, x4]\n"
+    "fmla z31.s, p3/M, z5.s, z12.s\n"
+    "prfm pldl1keep, [x11, x4]\n"
+    "fmla z30.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x6, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "fmla z28.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z9.s }, p2/Z, [x8, x7, LSL #2]\n"
+    "prfm pldl1keep, [x10, x4]\n"
+    "fmla z31.s, p3/M, z7.s, z13.s\n"
+    "prfm pldl1keep, [x27, x4]\n"
+    "fmla z30.s, p3/M, z6.s, z13.s\n"
+    "prfm pldl1keep, [x25, x4]\n"
+    "fmla z29.s, p3/M, z4.s, z13.s\n"
+    "fmla z28.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x15, x7, LSL #2]\n"
+    "prfm pldl1keep, [x24, x4]\n"
+    "fmla z31.s, p3/M, z1.s, z12.s\n"
+    "fmla z30.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z5.s, z10.s\n"
+    "fmla z28.s, p3/M, z4.s, z10.s\n"
+    "fmla z31.s, p3/M, z2.s, z9.s\n"
+    "fmla z30.s, p3/M, z1.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x10, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z0.s, z11.s\n"
+    "fmla z28.s, p3/M, z2.s, z12.s\n"
+    "fmla z31.s, p3/M, z8.s, z10.s\n"
+    "fmla z30.s, p3/M, z7.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x27, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z3.s, z9.s\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x25, x7, LSL #2]\n"
+    "fmla z28.s, p3/M, z5.s, z10.s\n"
+    "fmla z30.s, p3/M, z5.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x24, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z7.s, z11.s\n"
+    "fmla z31.s, p3/M, z6.s, z9.s\n"
+    "fmla z28.s, p3/M, z6.s, z11.s\n"
+    "fmla z30.s, p3/M, z8.s, z10.s\n"
+    "fmla z29.s, p3/M, z8.s, z12.s\n"
+    "fmla z28.s, p3/M, z7.s, z12.s\n"
+    "fmax z31.s, p3/M, z31.s, z18.s\n"
+    "fmax z30.s, p3/M, z30.s, z18.s\n"
+    "fmax z29.s, p3/M, z29.s, z18.s\n"
+    "fmin z31.s, p3/M, z31.s, z17.s\n"
+    "st1w { z31.s }, p0, [x22, x13, LSL #2]\n"
+    "fmin z30.s, p3/M, z30.s, z17.s\n"
+    "fmin z29.s, p3/M, z29.s, z17.s\n"
+    "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+    "fmax z28.s, p3/M, z28.s, z18.s\n"
+    "st1w { z29.s }, p0, [x20, x13, LSL #2]\n"
+    "fmin z28.s, p3/M, z28.s, z17.s\n"
+    "st1w { z28.s }, p0, [x19, x13, LSL #2]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
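
The "indirect" variant instead consumes a flat array of 16 precomputed input pointers: the 4x4 receptive field of a 2x2 output tile under a 3x3 stride-1 kernel. This lets the caller resolve padding outside the kernel, for example by pointing out-of-bounds taps at a zeroed buffer. A hypothetical helper showing how such an array might be populated (an assumed usage pattern, not a quote of the library's wiring):

    #include <cstddef>

    void fill_input_pointers(const float *in, int in_rows, int in_cols,
                             std::ptrdiff_t ld_row, std::ptrdiff_t ld_col,
                             int out_i, int out_j, int pad_top, int pad_left,
                             const float *zero_buffer, const float *inptrs[16])
    {
      for (int i = 0; i < 4; i++)      // 4x4 patch = (2 - 1) * 1 + 3 rows/cols
        for (int j = 0; j < 4; j++)
        {
          const int r = out_i + i - pad_top;
          const int c = out_j + j - pad_left;
          inptrs[i * 4 + j] = (r >= 0 && r < in_rows && c >= 0 && c < in_cols)
                            ? in + r * ld_row + c * ld_col  // real input element
                            : zero_buffer;                  // padded tap reads zeros
        }
    }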
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp
new file mode 100644
index 0000000..173fc63
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided_impl(const float *const, const size_t, const size_t, float *const, const size_t, const size_t, const void *, unsigned long, const float, const float);
+
+struct sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided
+{
+  typedef float bias_type;
+  typedef float operand_type;
+  typedef float return_type;
+
+  typedef void (*kern_type)(const float *const, const size_t, const size_t, float *const, const size_t, const size_t, const void *, unsigned long, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  kern_type kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided_impl;
+
+  sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
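
The descriptor struct encodes the tile geometry consumed by the dispatch layer; for every variant the input patch size follows from input = (output - 1) * stride + kernel. A compile-time check of the constants above, assuming it is placed inside namespace arm_conv::depthwise next to the header:

    static_assert(sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided::input_rows
                      == (sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided::output_rows - 1)
                             * sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided::stride_rows
                         + sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided::kernel_rows,
                  "4 input rows for a 2x2 output of a 3x3/s1 kernel");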
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp
new file mode 100644
index 0000000..cecc192
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided_impl(
+  const float *const inptr,
+  const size_t in_row_stride,
+  const size_t in_col_stride,
+  float *const outptr,
+  const size_t out_row_stride,
+  const size_t out_col_stride,
+  const void *params,
+  unsigned long n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  const float minmax_vals[2] = { activation_min, activation_max };
+
+  __asm__ __volatile__(
+    "ptrue p2.b\n"
+    "ld1w { z15.s }, p2/Z, [%x[params]]\n"
+    "mov z14.d, z15.d\n"
+    "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "whilelt p1.s, XZR, %x[n_channels]\n"
+    "mov z12.d, z15.d\n"
+    "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "mov x26, %x[inptr]\n"
+    "mov z10.d, z15.d\n"
+    "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "add x25, x26, %x[in_row_stride], LSL #2\n"
+    "mov z8.d, z15.d\n"
+    "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "add x24, x25, %x[in_row_stride], LSL #2\n"
+    "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "add x23, x24, %x[in_row_stride], LSL #2\n"
+    "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "mov x22, %x[outptr]\n"
+    "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "add x21, x22, %x[out_row_stride], LSL #2\n"
+    "ld1w { z3.s }, p1/Z, [x26]\n"
+    "add x20, %x[in_col_stride], %x[in_col_stride]\n"
+    "ld1w { z2.s }, p1/Z, [x26, %x[in_col_stride], LSL #2]\n"
+    "add x19, x20, %x[in_col_stride]\n"
+    "ld1w { z1.s }, p1/Z, [x25]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "ld1w { z0.s }, p1/Z, [x25, %x[in_col_stride], LSL #2]\n"
+    "decw %x[n_channels]\n"
+    "ld1w { z31.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+    "cmp %x[n_channels], XZR\n"
+    "ld1w { z30.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+    "addvl %x[params], %x[params], #-6\n"
+    "ld1w { z29.s }, p1/Z, [x26, x20, LSL #2]\n"
+    "ld1w { z28.s }, p1/Z, [x25, x20, LSL #2]\n"
+    "ld1w { z27.s }, p1/Z, [x26, x19, LSL #2]\n"
+    "ld1w { z26.s }, p1/Z, [x25, x19, LSL #2]\n"
+    "ld1w { z25.s }, p1/Z, [x24]\n"
+    "ld1w { z24.s }, p1/Z, [x24, %x[in_col_stride], LSL #2]\n"
+    "ld1w { z23.s }, p1/Z, [x24, x20, LSL #2]\n"
+    "ld1w { z22.s }, p1/Z, [x24, x19, LSL #2]\n"
+    "ld1w { z21.s }, p1/Z, [x23]\n"
+    "ld1w { z20.s }, p1/Z, [x23, %x[in_col_stride], LSL #2]\n"
+    "ld1w { z19.s }, p1/Z, [x23, x20, LSL #2]\n"
+    "ld1w { z18.s }, p1/Z, [x23, x19, LSL #2]\n"
+    "ld1rw { z17.s }, p2/Z, [%x[minmax_vals]]\n"
+    "ld1rw { z16.s }, p2/Z, [%x[minmax_vals], #4]\n"
+    "ble 2f\n"
+    "1:"  // Loop
+    "fmla z14.s, p2/M, z13.s, z3.s\n"
+    "ld1w { z15.s }, p2/Z, [%x[params]]\n"
+    "addvl x26, x26, #1\n"
+    "fmla z12.s, p2/M, z13.s, z2.s\n"
+    "addvl x25, x25, #1\n"
+    "fmla z10.s, p2/M, z13.s, z1.s\n"
+    "addvl x24, x24, #1\n"
+    "fmla z8.s, p2/M, z13.s, z0.s\n"
+    "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "addvl x23, x23, #1\n"
+    "fmla z14.s, p2/M, z11.s, z2.s\n"
+    "decw %x[n_channels]\n"
+    "mov p0.b, p1.b\n"
+    "fmla z12.s, p2/M, z11.s, z29.s\n"
+    "fmla z10.s, p2/M, z11.s, z0.s\n"
+    "whilelt p1.s, XZR, %x[n_channels]\n"
+    "ld1w { z3.s }, p1/Z, [x26]\n"
+    "fmla z8.s, p2/M, z11.s, z28.s\n"
+    "cmp %x[n_channels], XZR\n"
+    "fmla z14.s, p2/M, z9.s, z29.s\n"
+    "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "ld1w { z2.s }, p1/Z, [x26, %x[in_col_stride], LSL #2]\n"
+    "fmla z12.s, p2/M, z9.s, z27.s\n"
+    "fmla z10.s, p2/M, z9.s, z28.s\n"
+    "ld1w { z29.s }, p1/Z, [x26, x20, LSL #2]\n"
+    "ld1w { z27.s }, p1/Z, [x26, x19, LSL #2]\n"
+    "fmla z8.s, p2/M, z9.s, z26.s\n"
+    "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "fmla z14.s, p2/M, z7.s, z1.s\n"
+    "ld1w { z1.s }, p1/Z, [x25]\n"
+    "fmla z12.s, p2/M, z7.s, z0.s\n"
+    "fmla z10.s, p2/M, z7.s, z25.s\n"
+    "fmla z8.s, p2/M, z7.s, z24.s\n"
+    "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "fmla z14.s, p2/M, z6.s, z0.s\n"
+    "ld1w { z0.s }, p1/Z, [x25, %x[in_col_stride], LSL #2]\n"
+    "fmla z12.s, p2/M, z6.s, z28.s\n"
+    "fmla z10.s, p2/M, z6.s, z24.s\n"
+    "fmla z8.s, p2/M, z6.s, z23.s\n"
+    "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "fmla z14.s, p2/M, z5.s, z28.s\n"
+    "ld1w { z28.s }, p1/Z, [x25, x20, LSL #2]\n"
+    "fmla z12.s, p2/M, z5.s, z26.s\n"
+    "ld1w { z26.s }, p1/Z, [x25, x19, LSL #2]\n"
+    "fmla z10.s, p2/M, z5.s, z23.s\n"
+    "fmla z8.s, p2/M, z5.s, z22.s\n"
+    "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "fmla z14.s, p2/M, z4.s, z25.s\n"
+    "ld1w { z25.s }, p1/Z, [x24]\n"
+    "fmla z12.s, p2/M, z4.s, z24.s\n"
+    "fmla z10.s, p2/M, z4.s, z21.s\n"
+    "ld1w { z21.s }, p1/Z, [x23]\n"
+    "fmla z8.s, p2/M, z4.s, z20.s\n"
+    "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "fmla z14.s, p2/M, z31.s, z24.s\n"
+    "ld1w { z24.s }, p1/Z, [x24, %x[in_col_stride], LSL #2]\n"
+    "fmla z12.s, p2/M, z31.s, z23.s\n"
+    "fmla z10.s, p2/M, z31.s, z20.s\n"
+    "ld1w { z20.s }, p1/Z, [x23, %x[in_col_stride], LSL #2]\n"
+    "fmla z8.s, p2/M, z31.s, z19.s\n"
+    "ld1w { z31.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+    "fmla z14.s, p2/M, z30.s, z23.s\n"
+    "ld1w { z23.s }, p1/Z, [x24, x20, LSL #2]\n"
+    "fmla z12.s, p2/M, z30.s, z22.s\n"
+    "ld1w { z22.s }, p1/Z, [x24, x19, LSL #2]\n"
+    "fmla z10.s, p2/M, z30.s, z19.s\n"
+    "ld1w { z19.s }, p1/Z, [x23, x20, LSL #2]\n"
+    "fmla z8.s, p2/M, z30.s, z18.s\n"
+    "ld1w { z30.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+    "addvl %x[params], %x[params], #-6\n"
+    "fmax z14.s, p2/M, z14.s, z17.s\n"
+    "ld1w { z18.s }, p1/Z, [x23, x19, LSL #2]\n"
+    "fmax z12.s, p2/M, z12.s, z17.s\n"
+    "fmax z10.s, p2/M, z10.s, z17.s\n"
+    "fmax z8.s, p2/M, z8.s, z17.s\n"
+    "fmin z14.s, p2/M, z14.s, z16.s\n"
+    "st1w { z14.s }, p0, [x22]\n"
+    "mov z14.d, z15.d\n"
+    "fmin z12.s, p2/M, z12.s, z16.s\n"
+    "st1w { z12.s }, p0, [x22, %x[out_col_stride], LSL #2]\n"
+    "mov z12.d, z15.d\n"
+    "addvl x22, x22, #1\n"
+    "fmin z10.s, p2/M, z10.s, z16.s\n"
+    "st1w { z10.s }, p0, [x21]\n"
+    "mov z10.d, z15.d\n"
+    "fmin z8.s, p2/M, z8.s, z16.s\n"
+    "st1w { z8.s }, p0, [x21, %x[out_col_stride], LSL #2]\n"
+    "mov z8.d, z15.d\n"
+    "addvl x21, x21, #1\n"
+    "bgt 1b\n"
+    "2:"  // Tail
+    "fmla z14.s, p2/M, z13.s, z3.s\n"
+    "mov p0.b, p1.b\n"
+    "fmla z12.s, p2/M, z13.s, z2.s\n"
+    "fmla z10.s, p2/M, z13.s, z1.s\n"
+    "fmla z8.s, p2/M, z13.s, z0.s\n"
+    "fmla z14.s, p2/M, z11.s, z2.s\n"
+    "fmla z12.s, p2/M, z11.s, z29.s\n"
+    "fmla z10.s, p2/M, z11.s, z0.s\n"
+    "fmla z8.s, p2/M, z11.s, z28.s\n"
+    "fmla z14.s, p2/M, z9.s, z29.s\n"
+    "fmla z12.s, p2/M, z9.s, z27.s\n"
+    "fmla z10.s, p2/M, z9.s, z28.s\n"
+    "fmla z8.s, p2/M, z9.s, z26.s\n"
+    "fmla z14.s, p2/M, z7.s, z1.s\n"
+    "fmla z12.s, p2/M, z7.s, z0.s\n"
+    "fmla z10.s, p2/M, z7.s, z25.s\n"
+    "fmla z8.s, p2/M, z7.s, z24.s\n"
+    "fmla z14.s, p2/M, z6.s, z0.s\n"
+    "fmla z12.s, p2/M, z6.s, z28.s\n"
+    "fmla z10.s, p2/M, z6.s, z24.s\n"
+    "fmla z8.s, p2/M, z6.s, z23.s\n"
+    "fmla z14.s, p2/M, z5.s, z28.s\n"
+    "fmla z12.s, p2/M, z5.s, z26.s\n"
+    "fmla z10.s, p2/M, z5.s, z23.s\n"
+    "fmla z8.s, p2/M, z5.s, z22.s\n"
+    "fmla z14.s, p2/M, z4.s, z25.s\n"
+    "fmla z12.s, p2/M, z4.s, z24.s\n"
+    "fmla z10.s, p2/M, z4.s, z21.s\n"
+    "fmla z8.s, p2/M, z4.s, z20.s\n"
+    "fmla z14.s, p2/M, z31.s, z24.s\n"
+    "fmla z12.s, p2/M, z31.s, z23.s\n"
+    "fmla z10.s, p2/M, z31.s, z20.s\n"
+    "fmla z8.s, p2/M, z31.s, z19.s\n"
+    "fmla z14.s, p2/M, z30.s, z23.s\n"
+    "fmla z12.s, p2/M, z30.s, z22.s\n"
+    "fmla z10.s, p2/M, z30.s, z19.s\n"
+    "fmla z8.s, p2/M, z30.s, z18.s\n"
+    "fmax z14.s, p2/M, z14.s, z17.s\n"
+    "fmax z12.s, p2/M, z12.s, z17.s\n"
+    "fmax z10.s, p2/M, z10.s, z17.s\n"
+    "fmax z8.s, p2/M, z8.s, z17.s\n"
+    "fmin z14.s, p2/M, z14.s, z16.s\n"
+    "st1w { z14.s }, p0, [x22]\n"
+    "fmin z12.s, p2/M, z12.s, z16.s\n"
+    "fmin z10.s, p2/M, z10.s, z16.s\n"
+    "st1w { z12.s }, p0, [x22, %x[out_col_stride], LSL #2]\n"
+    "fmin z8.s, p2/M, z8.s, z16.s\n"
+    "st1w { z10.s }, p0, [x21]\n"
+    "st1w { z8.s }, p0, [x21, %x[out_col_stride], LSL #2]\n"
+    : [n_channels] "+r" (n_channels), [params] "+r" (params)
+    : [in_col_stride] "r" (in_col_stride), [in_row_stride] "r" (in_row_stride), [inptr] "r" (inptr), [minmax_vals] "r" (minmax_vals), [out_col_stride] "r" (out_col_stride), [out_row_stride] "r" (out_row_stride), [outptr] "r" (outptr)
+    : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
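
A naive scalar reference for the strided entry point is useful as a correctness oracle when testing the assembly. The sketch below mirrors the arithmetic only: params is assumed planar (bias followed by nine n-channel weight planes) for readability, whereas the assembly actually packs one bias vector and nine weight vectors per vector-length block, advancing params by ten vectors each iteration.

    #include <algorithm>
    #include <cstddef>

    void reference_strided(const float *in, std::size_t in_row, std::size_t in_col,
                           float *out, std::size_t out_row, std::size_t out_col,
                           const float *params, unsigned long n,
                           float act_min, float act_max)
    {
      const float *bias = params, *w = params + n;  // assumed planar layout
      for (unsigned long c = 0; c < n; c++)
        for (int oi = 0; oi < 2; oi++)              // 2x2 output tile
          for (int oj = 0; oj < 2; oj++)
          {
            float acc = bias[c];
            for (int ki = 0; ki < 3; ki++)          // 3x3 kernel, stride 1
              for (int kj = 0; kj < 3; kj++)
                acc += w[(ki * 3 + kj) * n + c]
                     * in[(oi + ki) * in_row + (oj + kj) * in_col + c];
            out[oi * out_row + oj * out_col + c] =
                std::min(std::max(acc, act_min), act_max);  // activation clamp
          }
    }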
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000..5ec78aa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 3;
+  constexpr static unsigned int output_cols = 3;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
+  indirect_kern_type indirect_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+  sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
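
This descriptor exposes both entry points, so a caller can take the faster direct path for regular tile grids and fall back to the indirect path where padding requires per-tap pointers. An assumed usage sketch, not the library's actual control flow:

    #include <cstdint>

    void run(const sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst &k, bool interior,
             unsigned int nr, unsigned int nc,
             const float *in, int64_t in_r, int64_t in_c,
             float *out, int64_t out_r, int64_t out_c,
             const float *const *inptrs, float *const *outptrs,
             const void *params, unsigned int n, float lo, float hi)
    {
      if (interior)  // uniform strides fully describe addressing
        k.direct_kernel(nr, nc, in, in_r, in_c, out, out_r, out_c, params, n, lo, hi);
      else           // padded border tiles use per-tap pointers
        k.indirect_kernel(inptrs, outptrs, params, n, lo, hi);
    }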
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..4d0bd31
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x3, #0x0\n"
+    "mov x4, #0x0\n"
+    "1:"  // Tile loop
+    "str x3, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x22, #0x3\n"
+    "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "cntb x5\n"
+    "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x5, x5, XZR, LSL #4\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "cntb x7\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "cntb x17\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x3, x20\n" // offset = tile_i * ld_input_row
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x4, x8, x19\n" // offset += tile_j * ld_input_col
+    "ldr x15, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x22\n" // offset *= kernel_stride * output_size
+    "ldr x14, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x16, x16, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "add x13, x16, x20, LSL #2\n"
+    "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x12, x13, x20, LSL #2\n"
+    "ld1w { z16.s }, p3/Z, [x6]\n"
+    "mov z31.d, z16.d\n"
+    "ld1w { z0.s }, p3/Z, [x6, #1, MUL VL]\n"
+    "add x11, x12, x20, LSL #2\n"
+    "mov z30.d, z16.d\n"
+    "ld1w { z1.s }, p3/Z, [x6, #2, MUL VL]\n"
+    "add x10, x11, x20, LSL #2\n"
+    "mov z29.d, z16.d\n"
+    "ld1w { z2.s }, p3/Z, [x6, #3, MUL VL]\n"
+    "add x9, x8, x8\n"
+    "mov z28.d, z16.d\n"
+    "ld1w { z3.s }, p3/Z, [x6, #4, MUL VL]\n"
+    "add x28, x9, x8\n"
+    "mov z27.d, z16.d\n"
+    "ld1w { z4.s }, p3/Z, [x6, #5, MUL VL]\n"
+    "add x27, x28, x8\n"
+    "mov z26.d, z16.d\n"
+    "ld1w { z5.s }, p3/Z, [x6, #6, MUL VL]\n"
+    "add x7, x7, x8, LSL #4\n"
+    "mov z25.d, z16.d\n"
+    "ld1w { z6.s }, p3/Z, [x6, #7, MUL VL]\n"
+    "add x17, x17, x9, LSL #4\n"
+    "mov z24.d, z16.d\n"
+    "prfm pldl1keep, [x12, x17]\n"
+    "cntb x26\n"
+    "mov z23.d, z16.d\n"
+    "prfm pldl1keep, [x16, x5]\n"
+    "add x26, x26, x28, LSL #4\n"
+    "cntb x25\n"
+    "mov x20, #0x3\n"
+    "add x25, x25, x27, LSL #4\n"
+    "prfm pldl1keep, [x16, x25]\n"
+    "prfm pldl1keep, [x10, x5]\n"
+    "mul x19, x3, x21\n" // offset = tile_i * ld_output_row
+    "prfm pldl1keep, [x13, x17]\n"
+    "madd x19, x4, x15, x19\n" // offset += tile_j * ld_output_col
+    "add x24, x15, x15\n"
+    "mul x19, x19, x20\n" // offset *= output_tile_size
+    "add x14, x14, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "add x23, x14, x21, LSL #2\n"
+    "add x22, x23, x21, LSL #2\n"
+    "mov x21, #0x0\n"
+    "cntw x20\n"
+    "sub x19, XZR, x20\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "ld1w { z9.s }, p2/Z, [x12, x9, LSL #2]\n"
+    "ld1w { z10.s }, p2/Z, [x16]\n"
+    "addvl x6, x6, #16\n"
+    "ld1w { z11.s }, p2/Z, [x16, x27, LSL #2]\n"
+    "cmp x20, %x[n_channels]\n"
+    "ld1w { z7.s }, p3/Z, [x6, #-8, MUL VL]\n"
+    "ld1w { z8.s }, p3/Z, [x6, #-7, MUL VL]\n"
+    "addvl x6, x6, #-6\n"
+    "ld1w { z12.s }, p2/Z, [x10]\n"
+    "ld1w { z13.s }, p2/Z, [x13, x9, LSL #2]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "prfm pldl1keep, [x10, x25]\n"
+    "whilelt p1.s, x20, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z7.s, z9.s\n"
+    "prfm pldl1keep, [x12, x7]\n"
+    "incw x19\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "prfm pldl1keep, [x16, x7]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.s, p3/M, z5.s, z9.s\n"
+    "prfm pldl1keep, [x16, x26]\n"
+    "incw x21\n"
+    "fmla z27.s, p3/M, z4.s, z9.s\n"
+    "prfm pldl1keep, [x12, x26]\n"
+    "incw x20\n"
+    "fmla z26.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x13, x5]\n"
+    "fmla z25.s, p3/M, z2.s, z9.s\n"
+    "prfm pldl1keep, [x13, x25]\n"
+    "fmla z24.s, p3/M, z1.s, z9.s\n"
+    "prfm pldl1keep, [x11, x5]\n"
+    "fmla z23.s, p3/M, z0.s, z9.s\n"
+    "prfm pldl1keep, [x11, x17]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n"
+    "fmla z29.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12, x8, LSL #2]\n"
+    "fmla z25.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x10, x27, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z13.s\n"
+    "prfm pldl1keep, [x11, x25]\n"
+    "fmla z31.s, p3/M, z5.s, z13.s\n"
+    "prfm pldl1keep, [x10, x7]\n"
+    "fmla z29.s, p3/M, z3.s, z13.s\n"
+    "prfm pldl1keep, [x13, x7]\n"
+    "fmla z28.s, p3/M, z2.s, z13.s\n"
+    "prfm pldl1keep, [x13, x26]\n"
+    "fmla z27.s, p3/M, z1.s, z13.s\n"
+    "prfm pldl1keep, [x10, x26]\n"
+    "fmla z26.s, p3/M, z0.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x16, x8, LSL #2]\n"
+    "fmla z23.s, p3/M, z8.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x16, x28, LSL #2]\n"
+    "fmla z31.s, p3/M, z7.s, z11.s\n"
+    "prfm pldl1keep, [x11, x7]\n"
+    "fmla z30.s, p3/M, z6.s, z11.s\n"
+    "prfm pldl1keep, [x16, x17]\n"
+    "fmla z28.s, p3/M, z4.s, z11.s\n"
+    "prfm pldl1keep, [x11, x26]\n"
+    "fmla z27.s, p3/M, z3.s, z11.s\n"
+    "prfm pldl1keep, [x12, x5]\n"
+    "fmla z25.s, p3/M, z1.s, z11.s\n"
+    "prfm pldl1keep, [x12, x25]\n"
+    "fmla z24.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x13]\n"
+    "fmla z31.s, p3/M, z1.s, z13.s\n"
+    "prfm pldl1keep, [x10, x17]\n"
+    "fmla z30.s, p3/M, z0.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x13, x27, LSL #2]\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z16.s }, p3/Z, [x6]\n"
+    "fmla z27.s, p3/M, z5.s, z10.s\n"
+    "fmla z26.s, p3/M, z4.s, z10.s\n"
+    "fmla z30.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11]\n"
+    "fmla z29.s, p3/M, z7.s, z10.s\n"
+    "fmla z24.s, p3/M, z2.s, z10.s\n"
+    "fmla z23.s, p3/M, z1.s, z10.s\n"
+    "fmla z30.s, p3/M, z8.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x11, x9, LSL #2]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "fmla z28.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
+    "fmla z29.s, p3/M, z5.s, z13.s\n"
+    "fmla z26.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x10, x8, LSL #2]\n"
+    "fmla z25.s, p3/M, z3.s, z12.s\n"
+    "fmla z28.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x13, x8, LSL #2]\n"
+    "fmla z27.s, p3/M, z7.s, z10.s\n"
+    "fmla z26.s, p3/M, z6.s, z10.s\n"
+    "fmla z25.s, p3/M, z5.s, z10.s\n"
+    "fmla z28.s, p3/M, z8.s, z10.s\n"
+    "fmla z24.s, p3/M, z4.s, z10.s\n"
+    "fmla z23.s, p3/M, z3.s, z10.s\n"
+    "fmla z26.s, p3/M, z8.s, z11.s\n"
+    "fmla z25.s, p3/M, z7.s, z13.s\n"
+    "fmla z24.s, p3/M, z6.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x10, x28, LSL #2]\n"
+    "fmla z23.s, p3/M, z5.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n"
+    "addvl x13, x13, #1\n"
+    "fmla z31.s, p3/M, z4.s, z12.s\n"
+    "fmla z30.s, p3/M, z3.s, z12.s\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "fmla z27.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11, x8, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z11.s\n"
+    "fmla z30.s, p3/M, z5.s, z11.s\n"
+    "fmla z26.s, p3/M, z1.s, z11.s\n"
+    "fmla z27.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x16, x9, LSL #2]\n"
+    "addvl x16, x16, #1\n"
+    "fmla z24.s, p3/M, z8.s, z13.s\n"
+    "ld1w { z10.s }, p1/Z, [x16]\n"
+    "fmla z23.s, p3/M, z7.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x11, x28, LSL #2]\n"
+    "addvl x11, x11, #1\n"
+    "fmla z28.s, p3/M, z7.s, z12.s\n"
+    "fmla z27.s, p3/M, z6.s, z12.s\n"
+    "fmla z25.s, p3/M, z4.s, z12.s\n"
+    "fmla z24.s, p3/M, z3.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x12]\n"
+    "fmla z31.s, p3/M, z2.s, z11.s\n"
+    "fmla z30.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z1.s }, p3/Z, [x6, #2, MUL VL]\n"
+    "fmla z29.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+    "addvl x12, x12, #1\n"
+    "fmla z27.s, p3/M, z8.s, z13.s\n"
+    "ld1w { z9.s }, p1/Z, [x12, x9, LSL #2]\n"
+    "fmla z26.s, p3/M, z7.s, z13.s\n"
+    "prfm pldl1keep, [x12, x17]\n"
+    "fmla z24.s, p3/M, z5.s, z13.s\n"
+    "prfm pldl1keep, [x16, x5]\n"
+    "fmla z23.s, p3/M, z4.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x10, x9, LSL #2]\n"
+    "whilelt p2.s, x21, %x[n_channels]\n"
+    "fmla z31.s, p3/M, z6.s, z12.s\n"
+    "prfm pldl1keep, [x16, x25]\n"
+    "addvl x10, x10, #1\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "prfm pldl1keep, [x10, x5]\n"
+    "cmp x20, %x[n_channels]\n"
+    "fmla z25.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p1/Z, [x10]\n"
+    "fmla z29.s, p3/M, z8.s, z11.s\n"
+    "prfm pldl1keep, [x13, x17]\n"
+    "fmla z26.s, p3/M, z5.s, z11.s\n"
+    "ld1w { z0.s }, p3/Z, [x6, #1, MUL VL]\n"
+    "fmla z23.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p1/Z, [x16, x27, LSL #2]\n"
+    "fmla z24.s, p3/M, z7.s, z13.s\n"
+    "ld1w { z2.s }, p3/Z, [x6, #3, MUL VL]\n"
+    "fmla z25.s, p3/M, z8.s, z13.s\n"
+    "ld1w { z3.s }, p3/Z, [x6, #4, MUL VL]\n"
+    "fmax z31.s, p3/M, z31.s, z18.s\n"
+    "ld1w { z4.s }, p3/Z, [x6, #5, MUL VL]\n"
+    "fmla z23.s, p3/M, z6.s, z13.s\n"
+    "ld1w { z13.s }, p1/Z, [x13, x9, LSL #2]\n"
+    "fmax z30.s, p3/M, z30.s, z18.s\n"
+    "ld1w { z5.s }, p3/Z, [x6, #6, MUL VL]\n"
+    "fmax z29.s, p3/M, z29.s, z18.s\n"
+    "ld1w { z6.s }, p3/Z, [x6, #7, MUL VL]\n"
+    "addvl x6, x6, #16\n"
+    "fmin z31.s, p3/M, z31.s, z17.s\n"
+    "ld1w { z7.s }, p3/Z, [x6, #-8, MUL VL]\n"
+    "fmax z28.s, p3/M, z28.s, z18.s\n"
+    "ld1w { z8.s }, p3/Z, [x6, #-7, MUL VL]\n"
+    "addvl x6, x6, #-6\n"
+    "fmin z30.s, p3/M, z30.s, z17.s\n"
+    "st1w { z31.s }, p0, [x14]\n"
+    "mov z31.d, z16.d\n"
+    "fmin z29.s, p3/M, z29.s, z17.s\n"
+    "st1w { z30.s }, p0, [x14, x15, LSL #2]\n"
+    "mov z30.d, z16.d\n"
+    "fmin z28.s, p3/M, z28.s, z17.s\n"
+    "st1w { z29.s }, p0, [x14, x24, LSL #2]\n"
+    "mov z29.d, z16.d\n"
+    "addvl x14, x14, #1\n"
+    "fmax z27.s, p3/M, z27.s, z18.s\n"
+    "st1w { z28.s }, p0, [x23]\n"
+    "mov z28.d, z16.d\n"
+    "fmax z26.s, p3/M, z26.s, z18.s\n"
+    "fmax z25.s, p3/M, z25.s, z18.s\n"
+    "fmax z24.s, p3/M, z24.s, z18.s\n"
+    "fmin z27.s, p3/M, z27.s, z17.s\n"
+    "st1w { z27.s }, p0, [x23, x15, LSL #2]\n"
+    "mov z27.d, z16.d\n"
+    "fmin z26.s, p3/M, z26.s, z17.s\n"
+    "st1w { z26.s }, p0, [x23, x24, LSL #2]\n"
+    "mov z26.d, z16.d\n"
+    "addvl x23, x23, #1\n"
+    "fmin z25.s, p3/M, z25.s, z17.s\n"
+    "st1w { z25.s }, p0, [x22]\n"
+    "mov z25.d, z16.d\n"
+    "fmin z24.s, p3/M, z24.s, z17.s\n"
+    "st1w { z24.s }, p0, [x22, x15, LSL #2]\n"
+    "mov z24.d, z16.d\n"
+    "fmax z23.s, p3/M, z23.s, z18.s\n"
+    "fmin z23.s, p3/M, z23.s, z17.s\n"
+    "st1w { z23.s }, p0, [x22, x24, LSL #2]\n"
+    "mov z23.d, z16.d\n"
+    "addvl x22, x22, #1\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "prfm pldl1keep, [x10, x25]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z30.s, p3/M, z7.s, z9.s\n"
+    "prfm pldl1keep, [x12, x7]\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "prfm pldl1keep, [x16, x7]\n"
+    "fmla z28.s, p3/M, z5.s, z9.s\n"
+    "prfm pldl1keep, [x16, x26]\n"
+    "fmla z27.s, p3/M, z4.s, z9.s\n"
+    "prfm pldl1keep, [x12, x26]\n"
+    "fmla z26.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x13, x5]\n"
+    "fmla z25.s, p3/M, z2.s, z9.s\n"
+    "prfm pldl1keep, [x13, x25]\n"
+    "fmla z24.s, p3/M, z1.s, z9.s\n"
+    "prfm pldl1keep, [x11, x5]\n"
+    "fmla z23.s, p3/M, z0.s, z9.s\n"
+    "prfm pldl1keep, [x11, x17]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n"
+    "fmla z29.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12, x8, LSL #2]\n"
+    "fmla z25.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x10, x27, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z13.s\n"
+    "prfm pldl1keep, [x11, x25]\n"
+    "fmla z31.s, p3/M, z5.s, z13.s\n"
+    "prfm pldl1keep, [x10, x7]\n"
+    "fmla z29.s, p3/M, z3.s, z13.s\n"
+    "prfm pldl1keep, [x13, x7]\n"
+    "fmla z28.s, p3/M, z2.s, z13.s\n"
+    "prfm pldl1keep, [x13, x26]\n"
+    "fmla z27.s, p3/M, z1.s, z13.s\n"
+    "prfm pldl1keep, [x10, x26]\n"
+    "fmla z26.s, p3/M, z0.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x16, x8, LSL #2]\n"
+    "fmla z23.s, p3/M, z8.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x16, x28, LSL #2]\n"
+    "fmla z31.s, p3/M, z7.s, z11.s\n"
+    "prfm pldl1keep, [x11, x7]\n"
+    "fmla z30.s, p3/M, z6.s, z11.s\n"
+    "prfm pldl1keep, [x16, x17]\n"
+    "fmla z28.s, p3/M, z4.s, z11.s\n"
+    "prfm pldl1keep, [x11, x26]\n"
+    "fmla z27.s, p3/M, z3.s, z11.s\n"
+    "prfm pldl1keep, [x12, x5]\n"
+    "fmla z25.s, p3/M, z1.s, z11.s\n"
+    "prfm pldl1keep, [x12, x25]\n"
+    "fmla z24.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x13]\n"
+    "fmla z31.s, p3/M, z1.s, z13.s\n"
+    "prfm pldl1keep, [x10, x17]\n"
+    "fmla z30.s, p3/M, z0.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x13, x27, LSL #2]\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x3, #0x1\n"
+    "fmla z30.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11]\n"
+    "fmla z29.s, p3/M, z7.s, z10.s\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "add x4, x4, #0x1\n"
+    "fmla z30.s, p3/M, z8.s, z10.s\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z27.s, p3/M, z5.s, z10.s\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x4, x19\n"
+    "fmla z26.s, p3/M, z4.s, z10.s\n"
+    "fmla z24.s, p3/M, z2.s, z10.s\n"
+    "csel x4, x4, XZR, LT\n"
+    "fmla z23.s, p3/M, z1.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x11, x9, LSL #2]\n"
+    "csel x3, x3, x21, LT\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "cmp x3, x20\n"
+    "fmla z28.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
+    "fmla z29.s, p3/M, z5.s, z13.s\n"
+    "fmla z26.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x10, x8, LSL #2]\n"
+    "fmla z25.s, p3/M, z3.s, z12.s\n"
+    "fmla z28.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x13, x8, LSL #2]\n"
+    "fmla z27.s, p3/M, z7.s, z10.s\n"
+    "fmla z26.s, p3/M, z6.s, z10.s\n"
+    "fmla z25.s, p3/M, z5.s, z10.s\n"
+    "fmla z28.s, p3/M, z8.s, z10.s\n"
+    "fmla z24.s, p3/M, z4.s, z10.s\n"
+    "fmla z23.s, p3/M, z3.s, z10.s\n"
+    "fmla z26.s, p3/M, z8.s, z11.s\n"
+    "fmla z25.s, p3/M, z7.s, z13.s\n"
+    "fmla z24.s, p3/M, z6.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x10, x28, LSL #2]\n"
+    "fmla z23.s, p3/M, z5.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n"
+    "fmla z31.s, p3/M, z4.s, z12.s\n"
+    "fmla z30.s, p3/M, z3.s, z12.s\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "fmla z27.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11, x8, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z11.s\n"
+    "fmla z30.s, p3/M, z5.s, z11.s\n"
+    "fmla z26.s, p3/M, z1.s, z11.s\n"
+    "fmla z27.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x16, x9, LSL #2]\n"
+    "fmla z24.s, p3/M, z8.s, z13.s\n"
+    "fmla z23.s, p3/M, z7.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x11, x28, LSL #2]\n"
+    "fmla z28.s, p3/M, z7.s, z12.s\n"
+    "fmla z27.s, p3/M, z6.s, z12.s\n"
+    "fmla z25.s, p3/M, z4.s, z12.s\n"
+    "fmla z24.s, p3/M, z3.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x12]\n"
+    "fmla z31.s, p3/M, z2.s, z11.s\n"
+    "fmla z30.s, p3/M, z1.s, z11.s\n"
+    "fmla z29.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+    "fmla z27.s, p3/M, z8.s, z13.s\n"
+    "fmla z26.s, p3/M, z7.s, z13.s\n"
+    "fmla z24.s, p3/M, z5.s, z13.s\n"
+    "fmla z23.s, p3/M, z4.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x10, x9, LSL #2]\n"
+    "fmla z31.s, p3/M, z6.s, z12.s\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "fmla z25.s, p3/M, z0.s, z12.s\n"
+    "fmla z29.s, p3/M, z8.s, z11.s\n"
+    "fmla z26.s, p3/M, z5.s, z11.s\n"
+    "fmla z23.s, p3/M, z2.s, z11.s\n"
+    "fmla z25.s, p3/M, z8.s, z13.s\n"
+    "fmla z24.s, p3/M, z7.s, z13.s\n"
+    "fmax z31.s, p3/M, z31.s, z18.s\n"
+    "fmla z23.s, p3/M, z6.s, z13.s\n"
+    "fmax z30.s, p3/M, z30.s, z18.s\n"
+    "fmax z29.s, p3/M, z29.s, z18.s\n"
+    "fmin z31.s, p3/M, z31.s, z17.s\n"
+    "st1w { z31.s }, p0, [x14]\n"
+    "fmin z30.s, p3/M, z30.s, z17.s\n"
+    "fmin z29.s, p3/M, z29.s, z17.s\n"
+    "st1w { z30.s }, p0, [x14, x15, LSL #2]\n"
+    "fmax z28.s, p3/M, z28.s, z18.s\n"
+    "fmax z27.s, p3/M, z27.s, z18.s\n"
+    "st1w { z29.s }, p0, [x14, x24, LSL #2]\n"
+    "fmax z26.s, p3/M, z26.s, z18.s\n"
+    "fmax z25.s, p3/M, z25.s, z18.s\n"
+    "fmax z24.s, p3/M, z24.s, z18.s\n"
+    "fmin z28.s, p3/M, z28.s, z17.s\n"
+    "st1w { z28.s }, p0, [x23]\n"
+    "fmin z27.s, p3/M, z27.s, z17.s\n"
+    "fmin z26.s, p3/M, z26.s, z17.s\n"
+    "st1w { z27.s }, p0, [x23, x15, LSL #2]\n"
+    "fmin z25.s, p3/M, z25.s, z17.s\n"
+    "fmin z24.s, p3/M, z24.s, z17.s\n"
+    "st1w { z26.s }, p0, [x23, x24, LSL #2]\n"
+    "fmax z23.s, p3/M, z23.s, z18.s\n"
+    "st1w { z25.s }, p0, [x22]\n"
+    "fmin z23.s, p3/M, z23.s, z17.s\n"
+    "st1w { z24.s }, p0, [x22, x15, LSL #2]\n"
+    "st1w { z23.s }, p0, [x22, x24, LSL #2]\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
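
For reference, a minimal scalar sketch of what the direct tile kernel above computes: a 3x3, stride-1 depthwise convolution over an NHWC slab, producing one 3x3 output tile with the activation clamp fused in. The function name and the flat weights[c * 9 + k] layout are illustrative assumptions, not the packed parameter format the assembly consumes (one bias vector followed by nine weight vectors per vector-length block of channels).

#include <algorithm>

// Scalar model of one output tile of the kernel above (assumed flat layouts).
static void tile_ref_3x3_s1_out3x3(
  const float *inptr, long ld_in_row, long ld_in_col,
  float *outptr, long ld_out_row, long ld_out_col,
  const float *bias, const float *weights,   // 9 weights per channel (assumed)
  unsigned int n_channels, float act_min, float act_max)
{
  for (unsigned int c = 0; c < n_channels; c++)
  {
    for (int oi = 0; oi < 3; oi++)
    {
      for (int oj = 0; oj < 3; oj++)
      {
        float acc = bias[c];  // accumulators start from the bias, as z16 does above
        for (int ki = 0; ki < 3; ki++)
        {
          for (int kj = 0; kj < 3; kj++)
          {
            acc += weights[c * 9 + ki * 3 + kj]
                 * inptr[(oi + ki) * ld_in_row + (oj + kj) * ld_in_col + c];
          }
        }
        // Fused activation: the fmax/fmin pair in the assembly.
        outptr[oi * ld_out_row + oj * ld_out_col + c] =
            std::min(std::max(acc, act_min), act_max);
      }
    }
  }
}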
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..7c6fb30
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[25];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
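+      // Permute the 25 input-point pointers (the 5x5 patch that feeds a 3x3
+      // output tile through a 3x3 stride-1 kernel) into the order in which
+      // the assembly below consumes them.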
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[4];
+      inptrs[3] = input_ptrs[20];
+      inptrs[4] = input_ptrs[7];
+      inptrs[5] = input_ptrs[24];
+      inptrs[6] = input_ptrs[11];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[3];
+      inptrs[9] = input_ptrs[13];
+      inptrs[10] = input_ptrs[5];
+      inptrs[11] = input_ptrs[9];
+      inptrs[12] = input_ptrs[15];
+      inptrs[13] = input_ptrs[17];
+      inptrs[14] = input_ptrs[19];
+      inptrs[15] = input_ptrs[21];
+      inptrs[16] = input_ptrs[6];
+      inptrs[17] = input_ptrs[8];
+      inptrs[18] = input_ptrs[23];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[18];
+      inptrs[22] = input_ptrs[10];
+      inptrs[23] = input_ptrs[14];
+      inptrs[24] = input_ptrs[22];
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ptrue p3.b\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "cntb x14, ALL, MUL #2\n"
+    "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "mov x13, #0x0\n"
+    "ld1w { z16.s }, p3/Z, [x16]\n"
+    "mov z31.d, z16.d\n"
+    "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+    "cntw x12\n"
+    "mov z30.d, z16.d\n"
+    "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+    "sub x11, XZR, x12\n"
+    "mov z29.d, z16.d\n"
+    "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "mov z28.d, z16.d\n"
+    "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+    "cmp x12, %x[n_channels]\n"
+    "mov z27.d, z16.d\n"
+    "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+    "mov z26.d, z16.d\n"
+    "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+    "mov z25.d, z16.d\n"
+    "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+    "addvl x16, x16, #16\n"
+    "mov z24.d, z16.d\n"
+    "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+    "mov z23.d, z16.d\n"
+    "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+    "addvl x16, x16, #-6\n"
+    "ldp x10, x22, [x15, #0x0]\n"
+    "ldp x9, x28, [x15, #0x10]\n"
+    "ldr x24, [x15, #0x20]\n"
+    "ld1w { z9.s }, p2/Z, [x10, x13, LSL #2]\n"
+    "prfm pldl1keep, [x10, x14]\n"
+    "ld1w { z10.s }, p2/Z, [x22, x13, LSL #2]\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "ld1w { z11.s }, p2/Z, [x9, x13, LSL #2]\n"
+    "prfm pldl1keep, [x9, x14]\n"
+    "ld1w { z12.s }, p2/Z, [x28, x13, LSL #2]\n"
+    "prfm pldl1keep, [x28, x14]\n"
+    "ld1w { z13.s }, p2/Z, [x24, x13, LSL #2]\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "ldr x27, [x15, #0x28]\n"
+    "whilelt p1.s, x12, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z7.s, z9.s\n"
+    "ldr x23, [x15, #0x30]\n"
+    "incw x11\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "ldr x26, [x15, #0x38]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.s, p3/M, z5.s, z9.s\n"
+    "prfm pldl1keep, [x27, x14]\n"
+    "fmla z27.s, p3/M, z4.s, z9.s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmla z26.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x26, x14]\n"
+    "fmla z25.s, p3/M, z2.s, z9.s\n"
+    "ldr x25, [x15, #0x40]\n"
+    "fmla z24.s, p3/M, z1.s, z9.s\n"
+    "ldr x19, [x15, #0x48]\n"
+    "fmla z23.s, p3/M, z0.s, z9.s\n"
+    "ldr x24, [x15, #0x50]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "prfm pldl1keep, [x25, x14]\n"
+    "fmla z29.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z25.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z13.s\n"
+    "ld1w { z10.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z5.s, z13.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z3.s, z13.s\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "fmla z28.s, p3/M, z2.s, z13.s\n"
+    "ldr x23, [x15, #0x58]\n"
+    "fmla z27.s, p3/M, z1.s, z13.s\n"
+    "ldr x22, [x15, #0x60]\n"
+    "fmla z26.s, p3/M, z0.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "fmla z23.s, p3/M, z8.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x25, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z7.s, z11.s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmla z30.s, p3/M, z6.s, z11.s\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "fmla z28.s, p3/M, z4.s, z11.s\n"
+    "ldr x21, [x15, #0x68]\n"
+    "fmla z27.s, p3/M, z3.s, z11.s\n"
+    "ldr x20, [x15, #0x70]\n"
+    "fmla z25.s, p3/M, z1.s, z11.s\n"
+    "ldr x19, [x15, #0x78]\n"
+    "fmla z24.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z1.s, z13.s\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "fmla z30.s, p3/M, z0.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla z27.s, p3/M, z5.s, z10.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z26.s, p3/M, z4.s, z10.s\n"
+    "ldr x10, [x15, #0x80]\n"
+    "fmla z30.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x22, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z7.s, z10.s\n"
+    "ldr x22, [x15, #0x88]\n"
+    "fmla z24.s, p3/M, z2.s, z10.s\n"
+    "prfm pldl1keep, [x10, x14]\n"
+    "fmla z23.s, p3/M, z1.s, z10.s\n"
+    "ldr x9, [x15, #0x90]\n"
+    "fmla z30.s, p3/M, z8.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x21, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "fmla z28.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z5.s, z13.s\n"
+    "prfm pldl1keep, [x9, x14]\n"
+    "fmla z26.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z25.s, p3/M, z3.s, z12.s\n"
+    "ldr x28, [x15, #0x98]\n"
+    "fmla z28.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x10, x13, LSL #2]\n"
+    "fmla z27.s, p3/M, z7.s, z10.s\n"
+    "ldr x24, [x15, #0xa0]\n"
+    "fmla z26.s, p3/M, z6.s, z10.s\n"
+    "prfm pldl1keep, [x28, x14]\n"
+    "fmla z25.s, p3/M, z5.s, z10.s\n"
+    "ldr x27, [x15, #0xa8]\n"
+    "fmla z28.s, p3/M, z8.s, z10.s\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "fmla z24.s, p3/M, z4.s, z10.s\n"
+    "ldr x23, [x15, #0xb0]\n"
+    "fmla z23.s, p3/M, z3.s, z10.s\n"
+    "prfm pldl1keep, [x27, x14]\n"
+    "fmla z26.s, p3/M, z8.s, z11.s\n"
+    "ldr x26, [x15, #0xb8]\n"
+    "fmla z25.s, p3/M, z7.s, z13.s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmla z24.s, p3/M, z6.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x9, x13, LSL #2]\n"
+    "fmla z23.s, p3/M, z5.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x22, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z4.s, z12.s\n"
+    "prfm pldl1keep, [x26, x14]\n"
+    "fmla z30.s, p3/M, z3.s, z12.s\n"
+    "ldr x25, [x15, #0xc0]\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "ldp x10, x22, [x15, #0x0]\n"
+    "fmla z27.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x28, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z11.s\n"
+    "prfm pldl1keep, [x25, x14]\n"
+    "addvl x14, x14, #1\n"
+    "fmla z30.s, p3/M, z5.s, z11.s\n"
+    "ld1w { z9.s }, p1/Z, [x10, x12, LSL #2]\n"
+    "fmla z26.s, p3/M, z1.s, z11.s\n"
+    "prfm pldl1keep, [x10, x14]\n"
+    "fmla z27.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
+    "fmla z24.s, p3/M, z8.s, z13.s\n"
+    "ld1w { z10.s }, p1/Z, [x22, x12, LSL #2]\n"
+    "fmla z23.s, p3/M, z7.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x27, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z7.s, z12.s\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "fmla z27.s, p3/M, z6.s, z12.s\n"
+    "ldp x9, x28, [x15, #0x10]\n"
+    "fmla z25.s, p3/M, z4.s, z12.s\n"
+    "ldr x24, [x15, #0x20]\n"
+    "fmla z24.s, p3/M, z3.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z2.s, z11.s\n"
+    "prfm pldl1keep, [x9, x14]\n"
+    "fmla z30.s, p3/M, z1.s, z11.s\n"
+    "prfm pldl1keep, [x28, x14]\n"
+    "fmla z29.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "fmla z27.s, p3/M, z8.s, z13.s\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "fmla z26.s, p3/M, z7.s, z13.s\n"
+    "ldr x22, [x17, #0x0]\n"
+    "fmla z24.s, p3/M, z5.s, z13.s\n"
+    "ldr x21, [x17, #0x8]\n"
+    "fmla z23.s, p3/M, z4.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
+    "incw x13\n"
+    "fmla z31.s, p3/M, z6.s, z12.s\n"
+    "ldr x20, [x17, #0x10]\n"
+    "whilelt p2.s, x13, %x[n_channels]\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "ldr x19, [x17, #0x18]\n"
+    "fmla z25.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p1/Z, [x28, x12, LSL #2]\n"
+    "fmla z29.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z16.s }, p3/Z, [x16]\n"
+    "fmla z26.s, p3/M, z5.s, z11.s\n"
+    "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+    "fmla z23.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p1/Z, [x9, x12, LSL #2]\n"
+    "fmla z25.s, p3/M, z8.s, z13.s\n"
+    "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+    "fmla z24.s, p3/M, z7.s, z13.s\n"
+    "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+    "fmax z31.s, p3/M, z31.s, z18.s\n"
+    "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+    "fmla z23.s, p3/M, z6.s, z13.s\n"
+    "ld1w { z13.s }, p1/Z, [x24, x12, LSL #2]\n"
+    "incw x12\n"
+    "fmax z30.s, p3/M, z30.s, z18.s\n"
+    "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+    "cmp x12, %x[n_channels]\n"
+    "fmin z31.s, p3/M, z31.s, z17.s\n"
+    "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+    "fmax z29.s, p3/M, z29.s, z18.s\n"
+    "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+    "addvl x16, x16, #16\n"
+    "fmax z28.s, p3/M, z28.s, z18.s\n"
+    "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+    "fmax z27.s, p3/M, z27.s, z18.s\n"
+    "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+    "addvl x16, x16, #-6\n"
+    "fmin z30.s, p3/M, z30.s, z17.s\n"
+    "st1w { z31.s }, p0, [x22, x11, LSL #2]\n"
+    "mov z31.d, z16.d\n"
+    "ldr x22, [x17, #0x20]\n"
+    "fmin z29.s, p3/M, z29.s, z17.s\n"
+    "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
+    "mov z30.d, z16.d\n"
+    "fmin z28.s, p3/M, z28.s, z17.s\n"
+    "st1w { z29.s }, p0, [x20, x11, LSL #2]\n"
+    "mov z29.d, z16.d\n"
+    "ldr x21, [x17, #0x28]\n"
+    "fmin z27.s, p3/M, z27.s, z17.s\n"
+    "ldr x20, [x17, #0x30]\n"
+    "fmax z26.s, p3/M, z26.s, z18.s\n"
+    "st1w { z28.s }, p0, [x19, x11, LSL #2]\n"
+    "mov z28.d, z16.d\n"
+    "ldr x19, [x17, #0x38]\n"
+    "fmax z25.s, p3/M, z25.s, z18.s\n"
+    "st1w { z27.s }, p0, [x22, x11, LSL #2]\n"
+    "mov z27.d, z16.d\n"
+    "ldr x22, [x17, #0x40]\n"
+    "fmin z26.s, p3/M, z26.s, z17.s\n"
+    "st1w { z26.s }, p0, [x21, x11, LSL #2]\n"
+    "mov z26.d, z16.d\n"
+    "fmin z25.s, p3/M, z25.s, z17.s\n"
+    "st1w { z25.s }, p0, [x20, x11, LSL #2]\n"
+    "mov z25.d, z16.d\n"
+    "fmax z24.s, p3/M, z24.s, z18.s\n"
+    "fmax z23.s, p3/M, z23.s, z18.s\n"
+    "fmin z24.s, p3/M, z24.s, z17.s\n"
+    "st1w { z24.s }, p0, [x19, x11, LSL #2]\n"
+    "mov z24.d, z16.d\n"
+    "fmin z23.s, p3/M, z23.s, z17.s\n"
+    "st1w { z23.s }, p0, [x22, x11, LSL #2]\n"
+    "mov z23.d, z16.d\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "ldr x27, [x15, #0x28]\n"
+    "incw x11\n"
+    "fmla z30.s, p3/M, z7.s, z9.s\n"
+    "ldr x23, [x15, #0x30]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "ldr x26, [x15, #0x38]\n"
+    "fmla z28.s, p3/M, z5.s, z9.s\n"
+    "prfm pldl1keep, [x27, x14]\n"
+    "fmla z27.s, p3/M, z4.s, z9.s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmla z26.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x26, x14]\n"
+    "fmla z25.s, p3/M, z2.s, z9.s\n"
+    "ldr x25, [x15, #0x40]\n"
+    "fmla z24.s, p3/M, z1.s, z9.s\n"
+    "ldr x19, [x15, #0x48]\n"
+    "fmla z23.s, p3/M, z0.s, z9.s\n"
+    "ldr x24, [x15, #0x50]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "prfm pldl1keep, [x25, x14]\n"
+    "fmla z29.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z25.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z13.s\n"
+    "ld1w { z10.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z5.s, z13.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z3.s, z13.s\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "fmla z28.s, p3/M, z2.s, z13.s\n"
+    "ldr x23, [x15, #0x58]\n"
+    "fmla z27.s, p3/M, z1.s, z13.s\n"
+    "ldr x22, [x15, #0x60]\n"
+    "fmla z26.s, p3/M, z0.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "fmla z23.s, p3/M, z8.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x25, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z7.s, z11.s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmla z30.s, p3/M, z6.s, z11.s\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "fmla z28.s, p3/M, z4.s, z11.s\n"
+    "ldr x21, [x15, #0x68]\n"
+    "fmla z27.s, p3/M, z3.s, z11.s\n"
+    "ldr x20, [x15, #0x70]\n"
+    "fmla z25.s, p3/M, z1.s, z11.s\n"
+    "ldr x19, [x15, #0x78]\n"
+    "fmla z24.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z1.s, z13.s\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "fmla z30.s, p3/M, z0.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla z27.s, p3/M, z5.s, z10.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z26.s, p3/M, z4.s, z10.s\n"
+    "ldr x10, [x15, #0x80]\n"
+    "fmla z30.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x22, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z7.s, z10.s\n"
+    "ldr x22, [x15, #0x88]\n"
+    "fmla z24.s, p3/M, z2.s, z10.s\n"
+    "prfm pldl1keep, [x10, x14]\n"
+    "fmla z23.s, p3/M, z1.s, z10.s\n"
+    "ldr x9, [x15, #0x90]\n"
+    "fmla z30.s, p3/M, z8.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x21, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "fmla z28.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z5.s, z13.s\n"
+    "prfm pldl1keep, [x9, x14]\n"
+    "fmla z26.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z25.s, p3/M, z3.s, z12.s\n"
+    "ldr x28, [x15, #0x98]\n"
+    "fmla z28.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x10, x13, LSL #2]\n"
+    "fmla z27.s, p3/M, z7.s, z10.s\n"
+    "ldr x24, [x15, #0xa0]\n"
+    "fmla z26.s, p3/M, z6.s, z10.s\n"
+    "prfm pldl1keep, [x28, x14]\n"
+    "fmla z25.s, p3/M, z5.s, z10.s\n"
+    "ldr x27, [x15, #0xa8]\n"
+    "fmla z28.s, p3/M, z8.s, z10.s\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "fmla z24.s, p3/M, z4.s, z10.s\n"
+    "ldr x23, [x15, #0xb0]\n"
+    "fmla z23.s, p3/M, z3.s, z10.s\n"
+    "prfm pldl1keep, [x27, x14]\n"
+    "fmla z26.s, p3/M, z8.s, z11.s\n"
+    "ldr x26, [x15, #0xb8]\n"
+    "fmla z25.s, p3/M, z7.s, z13.s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmla z24.s, p3/M, z6.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x9, x13, LSL #2]\n"
+    "fmla z23.s, p3/M, z5.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x22, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z4.s, z12.s\n"
+    "prfm pldl1keep, [x26, x14]\n"
+    "fmla z30.s, p3/M, z3.s, z12.s\n"
+    "ldr x25, [x15, #0xc0]\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "ldr x22, [x17, #0x0]\n"
+    "fmla z27.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x28, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z11.s\n"
+    "prfm pldl1keep, [x25, x14]\n"
+    "fmla z30.s, p3/M, z5.s, z11.s\n"
+    "ldr x21, [x17, #0x8]\n"
+    "fmla z26.s, p3/M, z1.s, z11.s\n"
+    "ldr x20, [x17, #0x10]\n"
+    "fmla z27.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n"
+    "fmla z24.s, p3/M, z8.s, z13.s\n"
+    "ldr x19, [x17, #0x18]\n"
+    "fmla z23.s, p3/M, z7.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x27, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z7.s, z12.s\n"
+    "fmla z27.s, p3/M, z6.s, z12.s\n"
+    "fmla z25.s, p3/M, z4.s, z12.s\n"
+    "fmla z24.s, p3/M, z3.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z2.s, z11.s\n"
+    "fmla z30.s, p3/M, z1.s, z11.s\n"
+    "fmla z29.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "fmla z27.s, p3/M, z8.s, z13.s\n"
+    "fmla z26.s, p3/M, z7.s, z13.s\n"
+    "fmla z24.s, p3/M, z5.s, z13.s\n"
+    "fmla z23.s, p3/M, z4.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z6.s, z12.s\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "fmla z25.s, p3/M, z0.s, z12.s\n"
+    "fmla z29.s, p3/M, z8.s, z11.s\n"
+    "fmla z26.s, p3/M, z5.s, z11.s\n"
+    "fmla z23.s, p3/M, z2.s, z11.s\n"
+    "fmla z25.s, p3/M, z8.s, z13.s\n"
+    "fmla z24.s, p3/M, z7.s, z13.s\n"
+    "fmax z31.s, p3/M, z31.s, z18.s\n"
+    "fmla z23.s, p3/M, z6.s, z13.s\n"
+    "fmax z30.s, p3/M, z30.s, z18.s\n"
+    "fmax z29.s, p3/M, z29.s, z18.s\n"
+    "fmin z31.s, p3/M, z31.s, z17.s\n"
+    "st1w { z31.s }, p0, [x22, x11, LSL #2]\n"
+    "fmin z30.s, p3/M, z30.s, z17.s\n"
+    "fmin z29.s, p3/M, z29.s, z17.s\n"
+    "ldr x22, [x17, #0x20]\n"
+    "fmax z28.s, p3/M, z28.s, z18.s\n"
+    "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
+    "fmax z27.s, p3/M, z27.s, z18.s\n"
+    "fmax z26.s, p3/M, z26.s, z18.s\n"
+    "st1w { z29.s }, p0, [x20, x11, LSL #2]\n"
+    "fmin z28.s, p3/M, z28.s, z17.s\n"
+    "ldr x21, [x17, #0x28]\n"
+    "fmax z25.s, p3/M, z25.s, z18.s\n"
+    "ldr x20, [x17, #0x30]\n"
+    "fmax z24.s, p3/M, z24.s, z18.s\n"
+    "st1w { z28.s }, p0, [x19, x11, LSL #2]\n"
+    "fmin z27.s, p3/M, z27.s, z17.s\n"
+    "fmin z26.s, p3/M, z26.s, z17.s\n"
+    "ldr x19, [x17, #0x38]\n"
+    "fmin z25.s, p3/M, z25.s, z17.s\n"
+    "st1w { z27.s }, p0, [x22, x11, LSL #2]\n"
+    "fmin z24.s, p3/M, z24.s, z17.s\n"
+    "fmax z23.s, p3/M, z23.s, z18.s\n"
+    "st1w { z26.s }, p0, [x21, x11, LSL #2]\n"
+    "st1w { z25.s }, p0, [x20, x11, LSL #2]\n"
+    "fmin z23.s, p3/M, z23.s, z17.s\n"
+    "st1w { z24.s }, p0, [x19, x11, LSL #2]\n"
+    "ldr x22, [x17, #0x40]\n"
+    "st1w { z23.s }, p0, [x22, x11, LSL #2]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
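
The indirect variant above computes the same 3x3 tile but takes its inputs as 25 per-point pointers (the 5x5 patch, assumed row-major) and its outputs as 9 pointers, so the caller can alias out-of-image points to a zeroed buffer instead of encoding padding in the strides. A scalar sketch under the same assumed flat weight layout:

#include <algorithm>

// Scalar model of the indirect kernel above (illustrative, not library code).
static void tile_ref_3x3_s1_out3x3_indirect(
  const float *const *inptrs,   // 25 pointers: 5x5 input patch, row-major
  float *const *outptrs,        // 9 pointers: 3x3 output tile, row-major
  const float *bias, const float *weights,   // 9 weights per channel (assumed)
  unsigned int n_channels, float act_min, float act_max)
{
  for (unsigned int c = 0; c < n_channels; c++)
  {
    for (int oi = 0; oi < 3; oi++)
    {
      for (int oj = 0; oj < 3; oj++)
      {
        float acc = bias[c];
        for (int ki = 0; ki < 3; ki++)
        {
          for (int kj = 0; kj < 3; kj++)
          {
            acc += weights[c * 9 + ki * 3 + kj] * inptrs[(oi + ki) * 5 + (oj + kj)][c];
          }
        }
        outptrs[oi * 3 + oj][c] = std::min(std::max(acc, act_min), act_max);
      }
    }
  }
}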
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000..a9823e3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  indirect_kern_type indirect_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
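
The patch-size constants in the strategy struct above follow the usual relation input = stride * (output - 1) + kernel in each dimension. A compile-time restatement of that relation for this 4x4-output, 3x3-kernel, stride-1 strategy (a sanity check, not library code):

static_assert(1 * (4 - 1) + 3 == 6, "input_rows == stride_rows * (output_rows - 1) + kernel_rows");
static_assert(1 * (4 - 1) + 3 == 6, "input_cols == stride_cols * (output_cols - 1) + kernel_cols");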
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..4c24ad9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
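+  // As in the 3x3-output variant, but with sixteen accumulators (z16-z31)
+  // for the 4x4 output tile; z13 carries the bias and z15/z14 the broadcast
+  // min/max clamps.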
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x2, #0x0\n"
+    "mov x3, #0x0\n"
+    "1:"  // Tile loop
+    "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x24, #0x4\n"
+    "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x23, #0x4\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_params]]\n"
+    "mov x5, #0x0\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "cntw x6\n"
+    "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "sub x21, XZR, x6\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x2, x22\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x3, x7, x19\n" // offset += tile_j * ld_input_col
+    "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x24\n" // offset *= kernel_stride * output_size
+    "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x8, x8, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "add x15, x8, x22, LSL #2\n"
+    "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x14, x15, x22, LSL #2\n"
+    "ld1w { z13.s }, p3/Z, [x4]\n"
+    "mov z31.d, z13.d\n"
+    "ld1w { z0.s }, p3/Z, [x4, #1, MUL VL]\n"
+    "add x13, x14, x22, LSL #2\n"
+    "mov z30.d, z13.d\n"
+    "ld1w { z1.s }, p3/Z, [x4, #2, MUL VL]\n"
+    "add x12, x13, x22, LSL #2\n"
+    "mov z29.d, z13.d\n"
+    "ld1w { z2.s }, p3/Z, [x4, #3, MUL VL]\n"
+    "add x11, x12, x22, LSL #2\n"
+    "mov z28.d, z13.d\n"
+    "ld1w { z3.s }, p3/Z, [x4, #4, MUL VL]\n"
+    "add x10, x7, x7\n"
+    "mov z27.d, z13.d\n"
+    "ld1w { z4.s }, p3/Z, [x4, #5, MUL VL]\n"
+    "add x9, x10, x7\n"
+    "mov z26.d, z13.d\n"
+    "ld1w { z5.s }, p3/Z, [x4, #6, MUL VL]\n"
+    "add x28, x9, x7\n"
+    "mov z25.d, z13.d\n"
+    "ld1w { z6.s }, p3/Z, [x4, #7, MUL VL]\n"
+    "add x27, x28, x7\n"
+    "mov z24.d, z13.d\n"
+    "mul x19, x2, x20\n" // offset = tile_i * ld_output_row
+    "mov z23.d, z13.d\n"
+    "madd x19, x3, x17, x19\n" // offset += tile_j * ld_output_col
+    "mov z22.d, z13.d\n"
+    "mul x19, x19, x23\n" // offset *= output_tile_size
+    "mov z21.d, z13.d\n"
+    "add x16, x16, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "mov z20.d, z13.d\n"
+    "add x26, x16, x20, LSL #2\n"
+    "mov z19.d, z13.d\n"
+    "add x25, x26, x20, LSL #2\n"
+    "mov z18.d, z13.d\n"
+    "add x24, x25, x20, LSL #2\n"
+    "mov z17.d, z13.d\n"
+    "add x23, x17, x17\n"
+    "mov z16.d, z13.d\n"
+    "add x22, x23, x17\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "ld1w { z9.s }, p2/Z, [x14, x10, LSL #2]\n"
+    "ld1w { z10.s }, p2/Z, [x8]\n"
+    "addvl x4, x4, #16\n"
+    "ld1w { z11.s }, p2/Z, [x8, x27, LSL #2]\n"
+    "cmp x6, %x[n_channels]\n"
+    "ld1w { z7.s }, p3/Z, [x4, #-8, MUL VL]\n"
+    "ld1w { z8.s }, p3/Z, [x4, #-7, MUL VL]\n"
+    "addvl x4, x4, #-6\n"
+    "ld1w { z12.s }, p2/Z, [x14, x9, LSL #2]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "ld1w { z13.s }, p3/Z, [x4]\n"
+    "whilelt p1.s, x6, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z7.s, z9.s\n"
+    "incw x21\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "mov p0.b, p2.b\n"
+    "fmla z27.s, p3/M, z5.s, z9.s\n"
+    "incw x5\n"
+    "fmla z26.s, p3/M, z4.s, z9.s\n"
+    "incw x6\n"
+    "fmla z25.s, p3/M, z3.s, z9.s\n"
+    "fmla z23.s, p3/M, z2.s, z9.s\n"
+    "fmla z22.s, p3/M, z1.s, z9.s\n"
+    "fmla z21.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x11]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
+    "fmla z30.s, p3/M, z8.s, z12.s\n"
+    "fmla z29.s, p3/M, z7.s, z12.s\n"
+    "fmla z26.s, p3/M, z5.s, z12.s\n"
+    "fmla z28.s, p3/M, z6.s, z12.s\n"
+    "fmla z25.s, p3/M, z4.s, z12.s\n"
+    "fmla z24.s, p3/M, z3.s, z12.s\n"
+    "fmla z22.s, p3/M, z2.s, z12.s\n"
+    "fmla z21.s, p3/M, z1.s, z12.s\n"
+    "fmla z20.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x8, x7, LSL #2]\n"
+    "fmla z19.s, p3/M, z6.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n"
+    "fmla z16.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x8, x28, LSL #2]\n"
+    "fmla z27.s, p3/M, z8.s, z9.s\n"
+    "fmla z26.s, p3/M, z7.s, z9.s\n"
+    "fmla z25.s, p3/M, z6.s, z9.s\n"
+    "fmla z23.s, p3/M, z5.s, z9.s\n"
+    "fmla z22.s, p3/M, z4.s, z9.s\n"
+    "fmla z21.s, p3/M, z3.s, z9.s\n"
+    "fmla z19.s, p3/M, z2.s, z9.s\n"
+    "fmla z18.s, p3/M, z1.s, z9.s\n"
+    "fmla z17.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x15]\n"
+    "fmla z31.s, p3/M, z1.s, z12.s\n"
+    "fmla z30.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x15, x27, LSL #2]\n"
+    "fmla z29.s, p3/M, z2.s, z11.s\n"
+    "fmla z28.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12]\n"
+    "fmla z26.s, p3/M, z8.s, z10.s\n"
+    "fmla z25.s, p3/M, z7.s, z10.s\n"
+    "fmla z24.s, p3/M, z6.s, z10.s\n"
+    "fmla z22.s, p3/M, z5.s, z10.s\n"
+    "fmla z21.s, p3/M, z4.s, z10.s\n"
+    "fmla z20.s, p3/M, z3.s, z10.s\n"
+    "fmla z18.s, p3/M, z2.s, z10.s\n"
+    "fmla z17.s, p3/M, z1.s, z10.s\n"
+    "fmla z16.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n"
+    "fmla z31.s, p3/M, z3.s, z9.s\n"
+    "fmla z27.s, p3/M, z0.s, z9.s\n"
+    "fmla z28.s, p3/M, z5.s, z12.s\n"
+    "fmla z24.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
+    "fmla z23.s, p3/M, z6.s, z11.s\n"
+    "fmla z19.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+    "fmla z31.s, p3/M, z5.s, z10.s\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "fmla z29.s, p3/M, z3.s, z10.s\n"
+    "fmla z27.s, p3/M, z2.s, z10.s\n"
+    "fmla z26.s, p3/M, z1.s, z10.s\n"
+    "fmla z25.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x14, x7, LSL #2]\n"
+    "fmla z20.s, p3/M, z8.s, z11.s\n"
+    "fmla z16.s, p3/M, z5.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x11, x7, LSL #2]\n"
+    "fmla z30.s, p3/M, z5.s, z12.s\n"
+    "fmla z29.s, p3/M, z4.s, z12.s\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "fmla z26.s, p3/M, z2.s, z12.s\n"
+    "fmla z25.s, p3/M, z1.s, z12.s\n"
+    "fmla z24.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x14, x28, LSL #2]\n"
+    "fmla z19.s, p3/M, z7.s, z11.s\n"
+    "fmla z18.s, p3/M, z6.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x11, x28, LSL #2]\n"
+    "fmla z31.s, p3/M, z7.s, z10.s\n"
+    "fmla z30.s, p3/M, z6.s, z10.s\n"
+    "fmla z27.s, p3/M, z4.s, z10.s\n"
+    "fmla z26.s, p3/M, z3.s, z10.s\n"
+    "fmla z23.s, p3/M, z1.s, z10.s\n"
+    "fmla z22.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x8, x10, LSL #2]\n"
+    "fmla z17.s, p3/M, z8.s, z11.s\n"
+    "fmla z16.s, p3/M, z7.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x13, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z8.s, z12.s\n"
+    "fmla z28.s, p3/M, z7.s, z12.s\n"
+    "fmla z25.s, p3/M, z5.s, z12.s\n"
+    "fmla z24.s, p3/M, z4.s, z12.s\n"
+    "fmla z21.s, p3/M, z2.s, z12.s\n"
+    "fmla z20.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x8, x9, LSL #2]\n"
+    "addvl x8, x8, #1\n"
+    "fmla z31.s, p3/M, z2.s, z10.s\n"
+    "fmla z30.s, p3/M, z1.s, z10.s\n"
+    "fmla z29.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x14]\n"
+    "fmla z27.s, p3/M, z7.s, z11.s\n"
+    "fmla z26.s, p3/M, z6.s, z11.s\n"
+    "fmla z23.s, p3/M, z4.s, z11.s\n"
+    "fmla z22.s, p3/M, z3.s, z11.s\n"
+    "fmla z19.s, p3/M, z1.s, z11.s\n"
+    "fmla z18.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z12.s\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "fmla z28.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n"
+    "addvl x14, x14, #1\n"
+    "fmla z31.s, p3/M, z6.s, z10.s\n"
+    "ld1w { z9.s }, p1/Z, [x14, x10, LSL #2]\n"
+    "fmla z27.s, p3/M, z3.s, z10.s\n"
+    "fmla z23.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x13]\n"
+    "fmla z25.s, p3/M, z8.s, z11.s\n"
+    "fmla z24.s, p3/M, z7.s, z11.s\n"
+    "fmla z21.s, p3/M, z5.s, z11.s\n"
+    "fmla z20.s, p3/M, z4.s, z11.s\n"
+    "fmla z17.s, p3/M, z2.s, z11.s\n"
+    "fmla z16.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n"
+    "fmla z28.s, p3/M, z8.s, z12.s\n"
+    "fmla z24.s, p3/M, z5.s, z12.s\n"
+    "fmla z20.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
+    "addvl x13, x13, #1\n"
+    "fmla z27.s, p3/M, z6.s, z10.s\n"
+    "fmla z23.s, p3/M, z3.s, z10.s\n"
+    "fmla z19.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x11, x10, LSL #2]\n"
+    "fmla z22.s, p3/M, z7.s, z11.s\n"
+    "fmla z21.s, p3/M, z6.s, z11.s\n"
+    "fmla z23.s, p3/M, z8.s, z11.s\n"
+    "fmla z19.s, p3/M, z5.s, z11.s\n"
+    "fmla z18.s, p3/M, z4.s, z11.s\n"
+    "fmla z17.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12, x9, LSL #2]\n"
+    "fmla z24.s, p3/M, z8.s, z12.s\n"
+    "fmla z20.s, p3/M, z5.s, z12.s\n"
+    "fmla z16.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n"
+    "addvl x11, x11, #1\n"
+    "fmla z19.s, p3/M, z8.s, z10.s\n"
+    "fmla z18.s, p3/M, z7.s, z10.s\n"
+    "fmla z17.s, p3/M, z6.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x15, x7, LSL #2]\n"
+    "fmla z22.s, p3/M, z8.s, z11.s\n"
+    "fmla z21.s, p3/M, z7.s, z11.s\n"
+    "fmla z20.s, p3/M, z6.s, z11.s\n"
+    "fmla z18.s, p3/M, z5.s, z11.s\n"
+    "fmla z17.s, p3/M, z4.s, z11.s\n"
+    "fmla z16.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x15, x28, LSL #2]\n"
+    "addvl x15, x15, #1\n"
+    "fmla z18.s, p3/M, z8.s, z12.s\n"
+    "fmla z31.s, p3/M, z4.s, z10.s\n"
+    "fmla z17.s, p3/M, z7.s, z12.s\n"
+    "fmla z16.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x12, x7, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z10.s\n"
+    "fmla z27.s, p3/M, z1.s, z10.s\n"
+    "fmla z26.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n"
+    "whilelt p2.s, x5, %x[n_channels]\n"
+    "fmla z29.s, p3/M, z5.s, z11.s\n"
+    "ld1w { z0.s }, p3/Z, [x4, #1, MUL VL]\n"
+    "addvl x12, x12, #1\n"
+    "fmla z28.s, p3/M, z4.s, z11.s\n"
+    "cmp x6, %x[n_channels]\n"
+    "fmla z25.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z2.s }, p3/Z, [x4, #3, MUL VL]\n"
+    "fmla z24.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p1/Z, [x8, x27, LSL #2]\n"
+    "fmla z23.s, p3/M, z7.s, z12.s\n"
+    "ld1w { z1.s }, p3/Z, [x4, #2, MUL VL]\n"
+    "fmla z22.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z6.s }, p3/Z, [x4, #7, MUL VL]\n"
+    "fmla z19.s, p3/M, z4.s, z12.s\n"
+    "fmla z18.s, p3/M, z3.s, z12.s\n"
+    "ld1w { z12.s }, p1/Z, [x14, x9, LSL #2]\n"
+    "fmla z21.s, p3/M, z8.s, z10.s\n"
+    "ld1w { z3.s }, p3/Z, [x4, #4, MUL VL]\n"
+    "fmla z20.s, p3/M, z7.s, z10.s\n"
+    "fmla z17.s, p3/M, z5.s, z10.s\n"
+    "ld1w { z5.s }, p3/Z, [x4, #6, MUL VL]\n"
+    "fmla z16.s, p3/M, z4.s, z10.s\n"
+    "ld1w { z10.s }, p1/Z, [x8]\n"
+    "fmax z31.s, p3/M, z31.s, z15.s\n"
+    "ld1w { z4.s }, p3/Z, [x4, #5, MUL VL]\n"
+    "addvl x4, x4, #16\n"
+    "fmax z30.s, p3/M, z30.s, z15.s\n"
+    "ld1w { z7.s }, p3/Z, [x4, #-8, MUL VL]\n"
+    "fmax z29.s, p3/M, z29.s, z15.s\n"
+    "ld1w { z8.s }, p3/Z, [x4, #-7, MUL VL]\n"
+    "addvl x4, x4, #-6\n"
+    "fmin z31.s, p3/M, z31.s, z14.s\n"
+    "st1w { z31.s }, p0, [x16]\n"
+    "mov z31.d, z13.d\n"
+    "fmin z30.s, p3/M, z30.s, z14.s\n"
+    "st1w { z30.s }, p0, [x16, x17, LSL #2]\n"
+    "mov z30.d, z13.d\n"
+    "fmin z29.s, p3/M, z29.s, z14.s\n"
+    "st1w { z29.s }, p0, [x16, x23, LSL #2]\n"
+    "mov z29.d, z13.d\n"
+    "fmax z28.s, p3/M, z28.s, z15.s\n"
+    "fmax z27.s, p3/M, z27.s, z15.s\n"
+    "fmax z26.s, p3/M, z26.s, z15.s\n"
+    "fmax z25.s, p3/M, z25.s, z15.s\n"
+    "fmin z28.s, p3/M, z28.s, z14.s\n"
+    "st1w { z28.s }, p0, [x16, x22, LSL #2]\n"
+    "mov z28.d, z13.d\n"
+    "addvl x16, x16, #1\n"
+    "fmin z27.s, p3/M, z27.s, z14.s\n"
+    "st1w { z27.s }, p0, [x26]\n"
+    "mov z27.d, z13.d\n"
+    "fmin z26.s, p3/M, z26.s, z14.s\n"
+    "st1w { z26.s }, p0, [x26, x17, LSL #2]\n"
+    "mov z26.d, z13.d\n"
+    "fmin z25.s, p3/M, z25.s, z14.s\n"
+    "st1w { z25.s }, p0, [x26, x23, LSL #2]\n"
+    "mov z25.d, z13.d\n"
+    "fmax z24.s, p3/M, z24.s, z15.s\n"
+    "fmax z23.s, p3/M, z23.s, z15.s\n"
+    "fmax z22.s, p3/M, z22.s, z15.s\n"
+    "fmax z21.s, p3/M, z21.s, z15.s\n"
+    "fmin z24.s, p3/M, z24.s, z14.s\n"
+    "st1w { z24.s }, p0, [x26, x22, LSL #2]\n"
+    "mov z24.d, z13.d\n"
+    "addvl x26, x26, #1\n"
+    "fmin z23.s, p3/M, z23.s, z14.s\n"
+    "st1w { z23.s }, p0, [x25]\n"
+    "mov z23.d, z13.d\n"
+    "fmin z22.s, p3/M, z22.s, z14.s\n"
+    "st1w { z22.s }, p0, [x25, x17, LSL #2]\n"
+    "mov z22.d, z13.d\n"
+    "fmin z21.s, p3/M, z21.s, z14.s\n"
+    "st1w { z21.s }, p0, [x25, x23, LSL #2]\n"
+    "mov z21.d, z13.d\n"
+    "fmax z20.s, p3/M, z20.s, z15.s\n"
+    "fmax z19.s, p3/M, z19.s, z15.s\n"
+    "fmax z18.s, p3/M, z18.s, z15.s\n"
+    "fmax z17.s, p3/M, z17.s, z15.s\n"
+    "fmin z20.s, p3/M, z20.s, z14.s\n"
+    "st1w { z20.s }, p0, [x25, x22, LSL #2]\n"
+    "mov z20.d, z13.d\n"
+    "addvl x25, x25, #1\n"
+    "fmin z19.s, p3/M, z19.s, z14.s\n"
+    "st1w { z19.s }, p0, [x24]\n"
+    "mov z19.d, z13.d\n"
+    "fmin z18.s, p3/M, z18.s, z14.s\n"
+    "st1w { z18.s }, p0, [x24, x17, LSL #2]\n"
+    "mov z18.d, z13.d\n"
+    "fmin z17.s, p3/M, z17.s, z14.s\n"
+    "st1w { z17.s }, p0, [x24, x23, LSL #2]\n"
+    "mov z17.d, z13.d\n"
+    "fmax z16.s, p3/M, z16.s, z15.s\n"
+    "fmin z16.s, p3/M, z16.s, z14.s\n"
+    "st1w { z16.s }, p0, [x24, x22, LSL #2]\n"
+    "mov z16.d, z13.d\n"
+    "addvl x24, x24, #1\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z30.s, p3/M, z7.s, z9.s\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "add x21, x2, #0x1\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z27.s, p3/M, z5.s, z9.s\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "add x3, x3, #0x1\n"
+    "fmla z26.s, p3/M, z4.s, z9.s\n"
+    "cmp x3, x19\n"
+    "fmla z25.s, p3/M, z3.s, z9.s\n"
+    "fmla z23.s, p3/M, z2.s, z9.s\n"
+    "csel x3, x3, XZR, LT\n"
+    "fmla z22.s, p3/M, z1.s, z9.s\n"
+    "csel x2, x2, x21, LT\n"
+    "fmla z21.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n"
+    "cmp x2, x20\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x11]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
+    "fmla z30.s, p3/M, z8.s, z12.s\n"
+    "fmla z29.s, p3/M, z7.s, z12.s\n"
+    "fmla z26.s, p3/M, z5.s, z12.s\n"
+    "fmla z28.s, p3/M, z6.s, z12.s\n"
+    "fmla z25.s, p3/M, z4.s, z12.s\n"
+    "fmla z24.s, p3/M, z3.s, z12.s\n"
+    "fmla z22.s, p3/M, z2.s, z12.s\n"
+    "fmla z21.s, p3/M, z1.s, z12.s\n"
+    "fmla z20.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x8, x7, LSL #2]\n"
+    "fmla z19.s, p3/M, z6.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n"
+    "fmla z16.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x8, x28, LSL #2]\n"
+    "fmla z27.s, p3/M, z8.s, z9.s\n"
+    "fmla z26.s, p3/M, z7.s, z9.s\n"
+    "fmla z25.s, p3/M, z6.s, z9.s\n"
+    "fmla z23.s, p3/M, z5.s, z9.s\n"
+    "fmla z22.s, p3/M, z4.s, z9.s\n"
+    "fmla z21.s, p3/M, z3.s, z9.s\n"
+    "fmla z19.s, p3/M, z2.s, z9.s\n"
+    "fmla z18.s, p3/M, z1.s, z9.s\n"
+    "fmla z17.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x15]\n"
+    "fmla z31.s, p3/M, z1.s, z12.s\n"
+    "fmla z30.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x15, x27, LSL #2]\n"
+    "fmla z29.s, p3/M, z2.s, z11.s\n"
+    "fmla z28.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12]\n"
+    "fmla z26.s, p3/M, z8.s, z10.s\n"
+    "fmla z25.s, p3/M, z7.s, z10.s\n"
+    "fmla z24.s, p3/M, z6.s, z10.s\n"
+    "fmla z22.s, p3/M, z5.s, z10.s\n"
+    "fmla z21.s, p3/M, z4.s, z10.s\n"
+    "fmla z20.s, p3/M, z3.s, z10.s\n"
+    "fmla z18.s, p3/M, z2.s, z10.s\n"
+    "fmla z17.s, p3/M, z1.s, z10.s\n"
+    "fmla z16.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n"
+    "fmla z31.s, p3/M, z3.s, z9.s\n"
+    "fmla z27.s, p3/M, z0.s, z9.s\n"
+    "fmla z28.s, p3/M, z5.s, z12.s\n"
+    "fmla z24.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n"
+    "fmla z23.s, p3/M, z6.s, z11.s\n"
+    "fmla z19.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+    "fmla z31.s, p3/M, z5.s, z10.s\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "fmla z29.s, p3/M, z3.s, z10.s\n"
+    "fmla z27.s, p3/M, z2.s, z10.s\n"
+    "fmla z26.s, p3/M, z1.s, z10.s\n"
+    "fmla z25.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x14, x7, LSL #2]\n"
+    "fmla z20.s, p3/M, z8.s, z11.s\n"
+    "fmla z16.s, p3/M, z5.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x11, x7, LSL #2]\n"
+    "fmla z30.s, p3/M, z5.s, z12.s\n"
+    "fmla z29.s, p3/M, z4.s, z12.s\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "fmla z26.s, p3/M, z2.s, z12.s\n"
+    "fmla z25.s, p3/M, z1.s, z12.s\n"
+    "fmla z24.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x14, x28, LSL #2]\n"
+    "fmla z19.s, p3/M, z7.s, z11.s\n"
+    "fmla z18.s, p3/M, z6.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x11, x28, LSL #2]\n"
+    "fmla z31.s, p3/M, z7.s, z10.s\n"
+    "fmla z30.s, p3/M, z6.s, z10.s\n"
+    "fmla z27.s, p3/M, z4.s, z10.s\n"
+    "fmla z26.s, p3/M, z3.s, z10.s\n"
+    "fmla z23.s, p3/M, z1.s, z10.s\n"
+    "fmla z22.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x8, x10, LSL #2]\n"
+    "fmla z17.s, p3/M, z8.s, z11.s\n"
+    "fmla z16.s, p3/M, z7.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x13, x7, LSL #2]\n"
+    "fmla z29.s, p3/M, z8.s, z12.s\n"
+    "fmla z28.s, p3/M, z7.s, z12.s\n"
+    "fmla z25.s, p3/M, z5.s, z12.s\n"
+    "fmla z24.s, p3/M, z4.s, z12.s\n"
+    "fmla z21.s, p3/M, z2.s, z12.s\n"
+    "fmla z20.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x8, x9, LSL #2]\n"
+    "fmla z31.s, p3/M, z2.s, z10.s\n"
+    "fmla z30.s, p3/M, z1.s, z10.s\n"
+    "fmla z29.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x14]\n"
+    "fmla z27.s, p3/M, z7.s, z11.s\n"
+    "fmla z26.s, p3/M, z6.s, z11.s\n"
+    "fmla z23.s, p3/M, z4.s, z11.s\n"
+    "fmla z22.s, p3/M, z3.s, z11.s\n"
+    "fmla z19.s, p3/M, z1.s, z11.s\n"
+    "fmla z18.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z12.s\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "fmla z28.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n"
+    "fmla z31.s, p3/M, z6.s, z10.s\n"
+    "fmla z27.s, p3/M, z3.s, z10.s\n"
+    "fmla z23.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x13]\n"
+    "fmla z25.s, p3/M, z8.s, z11.s\n"
+    "fmla z24.s, p3/M, z7.s, z11.s\n"
+    "fmla z21.s, p3/M, z5.s, z11.s\n"
+    "fmla z20.s, p3/M, z4.s, z11.s\n"
+    "fmla z17.s, p3/M, z2.s, z11.s\n"
+    "fmla z16.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n"
+    "fmla z28.s, p3/M, z8.s, z12.s\n"
+    "fmla z24.s, p3/M, z5.s, z12.s\n"
+    "fmla z20.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
+    "fmla z27.s, p3/M, z6.s, z10.s\n"
+    "fmla z23.s, p3/M, z3.s, z10.s\n"
+    "fmla z19.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x11, x10, LSL #2]\n"
+    "fmla z22.s, p3/M, z7.s, z11.s\n"
+    "fmla z21.s, p3/M, z6.s, z11.s\n"
+    "fmla z23.s, p3/M, z8.s, z11.s\n"
+    "fmla z19.s, p3/M, z5.s, z11.s\n"
+    "fmla z18.s, p3/M, z4.s, z11.s\n"
+    "fmla z17.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12, x9, LSL #2]\n"
+    "fmla z24.s, p3/M, z8.s, z12.s\n"
+    "fmla z20.s, p3/M, z5.s, z12.s\n"
+    "fmla z16.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n"
+    "fmla z19.s, p3/M, z8.s, z10.s\n"
+    "fmla z18.s, p3/M, z7.s, z10.s\n"
+    "fmla z17.s, p3/M, z6.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x15, x7, LSL #2]\n"
+    "fmla z22.s, p3/M, z8.s, z11.s\n"
+    "fmla z21.s, p3/M, z7.s, z11.s\n"
+    "fmla z20.s, p3/M, z6.s, z11.s\n"
+    "fmla z18.s, p3/M, z5.s, z11.s\n"
+    "fmla z17.s, p3/M, z4.s, z11.s\n"
+    "fmla z16.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x15, x28, LSL #2]\n"
+    "fmla z31.s, p3/M, z4.s, z10.s\n"
+    "fmla z18.s, p3/M, z8.s, z12.s\n"
+    "fmla z17.s, p3/M, z7.s, z12.s\n"
+    "fmla z16.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x12, x7, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z10.s\n"
+    "fmla z27.s, p3/M, z1.s, z10.s\n"
+    "fmla z26.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n"
+    "fmla z29.s, p3/M, z5.s, z11.s\n"
+    "fmla z28.s, p3/M, z4.s, z11.s\n"
+    "fmla z25.s, p3/M, z2.s, z11.s\n"
+    "fmla z24.s, p3/M, z1.s, z11.s\n"
+    "fmla z23.s, p3/M, z7.s, z12.s\n"
+    "fmla z22.s, p3/M, z6.s, z12.s\n"
+    "fmla z19.s, p3/M, z4.s, z12.s\n"
+    "fmla z18.s, p3/M, z3.s, z12.s\n"
+    "fmla z21.s, p3/M, z8.s, z10.s\n"
+    "fmla z20.s, p3/M, z7.s, z10.s\n"
+    "fmla z17.s, p3/M, z5.s, z10.s\n"
+    "fmla z16.s, p3/M, z4.s, z10.s\n"
+    "fmax z31.s, p3/M, z31.s, z15.s\n"
+    "fmax z30.s, p3/M, z30.s, z15.s\n"
+    "fmax z29.s, p3/M, z29.s, z15.s\n"
+    "fmax z28.s, p3/M, z28.s, z15.s\n"
+    "fmin z31.s, p3/M, z31.s, z14.s\n"
+    "st1w { z31.s }, p0, [x16]\n"
+    "fmin z30.s, p3/M, z30.s, z14.s\n"
+    "fmin z29.s, p3/M, z29.s, z14.s\n"
+    "st1w { z30.s }, p0, [x16, x17, LSL #2]\n"
+    "fmin z28.s, p3/M, z28.s, z14.s\n"
+    "fmax z27.s, p3/M, z27.s, z15.s\n"
+    "st1w { z29.s }, p0, [x16, x23, LSL #2]\n"
+    "fmax z26.s, p3/M, z26.s, z15.s\n"
+    "st1w { z28.s }, p0, [x16, x22, LSL #2]\n"
+    "fmin z27.s, p3/M, z27.s, z14.s\n"
+    "fmax z25.s, p3/M, z25.s, z15.s\n"
+    "st1w { z27.s }, p0, [x26]\n"
+    "fmin z26.s, p3/M, z26.s, z14.s\n"
+    "fmin z25.s, p3/M, z25.s, z14.s\n"
+    "st1w { z26.s }, p0, [x26, x17, LSL #2]\n"
+    "fmax z24.s, p3/M, z24.s, z15.s\n"
+    "fmax z23.s, p3/M, z23.s, z15.s\n"
+    "st1w { z25.s }, p0, [x26, x23, LSL #2]\n"
+    "fmax z22.s, p3/M, z22.s, z15.s\n"
+    "fmax z21.s, p3/M, z21.s, z15.s\n"
+    "fmax z20.s, p3/M, z20.s, z15.s\n"
+    "fmin z24.s, p3/M, z24.s, z14.s\n"
+    "st1w { z24.s }, p0, [x26, x22, LSL #2]\n"
+    "fmin z23.s, p3/M, z23.s, z14.s\n"
+    "fmin z22.s, p3/M, z22.s, z14.s\n"
+    "st1w { z23.s }, p0, [x25]\n"
+    "fmin z21.s, p3/M, z21.s, z14.s\n"
+    "fmin z20.s, p3/M, z20.s, z14.s\n"
+    "st1w { z22.s }, p0, [x25, x17, LSL #2]\n"
+    "fmax z19.s, p3/M, z19.s, z15.s\n"
+    "st1w { z21.s }, p0, [x25, x23, LSL #2]\n"
+    "fmax z18.s, p3/M, z18.s, z15.s\n"
+    "fmax z17.s, p3/M, z17.s, z15.s\n"
+    "st1w { z20.s }, p0, [x25, x22, LSL #2]\n"
+    "fmin z19.s, p3/M, z19.s, z14.s\n"
+    "st1w { z19.s }, p0, [x24]\n"
+    "fmin z18.s, p3/M, z18.s, z14.s\n"
+    "fmin z17.s, p3/M, z17.s, z14.s\n"
+    "st1w { z18.s }, p0, [x24, x17, LSL #2]\n"
+    "fmax z16.s, p3/M, z16.s, z15.s\n"
+    "st1w { z17.s }, p0, [x24, x23, LSL #2]\n"
+    "fmin z16.s, p3/M, z16.s, z14.s\n"
+    "st1w { z16.s }, p0, [x24, x22, LSL #2]\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..ac0c4ec
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,820 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[36];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[14];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[5];
+      inptrs[3] = input_ptrs[15];
+      inptrs[4] = input_ptrs[30];
+      inptrs[5] = input_ptrs[35];
+      inptrs[6] = input_ptrs[20];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[4];
+      inptrs[9] = input_ptrs[21];
+      inptrs[10] = input_ptrs[6];
+      inptrs[11] = input_ptrs[11];
+      inptrs[12] = input_ptrs[24];
+      inptrs[13] = input_ptrs[8];
+      inptrs[14] = input_ptrs[29];
+      inptrs[15] = input_ptrs[9];
+      inptrs[16] = input_ptrs[31];
+      inptrs[17] = input_ptrs[13];
+      inptrs[18] = input_ptrs[34];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[19];
+      inptrs[22] = input_ptrs[3];
+      inptrs[23] = input_ptrs[12];
+      inptrs[24] = input_ptrs[22];
+      inptrs[25] = input_ptrs[17];
+      inptrs[26] = input_ptrs[18];
+      inptrs[27] = input_ptrs[26];
+      inptrs[28] = input_ptrs[23];
+      inptrs[29] = input_ptrs[32];
+      inptrs[30] = input_ptrs[27];
+      inptrs[31] = input_ptrs[33];
+      inptrs[32] = input_ptrs[7];
+      inptrs[33] = input_ptrs[10];
+      inptrs[34] = input_ptrs[25];
+      inptrs[35] = input_ptrs[28];
+    }
+  };
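+  // The 36 entries of inptrs cover the 6x6 input patch that a 3x3 stride-1
+  // kernel needs to compute a 4x4 output tile ((4 - 1) * 1 + 3 = 6 rows and
+  // columns); the constructor above permutes the caller's row-major pointers
+  // into the order in which the assembly below consumes them.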
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
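+  // z13 is loaded with the broadcast bias and used to initialise every
+  // accumulator; z15/z14 hold the broadcast activation minimum/maximum, and
+  // each result is clamped as min(max(acc, act_min), act_max) with an
+  // fmax/fmin pair before being stored.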
+  __asm__ __volatile__(
+    "ldr x5, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ptrue p3.b\n"
+    "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x7, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "cntb x8, ALL, MUL #2\n"
+    "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "mov x17, #0x0\n"
+    "ld1w { z13.s }, p3/Z, [x6]\n"
+    "mov z31.d, z13.d\n"
+    "ld1w { z0.s }, p3/Z, [x6, #1, MUL VL]\n"
+    "cntw x16\n"
+    "mov z30.d, z13.d\n"
+    "ld1w { z1.s }, p3/Z, [x6, #2, MUL VL]\n"
+    "sub x15, XZR, x16\n"
+    "mov z29.d, z13.d\n"
+    "ld1w { z2.s }, p3/Z, [x6, #3, MUL VL]\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "mov z28.d, z13.d\n"
+    "ld1w { z3.s }, p3/Z, [x6, #4, MUL VL]\n"
+    "cmp x16, %x[n_channels]\n"
+    "mov z27.d, z13.d\n"
+    "ld1w { z4.s }, p3/Z, [x6, #5, MUL VL]\n"
+    "mov z26.d, z13.d\n"
+    "ld1w { z5.s }, p3/Z, [x6, #6, MUL VL]\n"
+    "mov z25.d, z13.d\n"
+    "ld1w { z6.s }, p3/Z, [x6, #7, MUL VL]\n"
+    "addvl x6, x6, #16\n"
+    "mov z24.d, z13.d\n"
+    "ld1w { z7.s }, p3/Z, [x6, #-8, MUL VL]\n"
+    "mov z23.d, z13.d\n"
+    "ld1w { z8.s }, p3/Z, [x6, #-7, MUL VL]\n"
+    "addvl x6, x6, #-6\n"
+    "mov z22.d, z13.d\n"
+    "ldp x14, x13, [x7, #0x0]\n"
+    "mov z21.d, z13.d\n"
+    "ldp x12, x11, [x7, #0x10]\n"
+    "mov z20.d, z13.d\n"
+    "ld1w { z9.s }, p2/Z, [x14, x17, LSL #2]\n"
+    "mov z19.d, z13.d\n"
+    "mov z18.d, z13.d\n"
+    "prfm pldl1keep, [x14, x8]\n"
+    "mov z17.d, z13.d\n"
+    "ld1w { z10.s }, p2/Z, [x13, x17, LSL #2]\n"
+    "mov z16.d, z13.d\n"
+    "prfm pldl1keep, [x13, x8]\n"
+    "ld1w { z11.s }, p2/Z, [x12, x17, LSL #2]\n"
+    "prfm pldl1keep, [x12, x8]\n"
+    "ld1w { z12.s }, p2/Z, [x11, x17, LSL #2]\n"
+    "prfm pldl1keep, [x11, x8]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "ldr x22, [x7, #0x20]\n"
+    "whilelt p1.s, x16, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z7.s, z9.s\n"
+    "ldr x21, [x7, #0x28]\n"
+    "incw x15\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "ldr x20, [x7, #0x30]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z27.s, p3/M, z5.s, z9.s\n"
+    "prfm pldl1keep, [x22, x8]\n"
+    "fmla z26.s, p3/M, z4.s, z9.s\n"
+    "prfm pldl1keep, [x21, x8]\n"
+    "fmla z25.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x20, x8]\n"
+    "fmla z23.s, p3/M, z2.s, z9.s\n"
+    "ldr x19, [x7, #0x38]\n"
+    "fmla z22.s, p3/M, z1.s, z9.s\n"
+    "ldr x10, [x7, #0x40]\n"
+    "fmla z21.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x20, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x22, x17, LSL #2]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x21, x17, LSL #2]\n"
+    "fmla z30.s, p3/M, z8.s, z12.s\n"
+    "prfm pldl1keep, [x19, x8]\n"
+    "fmla z29.s, p3/M, z7.s, z12.s\n"
+    "prfm pldl1keep, [x10, x8]\n"
+    "fmla z26.s, p3/M, z5.s, z12.s\n"
+    "ldr x9, [x7, #0x48]\n"
+    "fmla z28.s, p3/M, z6.s, z12.s\n"
+    "ldr x28, [x7, #0x50]\n"
+    "fmla z25.s, p3/M, z4.s, z12.s\n"
+    "ldr x27, [x7, #0x58]\n"
+    "fmla z24.s, p3/M, z3.s, z12.s\n"
+    "prfm pldl1keep, [x9, x8]\n"
+    "fmla z22.s, p3/M, z2.s, z12.s\n"
+    "prfm pldl1keep, [x28, x8]\n"
+    "fmla z21.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x27, x8]\n"
+    "fmla z20.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x19, x17, LSL #2]\n"
+    "fmla z19.s, p3/M, z6.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x9, x17, LSL #2]\n"
+    "fmla z16.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x10, x17, LSL #2]\n"
+    "fmla z27.s, p3/M, z8.s, z9.s\n"
+    "ldr x26, [x7, #0x60]\n"
+    "fmla z26.s, p3/M, z7.s, z9.s\n"
+    "ldr x25, [x7, #0x68]\n"
+    "fmla z25.s, p3/M, z6.s, z9.s\n"
+    "ldr x24, [x7, #0x70]\n"
+    "fmla z23.s, p3/M, z5.s, z9.s\n"
+    "prfm pldl1keep, [x26, x8]\n"
+    "fmla z22.s, p3/M, z4.s, z9.s\n"
+    "prfm pldl1keep, [x25, x8]\n"
+    "fmla z21.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x24, x8]\n"
+    "fmla z19.s, p3/M, z2.s, z9.s\n"
+    "ldr x23, [x7, #0x78]\n"
+    "fmla z18.s, p3/M, z1.s, z9.s\n"
+    "ldr x14, [x7, #0x80]\n"
+    "fmla z17.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x28, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x23, x8]\n"
+    "fmla z30.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x27, x17, LSL #2]\n"
+    "fmla z29.s, p3/M, z2.s, z11.s\n"
+    "prfm pldl1keep, [x14, x8]\n"
+    "fmla z28.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x26, x17, LSL #2]\n"
+    "fmla z26.s, p3/M, z8.s, z10.s\n"
+    "ldr x13, [x7, #0x88]\n"
+    "fmla z25.s, p3/M, z7.s, z10.s\n"
+    "ldr x12, [x7, #0x90]\n"
+    "fmla z24.s, p3/M, z6.s, z10.s\n"
+    "ldr x11, [x7, #0x98]\n"
+    "fmla z22.s, p3/M, z5.s, z10.s\n"
+    "prfm pldl1keep, [x13, x8]\n"
+    "fmla z21.s, p3/M, z4.s, z10.s\n"
+    "prfm pldl1keep, [x12, x8]\n"
+    "fmla z20.s, p3/M, z3.s, z10.s\n"
+    "prfm pldl1keep, [x11, x8]\n"
+    "fmla z18.s, p3/M, z2.s, z10.s\n"
+    "ldr x22, [x7, #0xa0]\n"
+    "fmla z17.s, p3/M, z1.s, z10.s\n"
+    "ldr x21, [x7, #0xa8]\n"
+    "fmla z16.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x25, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x22, x8]\n"
+    "fmla z27.s, p3/M, z0.s, z9.s\n"
+    "prfm pldl1keep, [x21, x8]\n"
+    "fmla z28.s, p3/M, z5.s, z12.s\n"
+    "ldr x20, [x7, #0xb0]\n"
+    "fmla z24.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
+    "fmla z23.s, p3/M, z6.s, z11.s\n"
+    "ldr x19, [x7, #0xb8]\n"
+    "fmla z19.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x24, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z5.s, z10.s\n"
+    "prfm pldl1keep, [x20, x8]\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "prfm pldl1keep, [x19, x8]\n"
+    "fmla z29.s, p3/M, z3.s, z10.s\n"
+    "ldr x10, [x7, #0xc0]\n"
+    "fmla z27.s, p3/M, z2.s, z10.s\n"
+    "ldr x9, [x7, #0xc8]\n"
+    "fmla z26.s, p3/M, z1.s, z10.s\n"
+    "ldr x28, [x7, #0xd0]\n"
+    "fmla z25.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x13, x17, LSL #2]\n"
+    "fmla z20.s, p3/M, z8.s, z11.s\n"
+    "prfm pldl1keep, [x10, x8]\n"
+    "fmla z16.s, p3/M, z5.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x14, x17, LSL #2]\n"
+    "fmla z30.s, p3/M, z5.s, z12.s\n"
+    "prfm pldl1keep, [x9, x8]\n"
+    "fmla z29.s, p3/M, z4.s, z12.s\n"
+    "prfm pldl1keep, [x28, x8]\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "ldr x27, [x7, #0xd8]\n"
+    "fmla z26.s, p3/M, z2.s, z12.s\n"
+    "ldr x26, [x7, #0xe0]\n"
+    "fmla z25.s, p3/M, z1.s, z12.s\n"
+    "ldr x25, [x7, #0xe8]\n"
+    "fmla z24.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11, x17, LSL #2]\n"
+    "fmla z19.s, p3/M, z7.s, z11.s\n"
+    "prfm pldl1keep, [x27, x8]\n"
+    "fmla z18.s, p3/M, z6.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z7.s, z10.s\n"
+    "prfm pldl1keep, [x26, x8]\n"
+    "fmla z30.s, p3/M, z6.s, z10.s\n"
+    "prfm pldl1keep, [x25, x8]\n"
+    "fmla z27.s, p3/M, z4.s, z10.s\n"
+    "ldr x24, [x7, #0xf0]\n"
+    "fmla z26.s, p3/M, z3.s, z10.s\n"
+    "ldr x23, [x7, #0xf8]\n"
+    "fmla z23.s, p3/M, z1.s, z10.s\n"
+    "ldr x14, [x7, #0x100]\n"
+    "fmla z22.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x22, x17, LSL #2]\n"
+    "fmla z17.s, p3/M, z8.s, z11.s\n"
+    "prfm pldl1keep, [x24, x8]\n"
+    "fmla z16.s, p3/M, z7.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x21, x17, LSL #2]\n"
+    "fmla z29.s, p3/M, z8.s, z12.s\n"
+    "prfm pldl1keep, [x23, x8]\n"
+    "fmla z28.s, p3/M, z7.s, z12.s\n"
+    "prfm pldl1keep, [x14, x8]\n"
+    "fmla z25.s, p3/M, z5.s, z12.s\n"
+    "ldr x13, [x7, #0x108]\n"
+    "fmla z24.s, p3/M, z4.s, z12.s\n"
+    "ldr x12, [x7, #0x110]\n"
+    "fmla z21.s, p3/M, z2.s, z12.s\n"
+    "ldr x11, [x7, #0x118]\n"
+    "fmla z20.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x20, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z2.s, z10.s\n"
+    "prfm pldl1keep, [x13, x8]\n"
+    "fmla z30.s, p3/M, z1.s, z10.s\n"
+    "prfm pldl1keep, [x12, x8]\n"
+    "fmla z29.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x19, x17, LSL #2]\n"
+    "fmla z27.s, p3/M, z7.s, z11.s\n"
+    "prfm pldl1keep, [x11, x8]\n"
+    "addvl x8, x8, #1\n"
+    "fmla z26.s, p3/M, z6.s, z11.s\n"
+    "ldr x22, [x5, #0x0]\n"
+    "fmla z23.s, p3/M, z4.s, z11.s\n"
+    "ldr x21, [x5, #0x8]\n"
+    "fmla z22.s, p3/M, z3.s, z11.s\n"
+    "ldr x20, [x5, #0x10]\n"
+    "fmla z19.s, p3/M, z1.s, z11.s\n"
+    "ldr x19, [x5, #0x18]\n"
+    "fmla z18.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x10, x17, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z13.s }, p3/Z, [x6]\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "fmla z28.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x9, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z6.s, z10.s\n"
+    "fmla z27.s, p3/M, z3.s, z10.s\n"
+    "fmla z23.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x28, x17, LSL #2]\n"
+    "fmla z25.s, p3/M, z8.s, z11.s\n"
+    "fmla z24.s, p3/M, z7.s, z11.s\n"
+    "fmla z21.s, p3/M, z5.s, z11.s\n"
+    "fmla z20.s, p3/M, z4.s, z11.s\n"
+    "fmla z17.s, p3/M, z2.s, z11.s\n"
+    "fmla z16.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x27, x17, LSL #2]\n"
+    "fmla z28.s, p3/M, z8.s, z12.s\n"
+    "fmla z24.s, p3/M, z5.s, z12.s\n"
+    "fmla z20.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x26, x17, LSL #2]\n"
+    "fmla z27.s, p3/M, z6.s, z10.s\n"
+    "fmla z23.s, p3/M, z3.s, z10.s\n"
+    "fmla z19.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x25, x17, LSL #2]\n"
+    "fmla z22.s, p3/M, z7.s, z11.s\n"
+    "fmla z21.s, p3/M, z6.s, z11.s\n"
+    "fmla z23.s, p3/M, z8.s, z11.s\n"
+    "fmla z19.s, p3/M, z5.s, z11.s\n"
+    "fmla z18.s, p3/M, z4.s, z11.s\n"
+    "fmla z17.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x24, x17, LSL #2]\n"
+    "fmla z24.s, p3/M, z8.s, z12.s\n"
+    "fmla z20.s, p3/M, z5.s, z12.s\n"
+    "fmla z16.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
+    "fmla z19.s, p3/M, z8.s, z10.s\n"
+    "fmla z18.s, p3/M, z7.s, z10.s\n"
+    "fmla z17.s, p3/M, z6.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x14, x17, LSL #2]\n"
+    "fmla z22.s, p3/M, z8.s, z11.s\n"
+    "fmla z21.s, p3/M, z7.s, z11.s\n"
+    "fmla z20.s, p3/M, z6.s, z11.s\n"
+    "fmla z18.s, p3/M, z5.s, z11.s\n"
+    "fmla z17.s, p3/M, z4.s, z11.s\n"
+    "fmla z16.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x13, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z4.s, z10.s\n"
+    "ldp x14, x13, [x7, #0x0]\n"
+    "fmla z18.s, p3/M, z8.s, z12.s\n"
+    "ld1w { z9.s }, p1/Z, [x14, x16, LSL #2]\n"
+    "fmla z17.s, p3/M, z7.s, z12.s\n"
+    "fmla z16.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x12, x17, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z10.s\n"
+    "prfm pldl1keep, [x14, x8]\n"
+    "fmla z27.s, p3/M, z1.s, z10.s\n"
+    "prfm pldl1keep, [x13, x8]\n"
+    "fmla z26.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x11, x17, LSL #2]\n"
+    "incw x17\n"
+    "fmla z29.s, p3/M, z5.s, z11.s\n"
+    "ldp x12, x11, [x7, #0x10]\n"
+    "whilelt p2.s, x17, %x[n_channels]\n"
+    "fmla z28.s, p3/M, z4.s, z11.s\n"
+    "ld1w { z0.s }, p3/Z, [x6, #1, MUL VL]\n"
+    "fmla z25.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z2.s }, p3/Z, [x6, #3, MUL VL]\n"
+    "fmla z24.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p1/Z, [x12, x16, LSL #2]\n"
+    "fmla z23.s, p3/M, z7.s, z12.s\n"
+    "prfm pldl1keep, [x12, x8]\n"
+    "fmla z22.s, p3/M, z6.s, z12.s\n"
+    "prfm pldl1keep, [x11, x8]\n"
+    "fmla z19.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z1.s }, p3/Z, [x6, #2, MUL VL]\n"
+    "fmla z18.s, p3/M, z3.s, z12.s\n"
+    "ld1w { z12.s }, p1/Z, [x11, x16, LSL #2]\n"
+    "fmla z21.s, p3/M, z8.s, z10.s\n"
+    "ld1w { z3.s }, p3/Z, [x6, #4, MUL VL]\n"
+    "fmla z20.s, p3/M, z7.s, z10.s\n"
+    "ld1w { z6.s }, p3/Z, [x6, #7, MUL VL]\n"
+    "fmla z17.s, p3/M, z5.s, z10.s\n"
+    "ld1w { z5.s }, p3/Z, [x6, #6, MUL VL]\n"
+    "fmla z16.s, p3/M, z4.s, z10.s\n"
+    "ld1w { z10.s }, p1/Z, [x13, x16, LSL #2]\n"
+    "incw x16\n"
+    "fmax z31.s, p3/M, z31.s, z15.s\n"
+    "ld1w { z4.s }, p3/Z, [x6, #5, MUL VL]\n"
+    "addvl x6, x6, #16\n"
+    "fmax z30.s, p3/M, z30.s, z15.s\n"
+    "ld1w { z7.s }, p3/Z, [x6, #-8, MUL VL]\n"
+    "cmp x16, %x[n_channels]\n"
+    "fmax z29.s, p3/M, z29.s, z15.s\n"
+    "ld1w { z8.s }, p3/Z, [x6, #-7, MUL VL]\n"
+    "addvl x6, x6, #-6\n"
+    "fmax z28.s, p3/M, z28.s, z15.s\n"
+    "fmax z27.s, p3/M, z27.s, z15.s\n"
+    "fmin z31.s, p3/M, z31.s, z14.s\n"
+    "st1w { z31.s }, p0, [x22, x15, LSL #2]\n"
+    "mov z31.d, z13.d\n"
+    "fmin z30.s, p3/M, z30.s, z14.s\n"
+    "ldr x22, [x5, #0x20]\n"
+    "fmin z29.s, p3/M, z29.s, z14.s\n"
+    "st1w { z30.s }, p0, [x21, x15, LSL #2]\n"
+    "mov z30.d, z13.d\n"
+    "fmin z28.s, p3/M, z28.s, z14.s\n"
+    "st1w { z29.s }, p0, [x20, x15, LSL #2]\n"
+    "mov z29.d, z13.d\n"
+    "ldr x21, [x5, #0x28]\n"
+    "fmin z27.s, p3/M, z27.s, z14.s\n"
+    "ldr x20, [x5, #0x30]\n"
+    "fmax z26.s, p3/M, z26.s, z15.s\n"
+    "st1w { z28.s }, p0, [x19, x15, LSL #2]\n"
+    "mov z28.d, z13.d\n"
+    "ldr x19, [x5, #0x38]\n"
+    "fmax z25.s, p3/M, z25.s, z15.s\n"
+    "st1w { z27.s }, p0, [x22, x15, LSL #2]\n"
+    "mov z27.d, z13.d\n"
+    "ldr x22, [x5, #0x40]\n"
+    "fmin z26.s, p3/M, z26.s, z14.s\n"
+    "st1w { z26.s }, p0, [x21, x15, LSL #2]\n"
+    "mov z26.d, z13.d\n"
+    "fmin z25.s, p3/M, z25.s, z14.s\n"
+    "ldr x21, [x5, #0x48]\n"
+    "fmax z24.s, p3/M, z24.s, z15.s\n"
+    "st1w { z25.s }, p0, [x20, x15, LSL #2]\n"
+    "mov z25.d, z13.d\n"
+    "fmax z23.s, p3/M, z23.s, z15.s\n"
+    "ldr x20, [x5, #0x50]\n"
+    "fmin z24.s, p3/M, z24.s, z14.s\n"
+    "st1w { z24.s }, p0, [x19, x15, LSL #2]\n"
+    "mov z24.d, z13.d\n"
+    "fmin z23.s, p3/M, z23.s, z14.s\n"
+    "ldr x19, [x5, #0x58]\n"
+    "fmax z22.s, p3/M, z22.s, z15.s\n"
+    "st1w { z23.s }, p0, [x22, x15, LSL #2]\n"
+    "mov z23.d, z13.d\n"
+    "fmax z21.s, p3/M, z21.s, z15.s\n"
+    "ldr x22, [x5, #0x60]\n"
+    "fmin z22.s, p3/M, z22.s, z14.s\n"
+    "st1w { z22.s }, p0, [x21, x15, LSL #2]\n"
+    "mov z22.d, z13.d\n"
+    "fmin z21.s, p3/M, z21.s, z14.s\n"
+    "ldr x21, [x5, #0x68]\n"
+    "fmax z20.s, p3/M, z20.s, z15.s\n"
+    "st1w { z21.s }, p0, [x20, x15, LSL #2]\n"
+    "mov z21.d, z13.d\n"
+    "fmax z19.s, p3/M, z19.s, z15.s\n"
+    "ldr x20, [x5, #0x70]\n"
+    "fmin z20.s, p3/M, z20.s, z14.s\n"
+    "st1w { z20.s }, p0, [x19, x15, LSL #2]\n"
+    "mov z20.d, z13.d\n"
+    "fmin z19.s, p3/M, z19.s, z14.s\n"
+    "ldr x19, [x5, #0x78]\n"
+    "fmax z18.s, p3/M, z18.s, z15.s\n"
+    "st1w { z19.s }, p0, [x22, x15, LSL #2]\n"
+    "mov z19.d, z13.d\n"
+    "fmax z17.s, p3/M, z17.s, z15.s\n"
+    "fmin z18.s, p3/M, z18.s, z14.s\n"
+    "st1w { z18.s }, p0, [x21, x15, LSL #2]\n"
+    "mov z18.d, z13.d\n"
+    "fmin z17.s, p3/M, z17.s, z14.s\n"
+    "st1w { z17.s }, p0, [x20, x15, LSL #2]\n"
+    "mov z17.d, z13.d\n"
+    "fmax z16.s, p3/M, z16.s, z15.s\n"
+    "fmin z16.s, p3/M, z16.s, z14.s\n"
+    "st1w { z16.s }, p0, [x19, x15, LSL #2]\n"
+    "mov z16.d, z13.d\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "ldr x22, [x7, #0x20]\n"
+    "incw x15\n"
+    "fmla z30.s, p3/M, z7.s, z9.s\n"
+    "ldr x21, [x7, #0x28]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "ldr x20, [x7, #0x30]\n"
+    "fmla z27.s, p3/M, z5.s, z9.s\n"
+    "prfm pldl1keep, [x22, x8]\n"
+    "fmla z26.s, p3/M, z4.s, z9.s\n"
+    "prfm pldl1keep, [x21, x8]\n"
+    "fmla z25.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x20, x8]\n"
+    "fmla z23.s, p3/M, z2.s, z9.s\n"
+    "ldr x19, [x7, #0x38]\n"
+    "fmla z22.s, p3/M, z1.s, z9.s\n"
+    "ldr x10, [x7, #0x40]\n"
+    "fmla z21.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x20, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x22, x17, LSL #2]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x21, x17, LSL #2]\n"
+    "fmla z30.s, p3/M, z8.s, z12.s\n"
+    "prfm pldl1keep, [x19, x8]\n"
+    "fmla z29.s, p3/M, z7.s, z12.s\n"
+    "prfm pldl1keep, [x10, x8]\n"
+    "fmla z26.s, p3/M, z5.s, z12.s\n"
+    "ldr x9, [x7, #0x48]\n"
+    "fmla z28.s, p3/M, z6.s, z12.s\n"
+    "ldr x28, [x7, #0x50]\n"
+    "fmla z25.s, p3/M, z4.s, z12.s\n"
+    "ldr x27, [x7, #0x58]\n"
+    "fmla z24.s, p3/M, z3.s, z12.s\n"
+    "prfm pldl1keep, [x9, x8]\n"
+    "fmla z22.s, p3/M, z2.s, z12.s\n"
+    "prfm pldl1keep, [x28, x8]\n"
+    "fmla z21.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x27, x8]\n"
+    "fmla z20.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x19, x17, LSL #2]\n"
+    "fmla z19.s, p3/M, z6.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x9, x17, LSL #2]\n"
+    "fmla z16.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x10, x17, LSL #2]\n"
+    "fmla z27.s, p3/M, z8.s, z9.s\n"
+    "ldr x26, [x7, #0x60]\n"
+    "fmla z26.s, p3/M, z7.s, z9.s\n"
+    "ldr x25, [x7, #0x68]\n"
+    "fmla z25.s, p3/M, z6.s, z9.s\n"
+    "ldr x24, [x7, #0x70]\n"
+    "fmla z23.s, p3/M, z5.s, z9.s\n"
+    "prfm pldl1keep, [x26, x8]\n"
+    "fmla z22.s, p3/M, z4.s, z9.s\n"
+    "prfm pldl1keep, [x25, x8]\n"
+    "fmla z21.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x24, x8]\n"
+    "fmla z19.s, p3/M, z2.s, z9.s\n"
+    "ldr x23, [x7, #0x78]\n"
+    "fmla z18.s, p3/M, z1.s, z9.s\n"
+    "ldr x14, [x7, #0x80]\n"
+    "fmla z17.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x28, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x23, x8]\n"
+    "fmla z30.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x27, x17, LSL #2]\n"
+    "fmla z29.s, p3/M, z2.s, z11.s\n"
+    "prfm pldl1keep, [x14, x8]\n"
+    "fmla z28.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x26, x17, LSL #2]\n"
+    "fmla z26.s, p3/M, z8.s, z10.s\n"
+    "ldr x13, [x7, #0x88]\n"
+    "fmla z25.s, p3/M, z7.s, z10.s\n"
+    "ldr x12, [x7, #0x90]\n"
+    "fmla z24.s, p3/M, z6.s, z10.s\n"
+    "ldr x11, [x7, #0x98]\n"
+    "fmla z22.s, p3/M, z5.s, z10.s\n"
+    "prfm pldl1keep, [x13, x8]\n"
+    "fmla z21.s, p3/M, z4.s, z10.s\n"
+    "prfm pldl1keep, [x12, x8]\n"
+    "fmla z20.s, p3/M, z3.s, z10.s\n"
+    "prfm pldl1keep, [x11, x8]\n"
+    "fmla z18.s, p3/M, z2.s, z10.s\n"
+    "ldr x22, [x7, #0xa0]\n"
+    "fmla z17.s, p3/M, z1.s, z10.s\n"
+    "ldr x21, [x7, #0xa8]\n"
+    "fmla z16.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x25, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x22, x8]\n"
+    "fmla z27.s, p3/M, z0.s, z9.s\n"
+    "prfm pldl1keep, [x21, x8]\n"
+    "fmla z28.s, p3/M, z5.s, z12.s\n"
+    "ldr x20, [x7, #0xb0]\n"
+    "fmla z24.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
+    "fmla z23.s, p3/M, z6.s, z11.s\n"
+    "ldr x19, [x7, #0xb8]\n"
+    "fmla z19.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x24, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z5.s, z10.s\n"
+    "prfm pldl1keep, [x20, x8]\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "prfm pldl1keep, [x19, x8]\n"
+    "fmla z29.s, p3/M, z3.s, z10.s\n"
+    "ldr x10, [x7, #0xc0]\n"
+    "fmla z27.s, p3/M, z2.s, z10.s\n"
+    "ldr x9, [x7, #0xc8]\n"
+    "fmla z26.s, p3/M, z1.s, z10.s\n"
+    "ldr x28, [x7, #0xd0]\n"
+    "fmla z25.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x13, x17, LSL #2]\n"
+    "fmla z20.s, p3/M, z8.s, z11.s\n"
+    "prfm pldl1keep, [x10, x8]\n"
+    "fmla z16.s, p3/M, z5.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x14, x17, LSL #2]\n"
+    "fmla z30.s, p3/M, z5.s, z12.s\n"
+    "prfm pldl1keep, [x9, x8]\n"
+    "fmla z29.s, p3/M, z4.s, z12.s\n"
+    "prfm pldl1keep, [x28, x8]\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "ldr x27, [x7, #0xd8]\n"
+    "fmla z26.s, p3/M, z2.s, z12.s\n"
+    "ldr x26, [x7, #0xe0]\n"
+    "fmla z25.s, p3/M, z1.s, z12.s\n"
+    "ldr x25, [x7, #0xe8]\n"
+    "fmla z24.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11, x17, LSL #2]\n"
+    "fmla z19.s, p3/M, z7.s, z11.s\n"
+    "prfm pldl1keep, [x27, x8]\n"
+    "fmla z18.s, p3/M, z6.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x12, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z7.s, z10.s\n"
+    "prfm pldl1keep, [x26, x8]\n"
+    "fmla z30.s, p3/M, z6.s, z10.s\n"
+    "prfm pldl1keep, [x25, x8]\n"
+    "fmla z27.s, p3/M, z4.s, z10.s\n"
+    "ldr x24, [x7, #0xf0]\n"
+    "fmla z26.s, p3/M, z3.s, z10.s\n"
+    "ldr x23, [x7, #0xf8]\n"
+    "fmla z23.s, p3/M, z1.s, z10.s\n"
+    "ldr x14, [x7, #0x100]\n"
+    "fmla z22.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x22, x17, LSL #2]\n"
+    "fmla z17.s, p3/M, z8.s, z11.s\n"
+    "prfm pldl1keep, [x24, x8]\n"
+    "fmla z16.s, p3/M, z7.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x21, x17, LSL #2]\n"
+    "fmla z29.s, p3/M, z8.s, z12.s\n"
+    "prfm pldl1keep, [x23, x8]\n"
+    "fmla z28.s, p3/M, z7.s, z12.s\n"
+    "prfm pldl1keep, [x14, x8]\n"
+    "fmla z25.s, p3/M, z5.s, z12.s\n"
+    "ldr x13, [x7, #0x108]\n"
+    "fmla z24.s, p3/M, z4.s, z12.s\n"
+    "ldr x12, [x7, #0x110]\n"
+    "fmla z21.s, p3/M, z2.s, z12.s\n"
+    "ldr x11, [x7, #0x118]\n"
+    "fmla z20.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x20, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z2.s, z10.s\n"
+    "prfm pldl1keep, [x13, x8]\n"
+    "fmla z30.s, p3/M, z1.s, z10.s\n"
+    "prfm pldl1keep, [x12, x8]\n"
+    "fmla z29.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x19, x17, LSL #2]\n"
+    "fmla z27.s, p3/M, z7.s, z11.s\n"
+    "prfm pldl1keep, [x11, x8]\n"
+    "fmla z26.s, p3/M, z6.s, z11.s\n"
+    "ldr x22, [x5, #0x0]\n"
+    "fmla z23.s, p3/M, z4.s, z11.s\n"
+    "ldr x21, [x5, #0x8]\n"
+    "fmla z22.s, p3/M, z3.s, z11.s\n"
+    "ldr x20, [x5, #0x10]\n"
+    "fmla z19.s, p3/M, z1.s, z11.s\n"
+    "ldr x19, [x5, #0x18]\n"
+    "fmla z18.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x10, x17, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z12.s\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "fmla z28.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x9, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z6.s, z10.s\n"
+    "fmla z27.s, p3/M, z3.s, z10.s\n"
+    "fmla z23.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x28, x17, LSL #2]\n"
+    "fmla z25.s, p3/M, z8.s, z11.s\n"
+    "fmla z24.s, p3/M, z7.s, z11.s\n"
+    "fmla z21.s, p3/M, z5.s, z11.s\n"
+    "fmla z20.s, p3/M, z4.s, z11.s\n"
+    "fmla z17.s, p3/M, z2.s, z11.s\n"
+    "fmla z16.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x27, x17, LSL #2]\n"
+    "fmla z28.s, p3/M, z8.s, z12.s\n"
+    "fmla z24.s, p3/M, z5.s, z12.s\n"
+    "fmla z20.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x26, x17, LSL #2]\n"
+    "fmla z27.s, p3/M, z6.s, z10.s\n"
+    "fmla z23.s, p3/M, z3.s, z10.s\n"
+    "fmla z19.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x25, x17, LSL #2]\n"
+    "fmla z22.s, p3/M, z7.s, z11.s\n"
+    "fmla z21.s, p3/M, z6.s, z11.s\n"
+    "fmla z23.s, p3/M, z8.s, z11.s\n"
+    "fmla z19.s, p3/M, z5.s, z11.s\n"
+    "fmla z18.s, p3/M, z4.s, z11.s\n"
+    "fmla z17.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x24, x17, LSL #2]\n"
+    "fmla z24.s, p3/M, z8.s, z12.s\n"
+    "fmla z20.s, p3/M, z5.s, z12.s\n"
+    "fmla z16.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n"
+    "fmla z19.s, p3/M, z8.s, z10.s\n"
+    "fmla z18.s, p3/M, z7.s, z10.s\n"
+    "fmla z17.s, p3/M, z6.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x14, x17, LSL #2]\n"
+    "fmla z22.s, p3/M, z8.s, z11.s\n"
+    "fmla z21.s, p3/M, z7.s, z11.s\n"
+    "fmla z20.s, p3/M, z6.s, z11.s\n"
+    "fmla z18.s, p3/M, z5.s, z11.s\n"
+    "fmla z17.s, p3/M, z4.s, z11.s\n"
+    "fmla z16.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x13, x17, LSL #2]\n"
+    "fmla z31.s, p3/M, z4.s, z10.s\n"
+    "fmla z18.s, p3/M, z8.s, z12.s\n"
+    "fmla z17.s, p3/M, z7.s, z12.s\n"
+    "fmla z16.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x12, x17, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z10.s\n"
+    "fmla z27.s, p3/M, z1.s, z10.s\n"
+    "fmla z26.s, p3/M, z0.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x11, x17, LSL #2]\n"
+    "fmla z29.s, p3/M, z5.s, z11.s\n"
+    "fmla z28.s, p3/M, z4.s, z11.s\n"
+    "fmla z25.s, p3/M, z2.s, z11.s\n"
+    "fmla z24.s, p3/M, z1.s, z11.s\n"
+    "fmla z23.s, p3/M, z7.s, z12.s\n"
+    "fmla z22.s, p3/M, z6.s, z12.s\n"
+    "fmla z19.s, p3/M, z4.s, z12.s\n"
+    "fmla z18.s, p3/M, z3.s, z12.s\n"
+    "fmla z21.s, p3/M, z8.s, z10.s\n"
+    "fmla z20.s, p3/M, z7.s, z10.s\n"
+    "fmla z17.s, p3/M, z5.s, z10.s\n"
+    "fmla z16.s, p3/M, z4.s, z10.s\n"
+    "fmax z31.s, p3/M, z31.s, z15.s\n"
+    "fmax z30.s, p3/M, z30.s, z15.s\n"
+    "fmax z29.s, p3/M, z29.s, z15.s\n"
+    "fmax z28.s, p3/M, z28.s, z15.s\n"
+    "fmin z31.s, p3/M, z31.s, z14.s\n"
+    "st1w { z31.s }, p0, [x22, x15, LSL #2]\n"
+    "fmin z30.s, p3/M, z30.s, z14.s\n"
+    "fmin z29.s, p3/M, z29.s, z14.s\n"
+    "ldr x22, [x5, #0x20]\n"
+    "fmin z28.s, p3/M, z28.s, z14.s\n"
+    "st1w { z30.s }, p0, [x21, x15, LSL #2]\n"
+    "fmax z27.s, p3/M, z27.s, z15.s\n"
+    "fmax z26.s, p3/M, z26.s, z15.s\n"
+    "st1w { z29.s }, p0, [x20, x15, LSL #2]\n"
+    "fmax z25.s, p3/M, z25.s, z15.s\n"
+    "st1w { z28.s }, p0, [x19, x15, LSL #2]\n"
+    "fmax z24.s, p3/M, z24.s, z15.s\n"
+    "ldr x21, [x5, #0x28]\n"
+    "fmax z23.s, p3/M, z23.s, z15.s\n"
+    "ldr x20, [x5, #0x30]\n"
+    "fmin z27.s, p3/M, z27.s, z14.s\n"
+    "ldr x19, [x5, #0x38]\n"
+    "fmin z26.s, p3/M, z26.s, z14.s\n"
+    "st1w { z27.s }, p0, [x22, x15, LSL #2]\n"
+    "fmin z25.s, p3/M, z25.s, z14.s\n"
+    "fmin z24.s, p3/M, z24.s, z14.s\n"
+    "st1w { z26.s }, p0, [x21, x15, LSL #2]\n"
+    "fmin z23.s, p3/M, z23.s, z14.s\n"
+    "ldr x22, [x5, #0x40]\n"
+    "fmax z22.s, p3/M, z22.s, z15.s\n"
+    "ldr x21, [x5, #0x48]\n"
+    "fmax z21.s, p3/M, z21.s, z15.s\n"
+    "st1w { z25.s }, p0, [x20, x15, LSL #2]\n"
+    "fmax z20.s, p3/M, z20.s, z15.s\n"
+    "st1w { z24.s }, p0, [x19, x15, LSL #2]\n"
+    "fmax z19.s, p3/M, z19.s, z15.s\n"
+    "st1w { z23.s }, p0, [x22, x15, LSL #2]\n"
+    "fmin z22.s, p3/M, z22.s, z14.s\n"
+    "ldr x20, [x5, #0x50]\n"
+    "fmin z21.s, p3/M, z21.s, z14.s\n"
+    "ldr x19, [x5, #0x58]\n"
+    "fmin z20.s, p3/M, z20.s, z14.s\n"
+    "ldr x22, [x5, #0x60]\n"
+    "fmin z19.s, p3/M, z19.s, z14.s\n"
+    "st1w { z22.s }, p0, [x21, x15, LSL #2]\n"
+    "fmax z18.s, p3/M, z18.s, z15.s\n"
+    "st1w { z21.s }, p0, [x20, x15, LSL #2]\n"
+    "fmax z17.s, p3/M, z17.s, z15.s\n"
+    "st1w { z20.s }, p0, [x19, x15, LSL #2]\n"
+    "fmax z16.s, p3/M, z16.s, z15.s\n"
+    "st1w { z19.s }, p0, [x22, x15, LSL #2]\n"
+    "ldr x21, [x5, #0x68]\n"
+    "fmin z18.s, p3/M, z18.s, z14.s\n"
+    "ldr x20, [x5, #0x70]\n"
+    "fmin z17.s, p3/M, z17.s, z14.s\n"
+    "ldr x19, [x5, #0x78]\n"
+    "fmin z16.s, p3/M, z16.s, z14.s\n"
+    "st1w { z18.s }, p0, [x21, x15, LSL #2]\n"
+    "st1w { z17.s }, p0, [x20, x15, LSL #2]\n"
+    "st1w { z16.s }, p0, [x19, x15, LSL #2]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..f5b6a4f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
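+// The indirect variant consumes pre-gathered arrays of input and output
+// pointers; the direct variant walks the tiled image itself from base
+// pointers and row/column strides.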
+
+struct sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
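+  // The input patch size follows from the other parameters:
+  // (output_rows - 1) * stride_rows + kernel_rows = (2 - 1) * 2 + 3 = 5.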
+
+  indirect_kern_type indirect_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+  sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..ad53872
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
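+  // The assembly loops over output tiles, keeping tile_i/tile_j in the Args
+  // struct between iterations and rebuilding the input/output offsets from
+  // the row and column strides (see the commented mul/madd sequences below).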
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x5, #0x0\n"
+    "mov x6, #0x0\n"
+    "1:"  // Tile loop
+    "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x21, #0x4\n"
+    "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "cntb x7\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x7, x7, XZR, LSL #4\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "cntb x17\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "cntb x15\n"
+    "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x5, x20\n" // offset = tile_i * ld_input_row
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x6, x16, x19\n" // offset += tile_j * ld_input_col
+    "ldr x13, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x21\n" // offset *= kernel_stride * output_size
+    "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x14, x14, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "add x11, x14, x20, LSL #2\n"
+    "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x10, x11, x20, LSL #2\n"
+    "ld1w { z17.s }, p3/Z, [x8]\n"
+    "mov z31.d, z17.d\n"
+    "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+    "add x9, x10, x20, LSL #2\n"
+    "mov z30.d, z17.d\n"
+    "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+    "add x28, x9, x20, LSL #2\n"
+    "mov z29.d, z17.d\n"
+    "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+    "add x27, x16, x16\n"
+    "mov z28.d, z17.d\n"
+    "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+    "add x26, x27, x16\n"
+    "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+    "add x25, x26, x16\n"
+    "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+    "add x17, x17, x16, LSL #4\n"
+    "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+    "add x15, x15, x27, LSL #4\n"
+    "cntb x24\n"
+    "prfm pldl1keep, [x10, x15]\n"
+    "prfm pldl1keep, [x14, x7]\n"
+    "add x24, x24, x26, LSL #4\n"
+    "prfm pldl1keep, [x14, x17]\n"
+    "cntb x23\n"
+    "prfm pldl1keep, [x14, x24]\n"
+    "add x23, x23, x25, LSL #4\n"
+    "mov x20, #0x2\n"
+    "prfm pldl1keep, [x14, x23]\n"
+    "prfm pldl1keep, [x11, x7]\n"
+    "mul x19, x5, x22\n" // offset = tile_i * ld_output_row
+    "prfm pldl1keep, [x11, x17]\n"
+    "madd x19, x6, x13, x19\n" // offset += tile_j * ld_output_col
+    "prfm pldl1keep, [x14, x15]\n"
+    "mul x19, x19, x20\n" // offset *= output_tile_size
+    "mov x21, #0x0\n"
+    "add x12, x12, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "add x22, x12, x22, LSL #2\n"
+    "cntw x20\n"
+    "sub x19, XZR, x20\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "ld1w { z9.s }, p2/Z, [x10, x27, LSL #2]\n"
+    "ld1w { z10.s }, p2/Z, [x14]\n"
+    "addvl x8, x8, #16\n"
+    "ld1w { z11.s }, p2/Z, [x14, x16, LSL #2]\n"
+    "cmp x20, %x[n_channels]\n"
+    "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+    "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+    "addvl x8, x8, #-6\n"
+    "ld1w { z12.s }, p2/Z, [x14, x26, LSL #2]\n"
+    "ld1w { z13.s }, p2/Z, [x14, x25, LSL #2]\n"
+    "ld1w { z14.s }, p2/Z, [x11]\n"
+    "ld1w { z15.s }, p2/Z, [x11, x16, LSL #2]\n"
+    "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "prfm pldl1keep, [x11, x24]\n"
+    "whilelt p1.s, x20, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z6.s, z9.s\n"
+    "prfm pldl1keep, [x11, x23]\n"
+    "incw x19\n"
+    "fmla z29.s, p3/M, z2.s, z9.s\n"
+    "prfm pldl1keep, [x11, x15]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.s, p3/M, z0.s, z9.s\n"
+    "prfm pldl1keep, [x9, x7]\n"
+    "incw x21\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "prfm pldl1keep, [x10, x7]\n"
+    "addvl x14, x14, #1\n"
+    "fmla z30.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11, x25, LSL #2]\n"
+    "incw x20\n"
+    "fmla z31.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x11, x26, LSL #2]\n"
+    "prfm pldl1keep, [x9, x17]\n"
+    "fmla z30.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x11, x27, LSL #2]\n"
+    "fmla z31.s, p3/M, z3.s, z14.s\n"
+    "ld1w { z14.s }, p2/Z, [x9]\n"
+    "addvl x11, x11, #1\n"
+    "fmla z30.s, p3/M, z0.s, z16.s\n"
+    "prfm pldl1keep, [x10, x17]\n"
+    "prfm pldl1keep, [x9, x24]\n"
+    "fmla z29.s, p3/M, z3.s, z14.s\n"
+    "prfm pldl1keep, [x10, x24]\n"
+    "ld1w { z14.s }, p2/Z, [x9, x25, LSL #2]\n"
+    "fmla z31.s, p3/M, z4.s, z15.s\n"
+    "ld1w { z15.s }, p2/Z, [x10]\n"
+    "fmla z30.s, p3/M, z4.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x9, x16, LSL #2]\n"
+    "fmla z29.s, p3/M, z0.s, z15.s\n"
+    "prfm pldl1keep, [x9, x23]\n"
+    "prfm pldl1keep, [x28, x7]\n"
+    "fmla z31.s, p3/M, z2.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x10, x16, LSL #2]\n"
+    "fmla z30.s, p3/M, z5.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x10, x25, LSL #2]\n"
+    "prfm pldl1keep, [x10, x23]\n"
+    "fmla z31.s, p3/M, z5.s, z13.s\n"
+    "addvl x10, x10, #1\n"
+    "fmla z30.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n"
+    "prfm pldl1keep, [x28, x17]\n"
+    "fmla z29.s, p3/M, z1.s, z16.s\n"
+    "prfm pldl1keep, [x9, x15]\n"
+    "prfm pldl1keep, [x28, x24]\n"
+    "fmla z31.s, p3/M, z6.s, z15.s\n"
+    "fmla z28.s, p3/M, z4.s, z13.s\n"
+    "ld1w { z15.s }, p2/Z, [x28]\n"
+    "fmla z30.s, p3/M, z7.s, z12.s\n"
+    "ld1w { z13.s }, p2/Z, [x28, x16, LSL #2]\n"
+    "prfm pldl1keep, [x28, x15]\n"
+    "fmla z31.s, p3/M, z7.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x9, x27, LSL #2]\n"
+    "addvl x9, x9, #1\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x28, x23]\n"
+    "fmla z29.s, p3/M, z6.s, z15.s\n"
+    "ld1w { z15.s }, p2/Z, [x28, x27, LSL #2]\n"
+    "fmla z30.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z9.s }, p1/Z, [x10, x27, LSL #2]\n"
+    "prfm pldl1keep, [x10, x15]\n"
+    "fmax z31.s, p3/M, z31.s, z19.s\n"
+    "ld1w { z10.s }, p1/Z, [x14]\n"
+    "fmla z28.s, p3/M, z5.s, z14.s\n"
+    "fmla z29.s, p3/M, z7.s, z13.s\n"
+    "ld1w { z14.s }, p2/Z, [x28, x26, LSL #2]\n"
+    "fmax z30.s, p3/M, z30.s, z19.s\n"
+    "prfm pldl1keep, [x14, x7]\n"
+    "prfm pldl1keep, [x14, x17]\n"
+    "fmin z31.s, p3/M, z31.s, z18.s\n"
+    "ld1w { z12.s }, p1/Z, [x14, x26, LSL #2]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "fmla z29.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z11.s }, p2/Z, [x28, x25, LSL #2]\n"
+    "whilelt p2.s, x21, %x[n_channels]\n"
+    "fmin z30.s, p3/M, z30.s, z18.s\n"
+    "prfm pldl1keep, [x14, x24]\n"
+    "addvl x28, x28, #1\n"
+    "fmla z28.s, p3/M, z3.s, z16.s\n"
+    "ld1w { z13.s }, p1/Z, [x14, x25, LSL #2]\n"
+    "cmp x20, %x[n_channels]\n"
+    "fmla z29.s, p3/M, z8.s, z15.s\n"
+    "prfm pldl1keep, [x14, x23]\n"
+    "prfm pldl1keep, [x11, x7]\n"
+    "fmla z28.s, p3/M, z7.s, z14.s\n"
+    "ld1w { z14.s }, p1/Z, [x11]\n"
+    "prfm pldl1keep, [x11, x17]\n"
+    "fmax z29.s, p3/M, z29.s, z19.s\n"
+    "ld1w { z16.s }, p1/Z, [x14, x27, LSL #2]\n"
+    "fmla z28.s, p3/M, z6.s, z15.s\n"
+    "ld1w { z15.s }, p1/Z, [x11, x16, LSL #2]\n"
+    "prfm pldl1keep, [x14, x15]\n"
+    "fmin z29.s, p3/M, z29.s, z18.s\n"
+    "st1w { z31.s }, p0, [x12]\n"
+    "fmla z28.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z11.s }, p1/Z, [x14, x16, LSL #2]\n"
+    "st1w { z30.s }, p0, [x12, x13, LSL #2]\n"
+    "fmax z28.s, p3/M, z28.s, z19.s\n"
+    "st1w { z29.s }, p0, [x22]\n"
+    "addvl x12, x12, #1\n"
+    "fmin z28.s, p3/M, z28.s, z18.s\n"
+    "ld1w { z17.s }, p3/Z, [x8]\n"
+    "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+    "mov z31.d, z17.d\n"
+    "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+    "mov z30.d, z17.d\n"
+    "st1w { z28.s }, p0, [x22, x13, LSL #2]\n"
+    "addvl x22, x22, #1\n"
+    "mov z29.d, z17.d\n"
+    "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+    "mov z28.d, z17.d\n"
+    "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+    "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+    "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+    "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+    "addvl x8, x8, #16\n"
+    "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+    "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+    "addvl x8, x8, #-6\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "prfm pldl1keep, [x11, x24]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z30.s, p3/M, z6.s, z9.s\n"
+    "prfm pldl1keep, [x11, x23]\n"
+    "fmla z29.s, p3/M, z2.s, z9.s\n"
+    "prfm pldl1keep, [x11, x15]\n"
+    "fmla z28.s, p3/M, z0.s, z9.s\n"
+    "prfm pldl1keep, [x9, x7]\n"
+    "prfm pldl1keep, [x10, x7]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "prfm pldl1keep, [x9, x17]\n"
+    "fmla z30.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x11, x25, LSL #2]\n"
+    "fmla z31.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x11, x26, LSL #2]\n"
+    "prfm pldl1keep, [x10, x17]\n"
+    "fmla z30.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x11, x27, LSL #2]\n"
+    "fmla z31.s, p3/M, z3.s, z14.s\n"
+    "ld1w { z14.s }, p2/Z, [x9]\n"
+    "prfm pldl1keep, [x9, x24]\n"
+    "fmla z30.s, p3/M, z0.s, z16.s\n"
+    "prfm pldl1keep, [x10, x24]\n"
+    "fmla z29.s, p3/M, z3.s, z14.s\n"
+    "ld1w { z14.s }, p2/Z, [x9, x25, LSL #2]\n"
+    "prfm pldl1keep, [x9, x23]\n"
+    "fmla z31.s, p3/M, z4.s, z15.s\n"
+    "ld1w { z15.s }, p2/Z, [x10]\n"
+    "fmla z30.s, p3/M, z4.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x9, x16, LSL #2]\n"
+    "fmla z29.s, p3/M, z0.s, z15.s\n"
+    "prfm pldl1keep, [x28, x7]\n"
+    "prfm pldl1keep, [x10, x23]\n"
+    "fmla z31.s, p3/M, z2.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x10, x16, LSL #2]\n"
+    "fmla z30.s, p3/M, z5.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x10, x25, LSL #2]\n"
+    "prfm pldl1keep, [x28, x17]\n"
+    "fmla z31.s, p3/M, z5.s, z13.s\n"
+    "prfm pldl1keep, [x9, x15]\n"
+    "fmla z30.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n"
+    "fmla z29.s, p3/M, z1.s, z16.s\n"
+    "prfm pldl1keep, [x28, x24]\n"
+    "prfm pldl1keep, [x28, x15]\n"
+    "fmla z31.s, p3/M, z6.s, z15.s\n"
+    "fmla z28.s, p3/M, z4.s, z13.s\n"
+    "ld1w { z15.s }, p2/Z, [x28]\n"
+    "fmla z30.s, p3/M, z7.s, z12.s\n"
+    "ld1w { z13.s }, p2/Z, [x28, x16, LSL #2]\n"
+    "prfm pldl1keep, [x28, x23]\n"
+    "fmla z31.s, p3/M, z7.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x9, x27, LSL #2]\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x5, #0x1\n"
+    "fmla z28.s, p3/M, z5.s, z14.s\n"
+    "ld1w { z14.s }, p2/Z, [x28, x26, LSL #2]\n"
+    "fmla z29.s, p3/M, z6.s, z15.s\n"
+    "ld1w { z15.s }, p2/Z, [x28, x27, LSL #2]\n"
+    "fmla z30.s, p3/M, z8.s, z11.s\n"
+    "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "add x6, x6, #0x1\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x28, x25, LSL #2]\n"
+    "fmla z29.s, p3/M, z7.s, z13.s\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmax z31.s, p3/M, z31.s, z19.s\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x6, x19\n"
+    "fmla z29.s, p3/M, z5.s, z16.s\n"
+    "fmla z28.s, p3/M, z3.s, z16.s\n"
+    "csel x6, x6, XZR, LT\n"
+    "fmin z31.s, p3/M, z31.s, z18.s\n"
+    "st1w { z31.s }, p0, [x12]\n"
+    "fmla z28.s, p3/M, z7.s, z14.s\n"
+    "csel x5, x5, x21, LT\n"
+    "fmla z29.s, p3/M, z8.s, z15.s\n"
+    "cmp x5, x20\n"
+    "fmax z30.s, p3/M, z30.s, z19.s\n"
+    "fmla z28.s, p3/M, z6.s, z15.s\n"
+    "fmin z30.s, p3/M, z30.s, z18.s\n"
+    "st1w { z30.s }, p0, [x12, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z8.s, z11.s\n"
+    "fmax z29.s, p3/M, z29.s, z19.s\n"
+    "fmin z29.s, p3/M, z29.s, z18.s\n"
+    "st1w { z29.s }, p0, [x22]\n"
+    "fmax z28.s, p3/M, z28.s, z19.s\n"
+    "fmin z28.s, p3/M, z28.s, z18.s\n"
+    "st1w { z28.s }, p0, [x22, x13, LSL #2]\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
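
The tail of the tile-loop kernel above ends with an add/cmp/csel/blt sequence that advances the output tile indices in row-major order: tile_j wraps at n_tile_cols, and tile_i advances when it does. A plain C++ sketch of that control flow follows; process_tile is a hypothetical stand-in for the loop body, not a library function:

    #include <cstdint>

    // Sketch only: mirrors add x21, x5, #0x1 / add x6, x6, #0x1 /
    // cmp x6, x19 / csel x6, x6, XZR, LT / csel x5, x5, x21, LT /
    // cmp x5, x20 / blt 1b from the "Channel tail" above.
    void walk_tiles(uint64_t n_tile_rows, uint64_t n_tile_cols)
    {
      uint64_t tile_i = 0, tile_j = 0;
      do {
        // ... body of the "1:" tile loop: recompute base pointers, run
        // the channel loop and channel tail for tile (tile_i, tile_j) ...
        const uint64_t next_i = tile_i + 1;  // add x21, x5, #0x1
        tile_j += 1;                         // add x6, x6, #0x1
        if (tile_j >= n_tile_cols) {         // cmp x6, x19
          tile_j = 0;                        // csel x6, x6, XZR, LT
          tile_i = next_i;                   // csel x5, x5, x21, LT
        }
      } while (tile_i < n_tile_rows);        // cmp x5, x20 ; blt 1b
    }
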
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..06b3575
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[25];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[1];
+      inptrs[3] = input_ptrs[3];
+      inptrs[4] = input_ptrs[4];
+      inptrs[5] = input_ptrs[5];
+      inptrs[6] = input_ptrs[6];
+      inptrs[7] = input_ptrs[2];
+      inptrs[8] = input_ptrs[8];
+      inptrs[9] = input_ptrs[9];
+      inptrs[10] = input_ptrs[7];
+      inptrs[11] = input_ptrs[15];
+      inptrs[12] = input_ptrs[10];
+      inptrs[13] = input_ptrs[16];
+      inptrs[14] = input_ptrs[11];
+      inptrs[15] = input_ptrs[18];
+      inptrs[16] = input_ptrs[13];
+      inptrs[17] = input_ptrs[19];
+      inptrs[18] = input_ptrs[20];
+      inptrs[19] = input_ptrs[14];
+      inptrs[20] = input_ptrs[21];
+      inptrs[21] = input_ptrs[17];
+      inptrs[22] = input_ptrs[23];
+      inptrs[23] = input_ptrs[22];
+      inptrs[24] = input_ptrs[24];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ptrue p3.b\n"
+    "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "cntb x12, ALL, MUL #2\n"
+    "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "mov x11, #0x0\n"
+    "ldp x10, x9, [x19, #0x0]\n"
+    "cntw x28\n"
+    "ldp x27, x26, [x19, #0x10]\n"
+    "sub x25, XZR, x28\n"
+    "ld1w { z17.s }, p3/Z, [x14]\n"
+    "mov z31.d, z17.d\n"
+    "ld1w { z0.s }, p3/Z, [x14, #1, MUL VL]\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "mov z30.d, z17.d\n"
+    "ld1w { z1.s }, p3/Z, [x14, #2, MUL VL]\n"
+    "cmp x28, %x[n_channels]\n"
+    "mov z29.d, z17.d\n"
+    "ld1w { z2.s }, p3/Z, [x14, #3, MUL VL]\n"
+    "mov z28.d, z17.d\n"
+    "ld1w { z3.s }, p3/Z, [x14, #4, MUL VL]\n"
+    "ld1w { z4.s }, p3/Z, [x14, #5, MUL VL]\n"
+    "ld1w { z5.s }, p3/Z, [x14, #6, MUL VL]\n"
+    "ld1w { z6.s }, p3/Z, [x14, #7, MUL VL]\n"
+    "addvl x14, x14, #16\n"
+    "ldp x21, x20, [x13, #0x0]\n"
+    "ld1w { z7.s }, p3/Z, [x14, #-8, MUL VL]\n"
+    "ld1w { z8.s }, p3/Z, [x14, #-7, MUL VL]\n"
+    "addvl x14, x14, #-6\n"
+    "ld1w { z9.s }, p2/Z, [x21, x11, LSL #2]\n"
+    "prfm pldl1keep, [x21, x12]\n"
+    "ld1w { z10.s }, p2/Z, [x20, x11, LSL #2]\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "ldp x24, x23, [x13, #0x10]\n"
+    "ldp x22, x21, [x13, #0x20]\n"
+    "ldp x20, x19, [x13, #0x30]\n"
+    "ld1w { z11.s }, p2/Z, [x24, x11, LSL #2]\n"
+    "prfm pldl1keep, [x24, x12]\n"
+    "ld1w { z12.s }, p2/Z, [x23, x11, LSL #2]\n"
+    "prfm pldl1keep, [x23, x12]\n"
+    "ld1w { z13.s }, p2/Z, [x22, x11, LSL #2]\n"
+    "prfm pldl1keep, [x22, x12]\n"
+    "ld1w { z14.s }, p2/Z, [x21, x11, LSL #2]\n"
+    "prfm pldl1keep, [x21, x12]\n"
+    "ld1w { z15.s }, p2/Z, [x20, x11, LSL #2]\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "ld1w { z16.s }, p2/Z, [x19, x11, LSL #2]\n"
+    "prfm pldl1keep, [x19, x12]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "ldr x22, [x13, #0x40]\n"
+    "whilelt p1.s, x28, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z6.s, z9.s\n"
+    "ldr x21, [x13, #0x48]\n"
+    "incw x25\n"
+    "fmla z29.s, p3/M, z2.s, z9.s\n"
+    "ldr x20, [x13, #0x50]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.s, p3/M, z0.s, z9.s\n"
+    "prfm pldl1keep, [x22, x12]\n"
+    "ldr x19, [x13, #0x58]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "prfm pldl1keep, [x21, x12]\n"
+    "fmla z30.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x21, x11, LSL #2]\n"
+    "fmla z31.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x22, x11, LSL #2]\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmla z30.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x20, x11, LSL #2]\n"
+    "fmla z31.s, p3/M, z3.s, z14.s\n"
+    "ld1w { z14.s }, p2/Z, [x19, x11, LSL #2]\n"
+    "prfm pldl1keep, [x19, x12]\n"
+    "fmla z30.s, p3/M, z0.s, z16.s\n"
+    "ldr x21, [x13, #0x60]\n"
+    "fmla z29.s, p3/M, z3.s, z14.s\n"
+    "ldr x20, [x13, #0x68]\n"
+    "ldr x19, [x13, #0x70]\n"
+    "fmla z31.s, p3/M, z4.s, z15.s\n"
+    "ld1w { z15.s }, p2/Z, [x21, x11, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z11.s\n"
+    "prfm pldl1keep, [x21, x12]\n"
+    "fmla z29.s, p3/M, z0.s, z15.s\n"
+    "ld1w { z11.s }, p2/Z, [x20, x11, LSL #2]\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmla z31.s, p3/M, z2.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x19, x11, LSL #2]\n"
+    "fmla z30.s, p3/M, z5.s, z12.s\n"
+    "prfm pldl1keep, [x19, x12]\n"
+    "fmla z29.s, p3/M, z4.s, z11.s\n"
+    "ldr x19, [x13, #0x78]\n"
+    "ldr x21, [x13, #0x80]\n"
+    "fmla z31.s, p3/M, z5.s, z13.s\n"
+    "ldr x20, [x13, #0x88]\n"
+    "fmla z30.s, p3/M, z3.s, z13.s\n"
+    "ldr x24, [x13, #0x90]\n"
+    "fmla z29.s, p3/M, z1.s, z16.s\n"
+    "ld1w { z13.s }, p2/Z, [x19, x11, LSL #2]\n"
+    "prfm pldl1keep, [x19, x12]\n"
+    "fmla z31.s, p3/M, z6.s, z15.s\n"
+    "ld1w { z12.s }, p2/Z, [x21, x11, LSL #2]\n"
+    "fmla z28.s, p3/M, z4.s, z13.s\n"
+    "prfm pldl1keep, [x21, x12]\n"
+    "ld1w { z14.s }, p2/Z, [x20, x11, LSL #2]\n"
+    "fmla z30.s, p3/M, z7.s, z12.s\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmla z31.s, p3/M, z7.s, z16.s\n"
+    "ld1w { z15.s }, p2/Z, [x24, x11, LSL #2]\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x24, x12]\n"
+    "ldr x23, [x13, #0x98]\n"
+    "fmla z29.s, p3/M, z6.s, z15.s\n"
+    "ldr x22, [x13, #0xa0]\n"
+    "fmax z31.s, p3/M, z31.s, z19.s\n"
+    "ldr x21, [x13, #0xa8]\n"
+    "fmla z28.s, p3/M, z5.s, z14.s\n"
+    "ld1w { z11.s }, p2/Z, [x23, x11, LSL #2]\n"
+    "prfm pldl1keep, [x23, x12]\n"
+    "fmin z31.s, p3/M, z31.s, z18.s\n"
+    "ld1w { z13.s }, p2/Z, [x22, x11, LSL #2]\n"
+    "prfm pldl1keep, [x22, x12]\n"
+    "fmla z30.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z16.s }, p2/Z, [x21, x11, LSL #2]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "prfm pldl1keep, [x21, x12]\n"
+    "fmla z29.s, p3/M, z7.s, z13.s\n"
+    "ldr x20, [x13, #0xb0]\n"
+    "fmax z30.s, p3/M, z30.s, z19.s\n"
+    "ldr x19, [x13, #0xb8]\n"
+    "ldr x22, [x13, #0xc0]\n"
+    "fmla z28.s, p3/M, z3.s, z16.s\n"
+    "ld1w { z14.s }, p2/Z, [x20, x11, LSL #2]\n"
+    "fmla z29.s, p3/M, z5.s, z16.s\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmin z30.s, p3/M, z30.s, z18.s\n"
+    "ld1w { z15.s }, p2/Z, [x19, x11, LSL #2]\n"
+    "prfm pldl1keep, [x19, x12]\n"
+    "fmla z28.s, p3/M, z7.s, z14.s\n"
+    "ld1w { z11.s }, p2/Z, [x22, x11, LSL #2]\n"
+    "fmla z29.s, p3/M, z8.s, z15.s\n"
+    "prfm pldl1keep, [x22, x12]\n"
+    "incw x11\n"
+    "fmla z28.s, p3/M, z6.s, z15.s\n"
+    "ldp x21, x20, [x13, #0x0]\n"
+    "whilelt p2.s, x11, %x[n_channels]\n"
+    "fmax z29.s, p3/M, z29.s, z19.s\n"
+    "ldp x24, x23, [x13, #0x10]\n"
+    "addvl x12, x12, #1\n"
+    "fmla z28.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z9.s }, p1/Z, [x21, x28, LSL #2]\n"
+    "prfm pldl1keep, [x21, x12]\n"
+    "fmin z29.s, p3/M, z29.s, z18.s\n"
+    "ld1w { z10.s }, p1/Z, [x20, x28, LSL #2]\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmax z28.s, p3/M, z28.s, z19.s\n"
+    "ld1w { z11.s }, p1/Z, [x24, x28, LSL #2]\n"
+    "prfm pldl1keep, [x24, x12]\n"
+    "fmin z28.s, p3/M, z28.s, z18.s\n"
+    "ld1w { z12.s }, p1/Z, [x23, x28, LSL #2]\n"
+    "prfm pldl1keep, [x23, x12]\n"
+    "ldp x22, x21, [x13, #0x20]\n"
+    "ldp x20, x19, [x13, #0x30]\n"
+    "st1w { z31.s }, p0, [x10, x25, LSL #2]\n"
+    "ld1w { z13.s }, p1/Z, [x22, x28, LSL #2]\n"
+    "prfm pldl1keep, [x22, x12]\n"
+    "ld1w { z14.s }, p1/Z, [x21, x28, LSL #2]\n"
+    "prfm pldl1keep, [x21, x12]\n"
+    "ld1w { z15.s }, p1/Z, [x20, x28, LSL #2]\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "ld1w { z16.s }, p1/Z, [x19, x28, LSL #2]\n"
+    "incw x28\n"
+    "prfm pldl1keep, [x19, x12]\n"
+    "cmp x28, %x[n_channels]\n"
+    "st1w { z30.s }, p0, [x9, x25, LSL #2]\n"
+    "st1w { z29.s }, p0, [x27, x25, LSL #2]\n"
+    "st1w { z28.s }, p0, [x26, x25, LSL #2]\n"
+    "ld1w { z17.s }, p3/Z, [x14]\n"
+    "mov z31.d, z17.d\n"
+    "ld1w { z0.s }, p3/Z, [x14, #1, MUL VL]\n"
+    "mov z30.d, z17.d\n"
+    "ld1w { z1.s }, p3/Z, [x14, #2, MUL VL]\n"
+    "mov z29.d, z17.d\n"
+    "ld1w { z2.s }, p3/Z, [x14, #3, MUL VL]\n"
+    "mov z28.d, z17.d\n"
+    "ld1w { z3.s }, p3/Z, [x14, #4, MUL VL]\n"
+    "ld1w { z4.s }, p3/Z, [x14, #5, MUL VL]\n"
+    "ld1w { z5.s }, p3/Z, [x14, #6, MUL VL]\n"
+    "ld1w { z6.s }, p3/Z, [x14, #7, MUL VL]\n"
+    "addvl x14, x14, #16\n"
+    "ld1w { z7.s }, p3/Z, [x14, #-8, MUL VL]\n"
+    "ld1w { z8.s }, p3/Z, [x14, #-7, MUL VL]\n"
+    "addvl x14, x14, #-6\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "ldr x22, [x13, #0x40]\n"
+    "incw x25\n"
+    "fmla z30.s, p3/M, z6.s, z9.s\n"
+    "ldr x21, [x13, #0x48]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z29.s, p3/M, z2.s, z9.s\n"
+    "ldr x20, [x13, #0x50]\n"
+    "fmla z28.s, p3/M, z0.s, z9.s\n"
+    "prfm pldl1keep, [x22, x12]\n"
+    "ldr x19, [x13, #0x58]\n"
+    "fmla z31.s, p3/M, z0.s, z10.s\n"
+    "prfm pldl1keep, [x21, x12]\n"
+    "fmla z30.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x21, x11, LSL #2]\n"
+    "fmla z31.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x22, x11, LSL #2]\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmla z30.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x20, x11, LSL #2]\n"
+    "fmla z31.s, p3/M, z3.s, z14.s\n"
+    "ld1w { z14.s }, p2/Z, [x19, x11, LSL #2]\n"
+    "prfm pldl1keep, [x19, x12]\n"
+    "fmla z30.s, p3/M, z0.s, z16.s\n"
+    "ldr x21, [x13, #0x60]\n"
+    "fmla z29.s, p3/M, z3.s, z14.s\n"
+    "ldr x20, [x13, #0x68]\n"
+    "ldr x19, [x13, #0x70]\n"
+    "fmla z31.s, p3/M, z4.s, z15.s\n"
+    "ld1w { z15.s }, p2/Z, [x21, x11, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z11.s\n"
+    "prfm pldl1keep, [x21, x12]\n"
+    "fmla z29.s, p3/M, z0.s, z15.s\n"
+    "ld1w { z11.s }, p2/Z, [x20, x11, LSL #2]\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmla z31.s, p3/M, z2.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x19, x11, LSL #2]\n"
+    "fmla z30.s, p3/M, z5.s, z12.s\n"
+    "prfm pldl1keep, [x19, x12]\n"
+    "fmla z29.s, p3/M, z4.s, z11.s\n"
+    "ldr x19, [x13, #0x78]\n"
+    "ldr x21, [x13, #0x80]\n"
+    "fmla z31.s, p3/M, z5.s, z13.s\n"
+    "ldr x20, [x13, #0x88]\n"
+    "fmla z30.s, p3/M, z3.s, z13.s\n"
+    "ldr x24, [x13, #0x90]\n"
+    "fmla z29.s, p3/M, z1.s, z16.s\n"
+    "ld1w { z13.s }, p2/Z, [x19, x11, LSL #2]\n"
+    "prfm pldl1keep, [x19, x12]\n"
+    "fmla z31.s, p3/M, z6.s, z15.s\n"
+    "ld1w { z12.s }, p2/Z, [x21, x11, LSL #2]\n"
+    "fmla z28.s, p3/M, z4.s, z13.s\n"
+    "prfm pldl1keep, [x21, x12]\n"
+    "ld1w { z14.s }, p2/Z, [x20, x11, LSL #2]\n"
+    "fmla z30.s, p3/M, z7.s, z12.s\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmla z31.s, p3/M, z7.s, z16.s\n"
+    "ld1w { z15.s }, p2/Z, [x24, x11, LSL #2]\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x24, x12]\n"
+    "ldr x23, [x13, #0x98]\n"
+    "fmla z29.s, p3/M, z6.s, z15.s\n"
+    "ldr x22, [x13, #0xa0]\n"
+    "fmax z31.s, p3/M, z31.s, z19.s\n"
+    "ldr x21, [x13, #0xa8]\n"
+    "fmla z28.s, p3/M, z5.s, z14.s\n"
+    "ld1w { z11.s }, p2/Z, [x23, x11, LSL #2]\n"
+    "prfm pldl1keep, [x23, x12]\n"
+    "fmin z31.s, p3/M, z31.s, z18.s\n"
+    "ld1w { z13.s }, p2/Z, [x22, x11, LSL #2]\n"
+    "prfm pldl1keep, [x22, x12]\n"
+    "fmla z30.s, p3/M, z8.s, z11.s\n"
+    "ld1w { z16.s }, p2/Z, [x21, x11, LSL #2]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "prfm pldl1keep, [x21, x12]\n"
+    "fmla z29.s, p3/M, z7.s, z13.s\n"
+    "ldr x20, [x13, #0xb0]\n"
+    "fmax z30.s, p3/M, z30.s, z19.s\n"
+    "ldr x19, [x13, #0xb8]\n"
+    "ldr x22, [x13, #0xc0]\n"
+    "fmla z28.s, p3/M, z3.s, z16.s\n"
+    "ld1w { z14.s }, p2/Z, [x20, x11, LSL #2]\n"
+    "fmla z29.s, p3/M, z5.s, z16.s\n"
+    "prfm pldl1keep, [x20, x12]\n"
+    "fmin z30.s, p3/M, z30.s, z18.s\n"
+    "ld1w { z15.s }, p2/Z, [x19, x11, LSL #2]\n"
+    "prfm pldl1keep, [x19, x12]\n"
+    "fmla z28.s, p3/M, z7.s, z14.s\n"
+    "ld1w { z11.s }, p2/Z, [x22, x11, LSL #2]\n"
+    "fmla z29.s, p3/M, z8.s, z15.s\n"
+    "prfm pldl1keep, [x22, x12]\n"
+    "st1w { z31.s }, p0, [x10, x25, LSL #2]\n"
+    "fmla z28.s, p3/M, z6.s, z15.s\n"
+    "st1w { z30.s }, p0, [x9, x25, LSL #2]\n"
+    "fmax z29.s, p3/M, z29.s, z19.s\n"
+    "fmla z28.s, p3/M, z8.s, z11.s\n"
+    "fmin z29.s, p3/M, z29.s, z18.s\n"
+    "st1w { z29.s }, p0, [x27, x25, LSL #2]\n"
+    "fmax z28.s, p3/M, z28.s, z19.s\n"
+    "fmin z28.s, p3/M, z28.s, z18.s\n"
+    "st1w { z28.s }, p0, [x26, x25, LSL #2]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
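
Two details of the indirect variant that just ended are easy to miss. First, the Args constructor copies input_ptrs into inptrs in a permuted order (inptrs[0] = input_ptrs[12], and so on), matching the order in which the ldp/ldr instructions walk the [x13, #0x...] table, so the assembly reads its pointers sequentially. Second, the table holds 25 pointers because a 3x3 stride-2 kernel producing a 2x2 output reads a 5x5 input patch. Below is a sketch of how a caller might gather such a table, row-major over the patch; gather_input_ptrs, pad_row and the bounds parameters are illustrative assumptions, not names from this patch:

    #include <cstdint>

    // Sketch only: one pointer per input-patch position.  Positions that
    // fall outside the image point at a shared zeroed buffer, so the
    // kernel itself never has to branch on padding.
    void gather_input_ptrs(const float *input_ptrs[25], const float *input,
                           int64_t ld_row, int64_t ld_col,
                           int out_i, int out_j, int pad_top, int pad_left,
                           int in_rows, int in_cols, const float *pad_row)
    {
      for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
          const int y = out_i * 2 - pad_top + i;   // stride_rows == 2
          const int x = out_j * 2 - pad_left + j;  // stride_cols == 2
          const bool ok = y >= 0 && y < in_rows && x >= 0 && x < in_cols;
          input_ptrs[i * 5 + j] =
              ok ? input + y * ld_row + x * ld_col : pad_row;
        }
      }
    }
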
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..d49f7fd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  indirect_kern_type indirect_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+  sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
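
The constants in this strategy header tie together as input = (output - 1) * stride + kernel: with a 5x5 kernel at stride 1 and a 2x2 output tile, each tile reads a 6x6 input patch, since (2 - 1) * 1 + 5 = 6. A compile-time statement of that invariant, as a sketch one could place after including this header (it is not something the patch itself adds):

    using S = arm_conv::depthwise::sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst;
    static_assert(S::input_rows ==
                  (S::output_rows - 1) * S::stride_rows + S::kernel_rows,
                  "depthfirst tile: input = (output - 1) * stride + kernel");
    static_assert(S::input_cols ==
                  (S::output_cols - 1) * S::stride_cols + S::kernel_cols,
                  "same relation along the column axis");
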
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000..f751186
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,531 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x5, #0x0\n"
+    "mov x6, #0x0\n"
+    "1:"  // Tile loop
+    "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x20, #0x2\n"
+    "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x7, #0x2\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+    "mov x17, #0x0\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "cntw x16\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "sub x14, XZR, x16\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x5, x22\n" // offset = tile_i * ld_input_row
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x6, x15, x19\n" // offset += tile_j * ld_input_col
+    "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x20\n" // offset *= kernel_stride * output_size
+    "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x13, x13, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "add x20, x13, x22, LSL #2\n"
+    "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x10, x20, x22, LSL #2\n"
+    "ld1w { z16.s }, p3/Z, [x8]\n"
+    "mov z31.d, z16.d\n"
+    "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+    "add x9, x10, x22, LSL #2\n"
+    "mov z30.d, z16.d\n"
+    "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+    "add x28, x9, x22, LSL #2\n"
+    "mov z29.d, z16.d\n"
+    "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+    "add x27, x28, x22, LSL #2\n"
+    "mov z28.d, z16.d\n"
+    "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+    "add x26, x15, x15\n"
+    "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+    "add x25, x26, x15\n"
+    "mul x19, x5, x21\n" // offset = tile_i * ld_output_row
+    "add x24, x25, x15\n"
+    "add x23, x24, x15\n"
+    "madd x19, x6, x12, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x7\n" // offset *= output_tile_size
+    "add x11, x11, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "add x22, x11, x21, LSL #2\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "ld1w { z5.s }, p2/Z, [x13]\n"
+    "ld1w { z6.s }, p2/Z, [x13, x15, LSL #2]\n"
+    "cmp x16, %x[n_channels]\n"
+    "ld1w { z7.s }, p2/Z, [x20]\n"
+    "addvl x8, x8, #6\n"
+    "ld1w { z8.s }, p2/Z, [x20, x15, LSL #2]\n"
+    "ld1w { z9.s }, p2/Z, [x13, x26, LSL #2]\n"
+    "ld1w { z13.s }, p2/Z, [x20, x26, LSL #2]\n"
+    "ld1w { z11.s }, p2/Z, [x13, x25, LSL #2]\n"
+    "ld1w { z12.s }, p2/Z, [x13, x24, LSL #2]\n"
+    "ld1w { z10.s }, p2/Z, [x20, x23, LSL #2]\n"
+    "ld1w { z14.s }, p2/Z, [x10]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "fmla z31.s, p3/M, z0.s, z5.s\n"
+    "ld1w { z5.s }, p2/Z, [x20, x25, LSL #2]\n"
+    "whilelt p1.s, x16, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z0.s, z6.s\n"
+    "incw x14\n"
+    "fmla z29.s, p3/M, z0.s, z7.s\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.s, p3/M, z0.s, z8.s\n"
+    "ld1w { z0.s }, p3/Z, [x8]\n"
+    "incw x17\n"
+    "fmla z31.s, p3/M, z1.s, z6.s\n"
+    "ld1w { z6.s }, p2/Z, [x20, x24, LSL #2]\n"
+    "addvl x20, x20, #1\n"
+    "fmla z30.s, p3/M, z1.s, z9.s\n"
+    "incw x16\n"
+    "fmla z29.s, p3/M, z1.s, z8.s\n"
+    "fmla z28.s, p3/M, z1.s, z13.s\n"
+    "ld1w { z1.s }, p3/Z, [x8, #1, MUL VL]\n"
+    "fmla z31.s, p3/M, z2.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x13, x23, LSL #2]\n"
+    "addvl x13, x13, #1\n"
+    "fmla z30.s, p3/M, z2.s, z11.s\n"
+    "fmla z29.s, p3/M, z2.s, z13.s\n"
+    "fmla z28.s, p3/M, z2.s, z5.s\n"
+    "ld1w { z2.s }, p3/Z, [x8, #2, MUL VL]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x10, x15, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z12.s\n"
+    "fmla z29.s, p3/M, z3.s, z5.s\n"
+    "fmla z28.s, p3/M, z3.s, z6.s\n"
+    "ld1w { z3.s }, p3/Z, [x8, #3, MUL VL]\n"
+    "fmla z31.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x10, x25, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z6.s\n"
+    "fmla z28.s, p3/M, z4.s, z10.s\n"
+    "ld1w { z4.s }, p3/Z, [x8, #4, MUL VL]\n"
+    "fmla z31.s, p3/M, z0.s, z7.s\n"
+    "ld1w { z7.s }, p1/Z, [x20]\n"
+    "fmla z30.s, p3/M, z0.s, z8.s\n"
+    "fmla z29.s, p3/M, z0.s, z14.s\n"
+    "fmla z28.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z0.s }, p3/Z, [x8, #5, MUL VL]\n"
+    "fmla z31.s, p3/M, z1.s, z8.s\n"
+    "ld1w { z8.s }, p2/Z, [x10, x23, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z13.s\n"
+    "fmla z29.s, p3/M, z1.s, z11.s\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z1.s }, p3/Z, [x8, #6, MUL VL]\n"
+    "fmla z31.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x10, x24, LSL #2]\n"
+    "addvl x10, x10, #1\n"
+    "fmla z30.s, p3/M, z2.s, z5.s\n"
+    "fmla z29.s, p3/M, z2.s, z12.s\n"
+    "fmla z28.s, p3/M, z2.s, z9.s\n"
+    "ld1w { z2.s }, p3/Z, [x8, #7, MUL VL]\n"
+    "addvl x8, x8, #16\n"
+    "fmla z31.s, p3/M, z3.s, z5.s\n"
+    "ld1w { z5.s }, p2/Z, [x9]\n"
+    "ld1w { z16.s }, p3/Z, [x8, #4, MUL VL]\n"
+    "fmla z30.s, p3/M, z3.s, z6.s\n"
+    "fmla z29.s, p3/M, z3.s, z9.s\n"
+    "fmla z28.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z3.s }, p3/Z, [x8, #-8, MUL VL]\n"
+    "fmla z31.s, p3/M, z4.s, z6.s\n"
+    "ld1w { z6.s }, p2/Z, [x9, x15, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x9, x26, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z13.s\n"
+    "fmla z28.s, p3/M, z4.s, z8.s\n"
+    "ld1w { z4.s }, p3/Z, [x8, #-7, MUL VL]\n"
+    "fmla z31.s, p3/M, z0.s, z14.s\n"
+    "ld1w { z14.s }, p2/Z, [x9, x23, LSL #2]\n"
+    "fmla z30.s, p3/M, z0.s, z11.s\n"
+    "fmla z29.s, p3/M, z0.s, z5.s\n"
+    "fmla z28.s, p3/M, z0.s, z6.s\n"
+    "ld1w { z0.s }, p3/Z, [x8, #-6, MUL VL]\n"
+    "fmla z31.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z12.s\n"
+    "fmla z29.s, p3/M, z1.s, z6.s\n"
+    "fmla z28.s, p3/M, z1.s, z10.s\n"
+    "ld1w { z1.s }, p3/Z, [x8, #-5, MUL VL]\n"
+    "fmla z31.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x9, x24, LSL #2]\n"
+    "addvl x9, x9, #1\n"
+    "fmla z30.s, p3/M, z2.s, z9.s\n"
+    "fmla z29.s, p3/M, z2.s, z10.s\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z2.s }, p3/Z, [x8, #-4, MUL VL]\n"
+    "fmla z31.s, p3/M, z3.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x28]\n"
+    "fmla z30.s, p3/M, z3.s, z13.s\n"
+    "fmla z29.s, p3/M, z3.s, z11.s\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "ld1w { z3.s }, p3/Z, [x8, #-3, MUL VL]\n"
+    "fmla z31.s, p3/M, z4.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x28, x15, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z8.s\n"
+    "ld1w { z8.s }, p2/Z, [x28, x24, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z12.s\n"
+    "fmla z28.s, p3/M, z4.s, z14.s\n"
+    "ld1w { z4.s }, p3/Z, [x8, #-2, MUL VL]\n"
+    "fmla z31.s, p3/M, z0.s, z5.s\n"
+    "ld1w { z5.s }, p2/Z, [x28, x26, LSL #2]\n"
+    "fmla z30.s, p3/M, z0.s, z6.s\n"
+    "fmla z29.s, p3/M, z0.s, z9.s\n"
+    "fmla z28.s, p3/M, z0.s, z13.s\n"
+    "ld1w { z0.s }, p3/Z, [x8, #-1, MUL VL]\n"
+    "fmla z31.s, p3/M, z1.s, z6.s\n"
+    "ld1w { z6.s }, p2/Z, [x28, x25, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z10.s\n"
+    "fmla z29.s, p3/M, z1.s, z13.s\n"
+    "fmla z28.s, p3/M, z1.s, z5.s\n"
+    "ld1w { z1.s }, p3/Z, [x8]\n"
+    "fmla z31.s, p3/M, z2.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x28, x23, LSL #2]\n"
+    "addvl x28, x28, #1\n"
+    "fmla z30.s, p3/M, z2.s, z11.s\n"
+    "fmla z29.s, p3/M, z2.s, z5.s\n"
+    "fmla z28.s, p3/M, z2.s, z6.s\n"
+    "ld1w { z2.s }, p3/Z, [x8, #1, MUL VL]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x27]\n"
+    "fmla z30.s, p3/M, z3.s, z12.s\n"
+    "fmla z29.s, p3/M, z3.s, z6.s\n"
+    "fmla z28.s, p3/M, z3.s, z8.s\n"
+    "ld1w { z3.s }, p3/Z, [x8, #2, MUL VL]\n"
+    "fmla z31.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x27, x15, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z14.s\n"
+    "ld1w { z14.s }, p1/Z, [x10]\n"
+    "fmla z29.s, p3/M, z4.s, z8.s\n"
+    "fmla z28.s, p3/M, z4.s, z10.s\n"
+    "ld1w { z4.s }, p3/Z, [x8, #3, MUL VL]\n"
+    "fmla z31.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x27, x26, LSL #2]\n"
+    "fmla z30.s, p3/M, z0.s, z13.s\n"
+    "fmla z29.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x27, x25, LSL #2]\n"
+    "fmla z28.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z0.s }, p3/Z, [x8, #5, MUL VL]\n"
+    "fmla z31.s, p3/M, z1.s, z13.s\n"
+    "ld1w { z13.s }, p1/Z, [x20, x26, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z5.s\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x27, x24, LSL #2]\n"
+    "fmla z28.s, p3/M, z1.s, z9.s\n"
+    "ld1w { z1.s }, p3/Z, [x8, #6, MUL VL]\n"
+    "fmla z31.s, p3/M, z2.s, z5.s\n"
+    "ld1w { z5.s }, p1/Z, [x13]\n"
+    "fmla z30.s, p3/M, z2.s, z6.s\n"
+    "fmla z29.s, p3/M, z2.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x27, x23, LSL #2]\n"
+    "whilelt p2.s, x17, %x[n_channels]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z2.s }, p3/Z, [x8, #7, MUL VL]\n"
+    "addvl x27, x27, #1\n"
+    "fmla z31.s, p3/M, z3.s, z6.s\n"
+    "ld1w { z6.s }, p1/Z, [x13, x15, LSL #2]\n"
+    "addvl x8, x8, #16\n"
+    "fmla z30.s, p3/M, z3.s, z8.s\n"
+    "cmp x16, %x[n_channels]\n"
+    "fmla z29.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p1/Z, [x13, x25, LSL #2]\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "ld1w { z3.s }, p3/Z, [x8, #-8, MUL VL]\n"
+    "fmla z31.s, p3/M, z4.s, z8.s\n"
+    "ld1w { z8.s }, p1/Z, [x20, x15, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "ld1w { z10.s }, p1/Z, [x20, x23, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z12.s }, p1/Z, [x13, x24, LSL #2]\n"
+    "fmla z28.s, p3/M, z4.s, z9.s\n"
+    "ld1w { z9.s }, p1/Z, [x13, x26, LSL #2]\n"
+    "ld1w { z4.s }, p3/Z, [x8, #-7, MUL VL]\n"
+    "fmax z31.s, p3/M, z31.s, z18.s\n"
+    "addvl x8, x8, #-6\n"
+    "fmax z30.s, p3/M, z30.s, z18.s\n"
+    "fmax z29.s, p3/M, z29.s, z18.s\n"
+    "fmax z28.s, p3/M, z28.s, z18.s\n"
+    "fmin z31.s, p3/M, z31.s, z17.s\n"
+    "st1w { z31.s }, p0, [x11]\n"
+    "mov z31.d, z16.d\n"
+    "fmin z30.s, p3/M, z30.s, z17.s\n"
+    "st1w { z30.s }, p0, [x11, x12, LSL #2]\n"
+    "mov z30.d, z16.d\n"
+    "addvl x11, x11, #1\n"
+    "fmin z29.s, p3/M, z29.s, z17.s\n"
+    "st1w { z29.s }, p0, [x22]\n"
+    "mov z29.d, z16.d\n"
+    "fmin z28.s, p3/M, z28.s, z17.s\n"
+    "st1w { z28.s }, p0, [x22, x12, LSL #2]\n"
+    "mov z28.d, z16.d\n"
+    "addvl x22, x22, #1\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "fmla z31.s, p3/M, z0.s, z5.s\n"
+    "ld1w { z5.s }, p2/Z, [x20, x25, LSL #2]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z30.s, p3/M, z0.s, z6.s\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x5, #0x1\n"
+    "fmla z29.s, p3/M, z0.s, z7.s\n"
+    "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "fmla z28.s, p3/M, z0.s, z8.s\n"
+    "ld1w { z0.s }, p3/Z, [x8]\n"
+    "add x6, x6, #0x1\n"
+    "fmla z31.s, p3/M, z1.s, z6.s\n"
+    "ld1w { z6.s }, p2/Z, [x20, x24, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z9.s\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z29.s, p3/M, z1.s, z8.s\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x6, x19\n"
+    "fmla z28.s, p3/M, z1.s, z13.s\n"
+    "ld1w { z1.s }, p3/Z, [x8, #1, MUL VL]\n"
+    "fmla z31.s, p3/M, z2.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x13, x23, LSL #2]\n"
+    "csel x6, x6, XZR, LT\n"
+    "fmla z30.s, p3/M, z2.s, z11.s\n"
+    "csel x5, x5, x21, LT\n"
+    "fmla z29.s, p3/M, z2.s, z13.s\n"
+    "cmp x5, x20\n"
+    "fmla z28.s, p3/M, z2.s, z5.s\n"
+    "ld1w { z2.s }, p3/Z, [x8, #2, MUL VL]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x10, x15, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z12.s\n"
+    "fmla z29.s, p3/M, z3.s, z5.s\n"
+    "fmla z28.s, p3/M, z3.s, z6.s\n"
+    "ld1w { z3.s }, p3/Z, [x8, #3, MUL VL]\n"
+    "fmla z31.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x10, x25, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z6.s\n"
+    "fmla z28.s, p3/M, z4.s, z10.s\n"
+    "ld1w { z4.s }, p3/Z, [x8, #4, MUL VL]\n"
+    "fmla z31.s, p3/M, z0.s, z7.s\n"
+    "fmla z30.s, p3/M, z0.s, z8.s\n"
+    "fmla z29.s, p3/M, z0.s, z14.s\n"
+    "fmla z28.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z0.s }, p3/Z, [x8, #5, MUL VL]\n"
+    "fmla z31.s, p3/M, z1.s, z8.s\n"
+    "ld1w { z8.s }, p2/Z, [x10, x23, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z13.s\n"
+    "fmla z29.s, p3/M, z1.s, z11.s\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z1.s }, p3/Z, [x8, #6, MUL VL]\n"
+    "fmla z31.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x10, x24, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z5.s\n"
+    "fmla z29.s, p3/M, z2.s, z12.s\n"
+    "fmla z28.s, p3/M, z2.s, z9.s\n"
+    "ld1w { z2.s }, p3/Z, [x8, #7, MUL VL]\n"
+    "addvl x8, x8, #16\n"
+    "fmla z31.s, p3/M, z3.s, z5.s\n"
+    "ld1w { z5.s }, p2/Z, [x9]\n"
+    "fmla z30.s, p3/M, z3.s, z6.s\n"
+    "fmla z29.s, p3/M, z3.s, z9.s\n"
+    "fmla z28.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z3.s }, p3/Z, [x8, #-8, MUL VL]\n"
+    "fmla z31.s, p3/M, z4.s, z6.s\n"
+    "ld1w { z6.s }, p2/Z, [x9, x15, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x9, x26, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z13.s\n"
+    "fmla z28.s, p3/M, z4.s, z8.s\n"
+    "ld1w { z4.s }, p3/Z, [x8, #-7, MUL VL]\n"
+    "fmla z31.s, p3/M, z0.s, z14.s\n"
+    "ld1w { z14.s }, p2/Z, [x9, x23, LSL #2]\n"
+    "fmla z30.s, p3/M, z0.s, z11.s\n"
+    "fmla z29.s, p3/M, z0.s, z5.s\n"
+    "fmla z28.s, p3/M, z0.s, z6.s\n"
+    "ld1w { z0.s }, p3/Z, [x8, #-6, MUL VL]\n"
+    "fmla z31.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z12.s\n"
+    "fmla z29.s, p3/M, z1.s, z6.s\n"
+    "fmla z28.s, p3/M, z1.s, z10.s\n"
+    "ld1w { z1.s }, p3/Z, [x8, #-5, MUL VL]\n"
+    "fmla z31.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x9, x24, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z9.s\n"
+    "fmla z29.s, p3/M, z2.s, z10.s\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z2.s }, p3/Z, [x8, #-4, MUL VL]\n"
+    "fmla z31.s, p3/M, z3.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x28]\n"
+    "fmla z30.s, p3/M, z3.s, z13.s\n"
+    "fmla z29.s, p3/M, z3.s, z11.s\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "ld1w { z3.s }, p3/Z, [x8, #-3, MUL VL]\n"
+    "fmla z31.s, p3/M, z4.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x28, x15, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z8.s\n"
+    "ld1w { z8.s }, p2/Z, [x28, x24, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z12.s\n"
+    "fmla z28.s, p3/M, z4.s, z14.s\n"
+    "ld1w { z4.s }, p3/Z, [x8, #-2, MUL VL]\n"
+    "fmla z31.s, p3/M, z0.s, z5.s\n"
+    "ld1w { z5.s }, p2/Z, [x28, x26, LSL #2]\n"
+    "fmla z30.s, p3/M, z0.s, z6.s\n"
+    "fmla z29.s, p3/M, z0.s, z9.s\n"
+    "fmla z28.s, p3/M, z0.s, z13.s\n"
+    "ld1w { z0.s }, p3/Z, [x8, #-1, MUL VL]\n"
+    "fmla z31.s, p3/M, z1.s, z6.s\n"
+    "ld1w { z6.s }, p2/Z, [x28, x25, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z10.s\n"
+    "fmla z29.s, p3/M, z1.s, z13.s\n"
+    "fmla z28.s, p3/M, z1.s, z5.s\n"
+    "ld1w { z1.s }, p3/Z, [x8]\n"
+    "fmla z31.s, p3/M, z2.s, z10.s\n"
+    "ld1w { z10.s }, p2/Z, [x28, x23, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z11.s\n"
+    "fmla z29.s, p3/M, z2.s, z5.s\n"
+    "fmla z28.s, p3/M, z2.s, z6.s\n"
+    "ld1w { z2.s }, p3/Z, [x8, #1, MUL VL]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x27]\n"
+    "fmla z30.s, p3/M, z3.s, z12.s\n"
+    "fmla z29.s, p3/M, z3.s, z6.s\n"
+    "fmla z28.s, p3/M, z3.s, z8.s\n"
+    "ld1w { z3.s }, p3/Z, [x8, #2, MUL VL]\n"
+    "fmla z31.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x27, x15, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z14.s\n"
+    "fmla z29.s, p3/M, z4.s, z8.s\n"
+    "fmla z28.s, p3/M, z4.s, z10.s\n"
+    "ld1w { z4.s }, p3/Z, [x8, #3, MUL VL]\n"
+    "fmla z31.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x27, x26, LSL #2]\n"
+    "fmla z30.s, p3/M, z0.s, z13.s\n"
+    "fmla z29.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x27, x25, LSL #2]\n"
+    "fmla z28.s, p3/M, z0.s, z12.s\n"
+    "fmla z31.s, p3/M, z1.s, z13.s\n"
+    "fmla z30.s, p3/M, z1.s, z5.s\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x27, x24, LSL #2]\n"
+    "fmla z28.s, p3/M, z1.s, z9.s\n"
+    "fmla z31.s, p3/M, z2.s, z5.s\n"
+    "fmla z30.s, p3/M, z2.s, z6.s\n"
+    "fmla z29.s, p3/M, z2.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x27, x23, LSL #2]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "fmla z31.s, p3/M, z3.s, z6.s\n"
+    "fmla z30.s, p3/M, z3.s, z8.s\n"
+    "fmla z29.s, p3/M, z3.s, z11.s\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "fmla z31.s, p3/M, z4.s, z8.s\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "fmla z29.s, p3/M, z4.s, z12.s\n"
+    "fmla z28.s, p3/M, z4.s, z9.s\n"
+    "fmax z31.s, p3/M, z31.s, z18.s\n"
+    "fmax z30.s, p3/M, z30.s, z18.s\n"
+    "fmax z29.s, p3/M, z29.s, z18.s\n"
+    "fmax z28.s, p3/M, z28.s, z18.s\n"
+    "fmin z31.s, p3/M, z31.s, z17.s\n"
+    "st1w { z31.s }, p0, [x11]\n"
+    "fmin z30.s, p3/M, z30.s, z17.s\n"
+    "fmin z29.s, p3/M, z29.s, z17.s\n"
+    "st1w { z30.s }, p0, [x11, x12, LSL #2]\n"
+    "fmin z28.s, p3/M, z28.s, z17.s\n"
+    "st1w { z29.s }, p0, [x22]\n"
+    "st1w { z28.s }, p0, [x22, x12, LSL #2]\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
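
Before the indirect counterpart of this kernel, a recap of the tile addressing its preamble performs (the mul/madd instructions commented inline above): the input base pointer is rederived from the tile indices on every trip around the tile loop, and the output base is computed the same way from ld_output_row/ld_output_col with its own factor of 2. A scalar C++ sketch; tile_inptr is a hypothetical helper name, not part of the patch:

    #include <cstdint>

    // Sketch only: mirrors "offset = tile_i * ld_input_row",
    // "offset += tile_j * ld_input_col", "offset *= kernel_stride *
    // output_size".  This kernel steps by the same amount in rows and
    // columns (output_size * stride == 2 for a stride-1 2x2-output tile),
    // so the assembly folds the step into a single multiply.
    const float *tile_inptr(const float *inptr, int64_t ld_input_row,
                            int64_t ld_input_col, uint64_t tile_i,
                            uint64_t tile_j, int64_t step)
    {
      int64_t offset = int64_t(tile_i) * ld_input_row;  // mul  x19, x5, x22
      offset += int64_t(tile_j) * ld_input_col;         // madd x19, x6, x15, x19
      offset *= step;                                   // mul  x19, x19, x20
      return inptr + offset;   // add x13, x13, x19, LSL #2 (sizeof(float))
    }
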
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000..6e35ee8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,633 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[36];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[0];
+      inptrs[1] = input_ptrs[1];
+      inptrs[2] = input_ptrs[6];
+      inptrs[3] = input_ptrs[7];
+      inptrs[4] = input_ptrs[2];
+      inptrs[5] = input_ptrs[8];
+      inptrs[6] = input_ptrs[3];
+      inptrs[7] = input_ptrs[4];
+      inptrs[8] = input_ptrs[11];
+      inptrs[9] = input_ptrs[12];
+      inptrs[10] = input_ptrs[9];
+      inptrs[11] = input_ptrs[10];
+      inptrs[12] = input_ptrs[5];
+      inptrs[13] = input_ptrs[13];
+      inptrs[14] = input_ptrs[14];
+      inptrs[15] = input_ptrs[15];
+      inptrs[16] = input_ptrs[16];
+      inptrs[17] = input_ptrs[17];
+      inptrs[18] = input_ptrs[18];
+      inptrs[19] = input_ptrs[19];
+      inptrs[20] = input_ptrs[20];
+      inptrs[21] = input_ptrs[21];
+      inptrs[22] = input_ptrs[22];
+      inptrs[23] = input_ptrs[23];
+      inptrs[24] = input_ptrs[24];
+      inptrs[25] = input_ptrs[25];
+      inptrs[26] = input_ptrs[26];
+      inptrs[27] = input_ptrs[27];
+      inptrs[28] = input_ptrs[28];
+      inptrs[29] = input_ptrs[29];
+      inptrs[30] = input_ptrs[30];
+      inptrs[31] = input_ptrs[31];
+      inptrs[32] = input_ptrs[32];
+      inptrs[33] = input_ptrs[33];
+      inptrs[34] = input_ptrs[34];
+      inptrs[35] = input_ptrs[35];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ptrue p3.b\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "cntb x14, ALL, MUL #2\n"
+    "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "mov x13, #0x0\n"
+    "ldp x12, x11, [x19, #0x0]\n"
+    "cntw x10\n"
+    "ldp x9, x28, [x19, #0x10]\n"
+    "sub x27, XZR, x10\n"
+    "ld1w { z16.s }, p3/Z, [x16]\n"
+    "mov z31.d, z16.d\n"
+    "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "mov z30.d, z16.d\n"
+    "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+    "cmp x10, %x[n_channels]\n"
+    "mov z29.d, z16.d\n"
+    "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+    "mov z28.d, z16.d\n"
+    "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+    "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+    "addvl x16, x16, #6\n"
+    "ldp x26, x25, [x15, #0x0]\n"
+    "ldp x24, x23, [x15, #0x10]\n"
+    "ldp x20, x19, [x15, #0x20]\n"
+    "ld1w { z5.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "prfm pldl1keep, [x26, x14]\n"
+    "ld1w { z6.s }, p2/Z, [x25, x13, LSL #2]\n"
+    "prfm pldl1keep, [x25, x14]\n"
+    "ld1w { z7.s }, p2/Z, [x24, x13, LSL #2]\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "ld1w { z8.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "ldp x22, x21, [x15, #0x30]\n"
+    "ldp x20, x19, [x15, #0x40]\n"
+    "ld1w { z11.s }, p2/Z, [x22, x13, LSL #2]\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "ld1w { z12.s }, p2/Z, [x21, x13, LSL #2]\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "ld1w { z10.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "ld1w { z14.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "fmla z31.s, p3/M, z0.s, z5.s\n"
+    "ldr x21, [x15, #0x50]\n"
+    "whilelt p1.s, x10, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z0.s, z6.s\n"
+    "ldr x19, [x15, #0x58]\n"
+    "incw x27\n"
+    "fmla z29.s, p3/M, z0.s, z7.s\n"
+    "ldr x20, [x15, #0x60]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.s, p3/M, z0.s, z8.s\n"
+    "ld1w { z5.s }, p2/Z, [x21, x13, LSL #2]\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "fmla z31.s, p3/M, z1.s, z6.s\n"
+    "ld1w { z6.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z9.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z1.s, z8.s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla z28.s, p3/M, z1.s, z13.s\n"
+    "ld1w { z0.s }, p3/Z, [x16]\n"
+    "fmla z31.s, p3/M, z2.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z11.s\n"
+    "ldr x19, [x15, #0x68]\n"
+    "fmla z29.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z1.s }, p3/Z, [x16, #1, MUL VL]\n"
+    "fmla z28.s, p3/M, z2.s, z5.s\n"
+    "ldr x20, [x15, #0x70]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z12.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z3.s, z5.s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla z28.s, p3/M, z3.s, z6.s\n"
+    "ld1w { z2.s }, p3/Z, [x16, #2, MUL VL]\n"
+    "fmla z31.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z9.s\n"
+    "ldr x19, [x15, #0x78]\n"
+    "fmla z29.s, p3/M, z4.s, z6.s\n"
+    "ld1w { z3.s }, p3/Z, [x16, #3, MUL VL]\n"
+    "fmla z28.s, p3/M, z4.s, z10.s\n"
+    "ldr x26, [x15, #0x80]\n"
+    "fmla z31.s, p3/M, z0.s, z7.s\n"
+    "ld1w { z9.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z0.s, z8.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z0.s, z14.s\n"
+    "prfm pldl1keep, [x26, x14]\n"
+    "fmla z28.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z4.s }, p3/Z, [x16, #4, MUL VL]\n"
+    "fmla z31.s, p3/M, z1.s, z8.s\n"
+    "ldr x25, [x15, #0x88]\n"
+    "fmla z30.s, p3/M, z1.s, z13.s\n"
+    "ld1w { z0.s }, p3/Z, [x16, #5, MUL VL]\n"
+    "fmla z29.s, p3/M, z1.s, z11.s\n"
+    "ldr x24, [x15, #0x90]\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z8.s }, p2/Z, [x25, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z5.s\n"
+    "prfm pldl1keep, [x25, x14]\n"
+    "fmla z29.s, p3/M, z2.s, z12.s\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "fmla z28.s, p3/M, z2.s, z9.s\n"
+    "ldr x23, [x15, #0x98]\n"
+    "fmla z31.s, p3/M, z3.s, z5.s\n"
+    "ld1w { z5.s }, p2/Z, [x24, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z6.s\n"
+    "ld1w { z1.s }, p3/Z, [x16, #6, MUL VL]\n"
+    "fmla z29.s, p3/M, z3.s, z9.s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmla z28.s, p3/M, z3.s, z13.s\n"
+    "ldr x20, [x15, #0xa0]\n"
+    "fmla z31.s, p3/M, z4.s, z6.s\n"
+    "ld1w { z6.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "ld1w { z2.s }, p3/Z, [x16, #7, MUL VL]\n"
+    "fmla z29.s, p3/M, z4.s, z13.s\n"
+    "addvl x16, x16, #16\n"
+    "fmla z28.s, p3/M, z4.s, z8.s\n"
+    "ld1w { z10.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla z31.s, p3/M, z0.s, z14.s\n"
+    "ldr x19, [x15, #0xa8]\n"
+    "fmla z30.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z3.s }, p3/Z, [x16, #-8, MUL VL]\n"
+    "fmla z29.s, p3/M, z0.s, z5.s\n"
+    "ldr x22, [x15, #0xb0]\n"
+    "fmla z28.s, p3/M, z0.s, z6.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z31.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "fmla z29.s, p3/M, z1.s, z6.s\n"
+    "ld1w { z4.s }, p3/Z, [x16, #-7, MUL VL]\n"
+    "fmla z28.s, p3/M, z1.s, z10.s\n"
+    "ldr x21, [x15, #0xb8]\n"
+    "fmla z31.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x22, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z9.s\n"
+    "ld1w { z0.s }, p3/Z, [x16, #-6, MUL VL]\n"
+    "fmla z29.s, p3/M, z2.s, z10.s\n"
+    "ld1w { z14.s }, p2/Z, [x21, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "fmla z31.s, p3/M, z3.s, z9.s\n"
+    "ldr x20, [x15, #0xc0]\n"
+    "fmla z30.s, p3/M, z3.s, z13.s\n"
+    "ldr x19, [x15, #0xc8]\n"
+    "fmla z29.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z1.s }, p3/Z, [x16, #-5, MUL VL]\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z4.s, z13.s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla z30.s, p3/M, z4.s, z8.s\n"
+    "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z12.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z28.s, p3/M, z4.s, z14.s\n"
+    "ldr x21, [x15, #0xd0]\n"
+    "fmla z31.s, p3/M, z0.s, z5.s\n"
+    "ld1w { z2.s }, p3/Z, [x16, #-4, MUL VL]\n"
+    "fmla z30.s, p3/M, z0.s, z6.s\n"
+    "ldr x19, [x15, #0xd8]\n"
+    "fmla z29.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z5.s }, p2/Z, [x21, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z0.s, z13.s\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "fmla z31.s, p3/M, z1.s, z6.s\n"
+    "ld1w { z6.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z10.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z1.s, z13.s\n"
+    "ld1w { z3.s }, p3/Z, [x16, #-3, MUL VL]\n"
+    "fmla z28.s, p3/M, z1.s, z5.s\n"
+    "ldr x20, [x15, #0xe0]\n"
+    "fmla z31.s, p3/M, z2.s, z10.s\n"
+    "ld1w { z4.s }, p3/Z, [x16, #-2, MUL VL]\n"
+    "fmla z30.s, p3/M, z2.s, z11.s\n"
+    "ldr x19, [x15, #0xe8]\n"
+    "fmla z29.s, p3/M, z2.s, z5.s\n"
+    "ld1w { z8.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z2.s, z6.s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z10.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z12.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z3.s, z6.s\n"
+    "ld1w { z0.s }, p3/Z, [x16, #-1, MUL VL]\n"
+    "fmla z28.s, p3/M, z3.s, z8.s\n"
+    "ldr x20, [x15, #0xf0]\n"
+    "fmla z31.s, p3/M, z4.s, z12.s\n"
+    "ldr x19, [x15, #0xf8]\n"
+    "fmla z30.s, p3/M, z4.s, z14.s\n"
+    "ld1w { z1.s }, p3/Z, [x16]\n"
+    "fmla z29.s, p3/M, z4.s, z8.s\n"
+    "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z4.s, z10.s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla z31.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z0.s, z13.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z0.s, z11.s\n"
+    "ldr x26, [x15, #0x100]\n"
+    "fmla z28.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z2.s }, p3/Z, [x16, #1, MUL VL]\n"
+    "fmla z31.s, p3/M, z1.s, z13.s\n"
+    "ldr x25, [x15, #0x108]\n"
+    "fmla z30.s, p3/M, z1.s, z5.s\n"
+    "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x26, x14]\n"
+    "fmla z28.s, p3/M, z1.s, z9.s\n"
+    "ld1w { z11.s }, p2/Z, [x25, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z2.s, z5.s\n"
+    "prfm pldl1keep, [x25, x14]\n"
+    "fmla z30.s, p3/M, z2.s, z6.s\n"
+    "ld1w { z3.s }, p3/Z, [x16, #2, MUL VL]\n"
+    "fmla z29.s, p3/M, z2.s, z9.s\n"
+    "ldr x24, [x15, #0x110]\n"
+    "ld1w { z4.s }, p3/Z, [x16, #3, MUL VL]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "ldr x23, [x15, #0x118]\n"
+    "fmla z31.s, p3/M, z3.s, z6.s\n"
+    "fmla z30.s, p3/M, z3.s, z8.s\n"
+    "ld1w { z12.s }, p2/Z, [x24, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z3.s, z11.s\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "ld1w { z9.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "incw x13\n"
+    "fmla z31.s, p3/M, z4.s, z8.s\n"
+    "ldp x26, x25, [x15, #0x0]\n"
+    "whilelt p2.s, x13, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "ldp x24, x23, [x15, #0x10]\n"
+    "addvl x14, x14, #1\n"
+    "fmla z29.s, p3/M, z4.s, z12.s\n"
+    "ldp x20, x19, [x15, #0x20]\n"
+    "ldp x22, x21, [x15, #0x30]\n"
+    "fmla z28.s, p3/M, z4.s, z9.s\n"
+    "ld1w { z5.s }, p1/Z, [x26, x10, LSL #2]\n"
+    "fmax z31.s, p3/M, z31.s, z18.s\n"
+    "prfm pldl1keep, [x26, x14]\n"
+    "fmax z30.s, p3/M, z30.s, z18.s\n"
+    "ld1w { z6.s }, p1/Z, [x25, x10, LSL #2]\n"
+    "fmax z29.s, p3/M, z29.s, z18.s\n"
+    "prfm pldl1keep, [x25, x14]\n"
+    "fmax z28.s, p3/M, z28.s, z18.s\n"
+    "ld1w { z7.s }, p1/Z, [x24, x10, LSL #2]\n"
+    "fmin z31.s, p3/M, z31.s, z17.s\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "fmin z30.s, p3/M, z30.s, z17.s\n"
+    "ld1w { z8.s }, p1/Z, [x23, x10, LSL #2]\n"
+    "fmin z29.s, p3/M, z29.s, z17.s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmin z28.s, p3/M, z28.s, z17.s\n"
+    "ld1w { z9.s }, p1/Z, [x20, x10, LSL #2]\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "ld1w { z13.s }, p1/Z, [x19, x10, LSL #2]\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "ld1w { z11.s }, p1/Z, [x22, x10, LSL #2]\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "ld1w { z12.s }, p1/Z, [x21, x10, LSL #2]\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "ldp x20, x19, [x15, #0x40]\n"
+    "st1w { z31.s }, p0, [x12, x27, LSL #2]\n"
+    "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
+    "ld1w { z10.s }, p1/Z, [x20, x10, LSL #2]\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "ld1w { z14.s }, p1/Z, [x19, x10, LSL #2]\n"
+    "incw x10\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "cmp x10, %x[n_channels]\n"
+    "st1w { z29.s }, p0, [x9, x27, LSL #2]\n"
+    "st1w { z28.s }, p0, [x28, x27, LSL #2]\n"
+    "ld1w { z16.s }, p3/Z, [x16, #4, MUL VL]\n"
+    "mov z31.d, z16.d\n"
+    "ld1w { z0.s }, p3/Z, [x16, #5, MUL VL]\n"
+    "mov z30.d, z16.d\n"
+    "ld1w { z1.s }, p3/Z, [x16, #6, MUL VL]\n"
+    "mov z29.d, z16.d\n"
+    "ld1w { z2.s }, p3/Z, [x16, #7, MUL VL]\n"
+    "addvl x16, x16, #16\n"
+    "mov z28.d, z16.d\n"
+    "ld1w { z3.s }, p3/Z, [x16, #-8, MUL VL]\n"
+    "ld1w { z4.s }, p3/Z, [x16, #-7, MUL VL]\n"
+    "addvl x16, x16, #-6\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "fmla z31.s, p3/M, z0.s, z5.s\n"
+    "ldr x21, [x15, #0x50]\n"
+    "incw x27\n"
+    "fmla z30.s, p3/M, z0.s, z6.s\n"
+    "ldr x19, [x15, #0x58]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z29.s, p3/M, z0.s, z7.s\n"
+    "ldr x20, [x15, #0x60]\n"
+    "fmla z28.s, p3/M, z0.s, z8.s\n"
+    "ld1w { z5.s }, p2/Z, [x21, x13, LSL #2]\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "fmla z31.s, p3/M, z1.s, z6.s\n"
+    "ld1w { z6.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z9.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z1.s, z8.s\n"
+    "fmla z28.s, p3/M, z1.s, z13.s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "ld1w { z0.s }, p3/Z, [x16]\n"
+    "fmla z31.s, p3/M, z2.s, z9.s\n"
+    "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z11.s\n"
+    "ldr x19, [x15, #0x68]\n"
+    "fmla z29.s, p3/M, z2.s, z13.s\n"
+    "fmla z28.s, p3/M, z2.s, z5.s\n"
+    "ld1w { z1.s }, p3/Z, [x16, #1, MUL VL]\n"
+    "ldr x20, [x15, #0x70]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z12.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z3.s, z5.s\n"
+    "fmla z28.s, p3/M, z3.s, z6.s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "ld1w { z2.s }, p3/Z, [x16, #2, MUL VL]\n"
+    "fmla z31.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z9.s\n"
+    "ldr x19, [x15, #0x78]\n"
+    "fmla z29.s, p3/M, z4.s, z6.s\n"
+    "fmla z28.s, p3/M, z4.s, z10.s\n"
+    "ld1w { z3.s }, p3/Z, [x16, #3, MUL VL]\n"
+    "ldr x26, [x15, #0x80]\n"
+    "fmla z31.s, p3/M, z0.s, z7.s\n"
+    "ld1w { z9.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z0.s, z8.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z0.s, z14.s\n"
+    "fmla z28.s, p3/M, z0.s, z11.s\n"
+    "prfm pldl1keep, [x26, x14]\n"
+    "ld1w { z4.s }, p3/Z, [x16, #4, MUL VL]\n"
+    "fmla z31.s, p3/M, z1.s, z8.s\n"
+    "ldr x25, [x15, #0x88]\n"
+    "fmla z30.s, p3/M, z1.s, z13.s\n"
+    "ld1w { z0.s }, p3/Z, [x16, #5, MUL VL]\n"
+    "fmla z29.s, p3/M, z1.s, z11.s\n"
+    "fmla z28.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z8.s }, p2/Z, [x25, x13, LSL #2]\n"
+    "prfm pldl1keep, [x25, x14]\n"
+    "fmla z31.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z5.s\n"
+    "ldr x24, [x15, #0x90]\n"
+    "fmla z29.s, p3/M, z2.s, z12.s\n"
+    "fmla z28.s, p3/M, z2.s, z9.s\n"
+    "ldr x23, [x15, #0x98]\n"
+    "ld1w { z1.s }, p3/Z, [x16, #6, MUL VL]\n"
+    "fmla z31.s, p3/M, z3.s, z5.s\n"
+    "ld1w { z5.s }, p2/Z, [x24, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z6.s\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "fmla z29.s, p3/M, z3.s, z9.s\n"
+    "fmla z28.s, p3/M, z3.s, z13.s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "ldr x20, [x15, #0xa0]\n"
+    "fmla z31.s, p3/M, z4.s, z6.s\n"
+    "ld1w { z6.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "ld1w { z2.s }, p3/Z, [x16, #7, MUL VL]\n"
+    "fmla z29.s, p3/M, z4.s, z13.s\n"
+    "addvl x16, x16, #16\n"
+    "fmla z28.s, p3/M, z4.s, z8.s\n"
+    "ld1w { z10.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla z31.s, p3/M, z0.s, z14.s\n"
+    "ldr x19, [x15, #0xa8]\n"
+    "fmla z30.s, p3/M, z0.s, z11.s\n"
+    "ld1w { z3.s }, p3/Z, [x16, #-8, MUL VL]\n"
+    "fmla z29.s, p3/M, z0.s, z5.s\n"
+    "ldr x22, [x15, #0xb0]\n"
+    "fmla z28.s, p3/M, z0.s, z6.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z31.s, p3/M, z1.s, z11.s\n"
+    "ld1w { z11.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "fmla z29.s, p3/M, z1.s, z6.s\n"
+    "ld1w { z4.s }, p3/Z, [x16, #-7, MUL VL]\n"
+    "fmla z28.s, p3/M, z1.s, z10.s\n"
+    "ldr x21, [x15, #0xb8]\n"
+    "fmla z31.s, p3/M, z2.s, z12.s\n"
+    "ld1w { z12.s }, p2/Z, [x22, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z2.s, z9.s\n"
+    "ld1w { z0.s }, p3/Z, [x16, #-6, MUL VL]\n"
+    "fmla z29.s, p3/M, z2.s, z10.s\n"
+    "ld1w { z14.s }, p2/Z, [x21, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "fmla z31.s, p3/M, z3.s, z9.s\n"
+    "ldr x20, [x15, #0xc0]\n"
+    "fmla z30.s, p3/M, z3.s, z13.s\n"
+    "ldr x19, [x15, #0xc8]\n"
+    "fmla z29.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z1.s }, p3/Z, [x16, #-5, MUL VL]\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z4.s, z13.s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla z30.s, p3/M, z4.s, z8.s\n"
+    "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z4.s, z12.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z28.s, p3/M, z4.s, z14.s\n"
+    "ldr x21, [x15, #0xd0]\n"
+    "fmla z31.s, p3/M, z0.s, z5.s\n"
+    "ld1w { z2.s }, p3/Z, [x16, #-4, MUL VL]\n"
+    "fmla z30.s, p3/M, z0.s, z6.s\n"
+    "ldr x19, [x15, #0xd8]\n"
+    "fmla z29.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z5.s }, p2/Z, [x21, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z0.s, z13.s\n"
+    "prfm pldl1keep, [x21, x14]\n"
+    "fmla z31.s, p3/M, z1.s, z6.s\n"
+    "ld1w { z6.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z10.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z1.s, z13.s\n"
+    "ld1w { z3.s }, p3/Z, [x16, #-3, MUL VL]\n"
+    "fmla z28.s, p3/M, z1.s, z5.s\n"
+    "ldr x20, [x15, #0xe0]\n"
+    "fmla z31.s, p3/M, z2.s, z10.s\n"
+    "ld1w { z4.s }, p3/Z, [x16, #-2, MUL VL]\n"
+    "fmla z30.s, p3/M, z2.s, z11.s\n"
+    "ldr x19, [x15, #0xe8]\n"
+    "fmla z29.s, p3/M, z2.s, z5.s\n"
+    "ld1w { z8.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z2.s, z6.s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla z31.s, p3/M, z3.s, z11.s\n"
+    "ld1w { z10.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z12.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z3.s, z6.s\n"
+    "ld1w { z0.s }, p3/Z, [x16, #-1, MUL VL]\n"
+    "fmla z28.s, p3/M, z3.s, z8.s\n"
+    "ldr x20, [x15, #0xf0]\n"
+    "fmla z31.s, p3/M, z4.s, z12.s\n"
+    "ldr x19, [x15, #0xf8]\n"
+    "fmla z30.s, p3/M, z4.s, z14.s\n"
+    "ld1w { z1.s }, p3/Z, [x16]\n"
+    "fmla z29.s, p3/M, z4.s, z8.s\n"
+    "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z4.s, z10.s\n"
+    "prfm pldl1keep, [x20, x14]\n"
+    "fmla z31.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z0.s, z13.s\n"
+    "prfm pldl1keep, [x19, x14]\n"
+    "fmla z29.s, p3/M, z0.s, z11.s\n"
+    "ldr x26, [x15, #0x100]\n"
+    "fmla z28.s, p3/M, z0.s, z12.s\n"
+    "ld1w { z2.s }, p3/Z, [x16, #1, MUL VL]\n"
+    "fmla z31.s, p3/M, z1.s, z13.s\n"
+    "ldr x25, [x15, #0x108]\n"
+    "fmla z30.s, p3/M, z1.s, z5.s\n"
+    "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "prfm pldl1keep, [x26, x14]\n"
+    "fmla z28.s, p3/M, z1.s, z9.s\n"
+    "ld1w { z11.s }, p2/Z, [x25, x13, LSL #2]\n"
+    "fmla z31.s, p3/M, z2.s, z5.s\n"
+    "prfm pldl1keep, [x25, x14]\n"
+    "fmla z30.s, p3/M, z2.s, z6.s\n"
+    "ld1w { z3.s }, p3/Z, [x16, #2, MUL VL]\n"
+    "fmla z29.s, p3/M, z2.s, z9.s\n"
+    "ldr x24, [x15, #0x110]\n"
+    "ld1w { z4.s }, p3/Z, [x16, #3, MUL VL]\n"
+    "fmla z28.s, p3/M, z2.s, z11.s\n"
+    "ldr x23, [x15, #0x118]\n"
+    "fmla z31.s, p3/M, z3.s, z6.s\n"
+    "fmla z30.s, p3/M, z3.s, z8.s\n"
+    "ld1w { z12.s }, p2/Z, [x24, x13, LSL #2]\n"
+    "fmla z29.s, p3/M, z3.s, z11.s\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "ld1w { z9.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z28.s, p3/M, z3.s, z12.s\n"
+    "prfm pldl1keep, [x23, x14]\n"
+    "fmla z31.s, p3/M, z4.s, z8.s\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "fmla z29.s, p3/M, z4.s, z12.s\n"
+    "fmla z28.s, p3/M, z4.s, z9.s\n"
+    "fmax z31.s, p3/M, z31.s, z18.s\n"
+    "fmax z30.s, p3/M, z30.s, z18.s\n"
+    "fmax z29.s, p3/M, z29.s, z18.s\n"
+    "fmin z31.s, p3/M, z31.s, z17.s\n"
+    "st1w { z31.s }, p0, [x12, x27, LSL #2]\n"
+    "fmin z30.s, p3/M, z30.s, z17.s\n"
+    "fmin z29.s, p3/M, z29.s, z17.s\n"
+    "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
+    "fmax z28.s, p3/M, z28.s, z18.s\n"
+    "st1w { z29.s }, p0, [x9, x27, LSL #2]\n"
+    "fmin z28.s, p3/M, z28.s, z17.s\n"
+    "st1w { z28.s }, p0, [x28, x27, LSL #2]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000..dd2c519
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+
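+// Strategy descriptor for the templated depthwise driver: it records the
+// element types, the SVE vector-length model and the nine output points
+// computed per call, and points at the assembly implementation declared
+// above.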
+struct sve_fp32_nhwc_generic_output9_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int n_output_points = 9;
+
+  kern_type kernel = sve_fp32_nhwc_generic_output9_mla_depthfirst_impl;
+
+  sve_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..370218e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
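+// Generic (any kernel size) depthwise kernel: for each vector of channels it
+// walks n_points kernel positions, multiplying nine input pointers' worth of
+// data against the current weight vector and accumulating into nine output
+// points, which are clamped to [activation_min, activation_max] before
+// being stored.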
+void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
+  const float *const *const inptrs,
+  float *const *const outptrs,
+  const void *params,
+  const void *bias,
+  const unsigned int n_points,
+  const unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  const float minmax_vals[2] = { activation_min, activation_max };
+
+  __asm__ __volatile__(
+    "ptrue p1.b\n"
+    "ld1rw { z4.s }, p1/Z, [%x[minmax_vals]]\n"
+    "mov x28, #0x0\n"
+    "ld1rw { z3.s }, p1/Z, [%x[minmax_vals], #4]\n"
+    "whilelt p0.s, x28, %x[n_channels]\n"
+    "1:"  // Channel loop
+    "mov z2.b, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "ld1w { z2.s }, p0/Z, [%x[bias], x28, LSL #2]\n"
+    "2:"  // Channel loop: Load bias: Done
+    "mov z1.d, z2.d\n"
+    "ld1w { z0.s }, p1/Z, [%x[params]]\n"
+    "mov x22, %x[inptrs]\n"
+    "mov z31.d, z2.d\n"
+    "ldp x20, x19, [x22], #0x10\n"
+    "subs x21, %x[n_points], #0x1\n"
+    "mov z30.d, z2.d\n"
+    "ld1w { z29.s }, p0/Z, [x20, x28, LSL #2]\n"
+    "mov z28.d, z2.d\n"
+    "addvl %x[params], %x[params], #1\n"
+    "mov z27.d, z2.d\n"
+    "ld1w { z26.s }, p0/Z, [x19, x28, LSL #2]\n"
+    "mov z25.d, z2.d\n"
+    "ldp x20, x19, [x22], #0x10\n"
+    "mov z24.d, z2.d\n"
+    "ld1w { z23.s }, p0/Z, [x20, x28, LSL #2]\n"
+    "mov z22.d, z2.d\n"
+    "ld1w { z21.s }, p0/Z, [x19, x28, LSL #2]\n"
+    "ldp x20, x19, [x22], #0x10\n"
+    "ld1w { z20.s }, p0/Z, [x20, x28, LSL #2]\n"
+    "ld1w { z19.s }, p0/Z, [x19, x28, LSL #2]\n"
+    "ldp x20, x19, [x22], #0x10\n"
+    "ld1w { z18.s }, p0/Z, [x20, x28, LSL #2]\n"
+    "ld1w { z17.s }, p0/Z, [x19, x28, LSL #2]\n"
+    "ldr x19, [x22], #0x8\n"
+    "ld1w { z16.s }, p0/Z, [x19, x28, LSL #2]\n"
+    "ble 4f\n"
+    "3:"  // Channel loop: Planar loop
+    "fmla z2.s, p1/M, z29.s, z0.s\n"
+    "ldp x20, x19, [x22], #0x10\n"
+    "subs x21, x21, #0x1\n"
+    "fmla z1.s, p1/M, z26.s, z0.s\n"
+    "ld1w { z29.s }, p0/Z, [x20, x28, LSL #2]\n"
+    "fmla z31.s, p1/M, z23.s, z0.s\n"
+    "fmla z30.s, p1/M, z21.s, z0.s\n"
+    "ld1w { z26.s }, p0/Z, [x19, x28, LSL #2]\n"
+    "fmla z28.s, p1/M, z20.s, z0.s\n"
+    "ldp x20, x19, [x22], #0x10\n"
+    "fmla z27.s, p1/M, z19.s, z0.s\n"
+    "ld1w { z23.s }, p0/Z, [x20, x28, LSL #2]\n"
+    "fmla z25.s, p1/M, z18.s, z0.s\n"
+    "fmla z24.s, p1/M, z17.s, z0.s\n"
+    "ld1w { z21.s }, p0/Z, [x19, x28, LSL #2]\n"
+    "fmla z22.s, p1/M, z16.s, z0.s\n"
+    "ld1w { z0.s }, p1/Z, [%x[params]]\n"
+    "addvl %x[params], %x[params], #1\n"
+    "ldp x20, x19, [x22], #0x10\n"
+    "ld1w { z20.s }, p0/Z, [x20, x28, LSL #2]\n"
+    "ld1w { z19.s }, p0/Z, [x19, x28, LSL #2]\n"
+    "ldp x20, x19, [x22], #0x10\n"
+    "ld1w { z18.s }, p0/Z, [x20, x28, LSL #2]\n"
+    "ld1w { z17.s }, p0/Z, [x19, x28, LSL #2]\n"
+    "ldr x19, [x22], #0x8\n"
+    "ld1w { z16.s }, p0/Z, [x19, x28, LSL #2]\n"
+    "bgt 3b\n"
+    "4:"  // Channel loop: Planar tail
+    "fmla z2.s, p1/M, z29.s, z0.s\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    "fmla z1.s, p1/M, z26.s, z0.s\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "fmla z31.s, p1/M, z23.s, z0.s\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    "fmla z30.s, p1/M, z21.s, z0.s\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    "fmla z28.s, p1/M, z20.s, z0.s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "fmla z27.s, p1/M, z19.s, z0.s\n"
+    "fmla z25.s, p1/M, z18.s, z0.s\n"
+    "fmla z24.s, p1/M, z17.s, z0.s\n"
+    "fmla z22.s, p1/M, z16.s, z0.s\n"
+    "fmax z2.s, p1/M, z2.s, z4.s\n"
+    "fmax z1.s, p1/M, z1.s, z4.s\n"
+    "fmax z31.s, p1/M, z31.s, z4.s\n"
+    "fmax z30.s, p1/M, z30.s, z4.s\n"
+    "fmin z2.s, p1/M, z2.s, z3.s\n"
+    "st1w { z2.s }, p0, [x27, x28, LSL #2]\n"
+    "fmin z1.s, p1/M, z1.s, z3.s\n"
+    "fmin z31.s, p1/M, z31.s, z3.s\n"
+    "st1w { z1.s }, p0, [x26, x28, LSL #2]\n"
+    "fmin z30.s, p1/M, z30.s, z3.s\n"
+    "fmax z28.s, p1/M, z28.s, z4.s\n"
+    "st1w { z31.s }, p0, [x25, x28, LSL #2]\n"
+    "fmax z27.s, p1/M, z27.s, z4.s\n"
+    "st1w { z30.s }, p0, [x24, x28, LSL #2]\n"
+    "fmin z28.s, p1/M, z28.s, z3.s\n"
+    "fmax z25.s, p1/M, z25.s, z4.s\n"
+    "st1w { z28.s }, p0, [x23, x28, LSL #2]\n"
+    "fmin z27.s, p1/M, z27.s, z3.s\n"
+    "fmin z25.s, p1/M, z25.s, z3.s\n"
+    "st1w { z27.s }, p0, [x22, x28, LSL #2]\n"
+    "fmax z24.s, p1/M, z24.s, z4.s\n"
+    "fmax z22.s, p1/M, z22.s, z4.s\n"
+    "st1w { z25.s }, p0, [x21, x28, LSL #2]\n"
+    "fmin z24.s, p1/M, z24.s, z3.s\n"
+    "st1w { z24.s }, p0, [x20, x28, LSL #2]\n"
+    "fmin z22.s, p1/M, z22.s, z3.s\n"
+    "st1w { z22.s }, p0, [x19, x28, LSL #2]\n"
+    "incw x28\n"
+    "whilelt p0.s, x28, %x[n_channels]\n"
+    "b.any 1b\n"
+    : [params] "+&r" (params)
+    : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+    : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000..5cf3314
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 3;
+  constexpr static unsigned int output_cols = 3;
+
+  constexpr static unsigned int input_rows = 7;
+  constexpr static unsigned int input_cols = 7;
+  constexpr static unsigned int input_col_quads = 2;
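+
+  // The tile geometry follows input = stride * (output - 1) + kernel:
+  // 2 * (3 - 1) + 3 = 7 rows and columns, matching input_rows/input_cols
+  // above. input_col_quads appears to count the 128-bit quads needed per
+  // input row (ceil(7 / 4) = 2, matching the two ld1rqw loads per row in
+  // the implementation).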
+
+  kern_type kernel = sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+
+  sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..ce640a2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
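+// Channel-multiplier kernel: the pre-packed 7x7 input tile is loaded as
+// replicated 128-bit quads (ld1rqw) and combined with indexed fmla, so every
+// SVE lane computes a different output-channel multiple of the same input
+// pixel. z16-z24 hold the 3x3 grid of output accumulators, which are
+// clamped before the predicated stores.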
+void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(
+  const float *const *const inptrs,
+  float *const *const outptrs,
+  const void *params,
+  const unsigned int n_output_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  const float minmax_vals[2] = { activation_min, activation_max };
+
+  __asm__ __volatile__(
+    "ldp x12, x11, [%x[outptrs], #0x0]\n"
+    "ptrue p2.b\n"
+    "ldp x10, x9, [%x[outptrs], #0x10]\n"
+    "mov x28, #0x0\n"
+    "ldp x27, x26, [%x[outptrs], #0x20]\n"
+    "mov x25, #0x0\n"
+    "ldp x24, x23, [%x[outptrs], #0x30]\n"
+    "whilelt p1.s, x28, %x[channel_multiplier]\n"
+    "ldr x22, [%x[outptrs], #0x40]\n"
+    "ldr x21, [%x[inptrs], #0x0]\n"
+    "ldr x20, [%x[inptrs], #0x8]\n"
+    "ldr x19, [%x[inptrs], #0x10]\n"
+    "ld1rqw { z2.s }, p2/Z, [x21]\n"
+    "ld1rqw { z3.s }, p2/Z, [x21, #16]\n"
+    "ld1rqw { z4.s }, p2/Z, [x20]\n"
+    "ld1rqw { z5.s }, p2/Z, [x20, #16]\n"
+    "ld1rqw { z6.s }, p2/Z, [x19]\n"
+    "ld1rqw { z7.s }, p2/Z, [x19, #16]\n"
+    "ldr x21, [%x[inptrs], #0x18]\n"
+    "ldr x20, [%x[inptrs], #0x20]\n"
+    "ldr x19, [%x[inptrs], #0x28]\n"
+    "ld1rqw { z8.s }, p2/Z, [x21]\n"
+    "ld1rqw { z9.s }, p2/Z, [x21, #16]\n"
+    "ld1rqw { z10.s }, p2/Z, [x20]\n"
+    "ld1rqw { z11.s }, p2/Z, [x20, #16]\n"
+    "ld1rqw { z12.s }, p2/Z, [x19]\n"
+    "ld1rqw { z13.s }, p2/Z, [x19, #16]\n"
+    "ldr x19, [%x[inptrs], #0x30]\n"
+    "ld1rw { z26.s }, p2/Z, [%x[clamps]]\n"
+    "ld1rw { z25.s }, p2/Z, [%x[clamps], #4]\n"
+    "ld1rqw { z14.s }, p2/Z, [x19]\n"
+    "ld1rqw { z15.s }, p2/Z, [x19, #16]\n"
+    "ld1w { z24.s }, p1/Z, [%x[params]]\n"
+    "mov z23.d, z24.d\n"
+    "ld1w { z31.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+    "mov z22.d, z24.d\n"
+    "ld1w { z30.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+    "mov z21.d, z24.d\n"
+    "ld1w { z29.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+    "addvl %x[params], %x[params], #4\n"
+    "mov z20.d, z24.d\n"
+    "mov z19.d, z24.d\n"
+    "mov z18.d, z24.d\n"
+    "mov z17.d, z24.d\n"
+    "mov z16.d, z24.d\n"
+    "1:"  // Output channel complete vector loop
+    "mov z0.d, z10.d\n"
+    "mov p0.b, p1.b\n"
+    "mov z1.d, z11.d\n"
+    "incw x28\n"
+    "fmla z24.s, z31.s, z2.s[0]\n"
+    "whilelt p1.s, x28, %x[channel_multiplier]\n"
+    "fmla z23.s, z31.s, z2.s[2]\n"
+    "fmla z22.s, z31.s, z3.s[0]\n"
+    "fmla z21.s, z31.s, z6.s[0]\n"
+    "fmla z20.s, z31.s, z6.s[2]\n"
+    "fmla z19.s, z31.s, z7.s[0]\n"
+    "fmla z18.s, z31.s, z0.s[0]\n"
+    "fmla z17.s, z31.s, z0.s[2]\n"
+    "fmla z16.s, z31.s, z1.s[0]\n"
+    "ld1w { z31.s }, p2/Z, [%x[params]]\n"
+    "fmla z24.s, z30.s, z2.s[1]\n"
+    "fmla z23.s, z30.s, z2.s[3]\n"
+    "fmla z22.s, z30.s, z3.s[1]\n"
+    "fmla z21.s, z30.s, z6.s[1]\n"
+    "fmla z20.s, z30.s, z6.s[3]\n"
+    "fmla z19.s, z30.s, z7.s[1]\n"
+    "fmla z18.s, z30.s, z0.s[1]\n"
+    "fmla z17.s, z30.s, z0.s[3]\n"
+    "fmla z16.s, z30.s, z1.s[1]\n"
+    "ld1w { z30.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "fmla z24.s, z29.s, z2.s[2]\n"
+    "fmla z23.s, z29.s, z3.s[0]\n"
+    "fmla z22.s, z29.s, z3.s[2]\n"
+    "fmla z21.s, z29.s, z6.s[2]\n"
+    "fmla z20.s, z29.s, z7.s[0]\n"
+    "fmla z19.s, z29.s, z7.s[2]\n"
+    "fmla z18.s, z29.s, z0.s[2]\n"
+    "mov z0.d, z8.d\n"
+    "fmla z17.s, z29.s, z1.s[0]\n"
+    "fmla z16.s, z29.s, z1.s[2]\n"
+    "ld1w { z29.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "mov z1.d, z9.d\n"
+    "fmla z24.s, z31.s, z4.s[0]\n"
+    "fmla z23.s, z31.s, z4.s[2]\n"
+    "fmla z22.s, z31.s, z5.s[0]\n"
+    "fmla z21.s, z31.s, z0.s[0]\n"
+    "fmla z20.s, z31.s, z0.s[2]\n"
+    "mov z0.d, z12.d\n"
+    "fmla z19.s, z31.s, z1.s[0]\n"
+    "mov z1.d, z13.d\n"
+    "fmla z18.s, z31.s, z0.s[0]\n"
+    "fmla z17.s, z31.s, z0.s[2]\n"
+    "mov z0.d, z8.d\n"
+    "fmla z16.s, z31.s, z1.s[0]\n"
+    "ld1w { z31.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "mov z1.d, z9.d\n"
+    "fmla z24.s, z30.s, z4.s[1]\n"
+    "fmla z23.s, z30.s, z4.s[3]\n"
+    "fmla z22.s, z30.s, z5.s[1]\n"
+    "fmla z21.s, z30.s, z0.s[1]\n"
+    "fmla z20.s, z30.s, z0.s[3]\n"
+    "mov z0.d, z12.d\n"
+    "fmla z19.s, z30.s, z1.s[1]\n"
+    "mov z1.d, z13.d\n"
+    "fmla z18.s, z30.s, z0.s[1]\n"
+    "fmla z17.s, z30.s, z0.s[3]\n"
+    "mov z0.d, z8.d\n"
+    "fmla z16.s, z30.s, z1.s[1]\n"
+    "ld1w { z30.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "mov z1.d, z9.d\n"
+    "fmla z24.s, z29.s, z4.s[2]\n"
+    "fmla z23.s, z29.s, z5.s[0]\n"
+    "fmla z22.s, z29.s, z5.s[2]\n"
+    "fmla z21.s, z29.s, z0.s[2]\n"
+    "mov z0.d, z12.d\n"
+    "fmla z20.s, z29.s, z1.s[0]\n"
+    "fmla z19.s, z29.s, z1.s[2]\n"
+    "mov z1.d, z13.d\n"
+    "fmla z18.s, z29.s, z0.s[2]\n"
+    "mov z0.d, z10.d\n"
+    "fmla z17.s, z29.s, z1.s[0]\n"
+    "fmla z16.s, z29.s, z1.s[2]\n"
+    "ld1w { z29.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "mov z1.d, z11.d\n"
+    "fmla z24.s, z31.s, z6.s[0]\n"
+    "fmla z23.s, z31.s, z6.s[2]\n"
+    "fmla z22.s, z31.s, z7.s[0]\n"
+    "fmla z21.s, z31.s, z0.s[0]\n"
+    "fmla z20.s, z31.s, z0.s[2]\n"
+    "mov z0.d, z14.d\n"
+    "fmla z19.s, z31.s, z1.s[0]\n"
+    "mov z1.d, z15.d\n"
+    "fmla z18.s, z31.s, z0.s[0]\n"
+    "fmla z17.s, z31.s, z0.s[2]\n"
+    "mov z0.d, z10.d\n"
+    "fmla z16.s, z31.s, z1.s[0]\n"
+    "ld1w { z31.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+    "mov z1.d, z11.d\n"
+    "fmla z24.s, z30.s, z6.s[1]\n"
+    "fmla z23.s, z30.s, z6.s[3]\n"
+    "fmla z22.s, z30.s, z7.s[1]\n"
+    "fmla z21.s, z30.s, z0.s[1]\n"
+    "fmla z20.s, z30.s, z0.s[3]\n"
+    "mov z0.d, z14.d\n"
+    "fmla z19.s, z30.s, z1.s[1]\n"
+    "mov z1.d, z15.d\n"
+    "fmla z18.s, z30.s, z0.s[1]\n"
+    "fmla z17.s, z30.s, z0.s[3]\n"
+    "mov z0.d, z10.d\n"
+    "fmla z16.s, z30.s, z1.s[1]\n"
+    "mov z1.d, z11.d\n"
+    "fmla z24.s, z29.s, z6.s[2]\n"
+    "fmla z23.s, z29.s, z7.s[0]\n"
+    "fmla z22.s, z29.s, z7.s[2]\n"
+    "fmla z21.s, z29.s, z0.s[2]\n"
+    "mov z0.d, z14.d\n"
+    "fmla z20.s, z29.s, z1.s[0]\n"
+    "fmla z19.s, z29.s, z1.s[2]\n"
+    "mov z1.d, z15.d\n"
+    "fmla z18.s, z29.s, z0.s[2]\n"
+    "fmla z17.s, z29.s, z1.s[0]\n"
+    "fmla z16.s, z29.s, z1.s[2]\n"
+    "fmin z24.s, p2/M, z24.s, z25.s\n"
+    "fmin z23.s, p2/M, z23.s, z25.s\n"
+    "fmin z22.s, p2/M, z22.s, z25.s\n"
+    "fmin z21.s, p2/M, z21.s, z25.s\n"
+    "fmax z24.s, p2/M, z24.s, z26.s\n"
+    "st1w { z24.s }, p0, [x12, x25, LSL #2]\n"
+    "fmax z23.s, p2/M, z23.s, z26.s\n"
+    "fmax z22.s, p2/M, z22.s, z26.s\n"
+    "ld1w { z24.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "fmax z21.s, p2/M, z21.s, z26.s\n"
+    "ld1w { z30.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+    "fmin z20.s, p2/M, z20.s, z25.s\n"
+    "ld1w { z29.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+    "addvl %x[params], %x[params], #-6\n"
+    "fmin z19.s, p2/M, z19.s, z25.s\n"
+    "st1w { z23.s }, p0, [x11, x25, LSL #2]\n"
+    "mov z23.d, z24.d\n"
+    "st1w { z22.s }, p0, [x10, x25, LSL #2]\n"
+    "mov z22.d, z24.d\n"
+    "st1w { z21.s }, p0, [x9, x25, LSL #2]\n"
+    "mov z21.d, z24.d\n"
+    "fmax z20.s, p2/M, z20.s, z26.s\n"
+    "st1w { z20.s }, p0, [x27, x25, LSL #2]\n"
+    "mov z20.d, z24.d\n"
+    "fmax z19.s, p2/M, z19.s, z26.s\n"
+    "st1w { z19.s }, p0, [x26, x25, LSL #2]\n"
+    "mov z19.d, z24.d\n"
+    "fmin z18.s, p2/M, z18.s, z25.s\n"
+    "fmin z17.s, p2/M, z17.s, z25.s\n"
+    "fmin z16.s, p2/M, z16.s, z25.s\n"
+    "fmax z18.s, p2/M, z18.s, z26.s\n"
+    "st1w { z18.s }, p0, [x24, x25, LSL #2]\n"
+    "mov z18.d, z24.d\n"
+    "fmax z17.s, p2/M, z17.s, z26.s\n"
+    "st1w { z17.s }, p0, [x23, x25, LSL #2]\n"
+    "mov z17.d, z24.d\n"
+    "fmax z16.s, p2/M, z16.s, z26.s\n"
+    "st1w { z16.s }, p0, [x22, x25, LSL #2]\n"
+    "mov z16.d, z24.d\n"
+    "incw x25\n"
+    "b.any 1b\n"
+    : [params] "+&r" (params)
+    : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+    : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
new file mode 100644
index 0000000..3c2f771
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 4;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 8;
+  constexpr static unsigned int input_col_quads = 2;
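+
+  // input = stride * (output - 1) + kernel gives 1 * (2 - 1) + 5 = 6 rows
+  // and 1 * (4 - 1) + 5 = 8 columns, matching input_rows/input_cols above
+  // (eight columns = two 128-bit quads per row).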
+
+  kern_type kernel = sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+
+  sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..453b00c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
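+// Same packed channel-multiplier scheme as the 3x3/s2 variant above:
+// quad-loaded inputs combined with indexed fmla, here accumulating a 2x4
+// grid of outputs in z16-z23 against a 5x5 filter before clamping and
+// storing.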
+void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(
+  const float *const *const inptrs,
+  float *const *const outptrs,
+  const void *params,
+  const unsigned int n_output_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  const float minmax_vals[2] = { activation_min, activation_max };
+
+  __asm__ __volatile__(
+    "ldp x11, x10, [%x[outptrs], #0x0]\n"
+    "ptrue p2.b\n"
+    "ldp x9, x28, [%x[outptrs], #0x10]\n"
+    "mov x27, #0x0\n"
+    "ldp x26, x25, [%x[outptrs], #0x20]\n"
+    "mov x24, #0x0\n"
+    "ldp x23, x22, [%x[outptrs], #0x30]\n"
+    "whilelt p1.s, x27, %x[channel_multiplier]\n"
+    "ldr x21, [%x[inptrs], #0x0]\n"
+    "ldr x20, [%x[inptrs], #0x8]\n"
+    "ldr x19, [%x[inptrs], #0x10]\n"
+    "ld1rqw { z2.s }, p2/Z, [x21]\n"
+    "ld1rqw { z3.s }, p2/Z, [x21, #16]\n"
+    "ld1rqw { z4.s }, p2/Z, [x20]\n"
+    "ld1rqw { z5.s }, p2/Z, [x20, #16]\n"
+    "ld1rqw { z6.s }, p2/Z, [x19]\n"
+    "ld1rqw { z7.s }, p2/Z, [x19, #16]\n"
+    "ldr x21, [%x[inptrs], #0x18]\n"
+    "ldr x20, [%x[inptrs], #0x20]\n"
+    "ldr x19, [%x[inptrs], #0x28]\n"
+    "ld1rqw { z8.s }, p2/Z, [x21]\n"
+    "ld1rqw { z9.s }, p2/Z, [x21, #16]\n"
+    "ld1rqw { z10.s }, p2/Z, [x20]\n"
+    "ld1rqw { z11.s }, p2/Z, [x20, #16]\n"
+    "ld1rqw { z12.s }, p2/Z, [x19]\n"
+    "ld1rqw { z13.s }, p2/Z, [x19, #16]\n"
+    "ld1rw { z25.s }, p2/Z, [%x[clamps]]\n"
+    "ld1rw { z24.s }, p2/Z, [%x[clamps], #4]\n"
+    "ld1w { z23.s }, p1/Z, [%x[params]]\n"
+    "mov z22.d, z23.d\n"
+    "ld1w { z31.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+    "mov z21.d, z23.d\n"
+    "ld1w { z30.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+    "mov z20.d, z23.d\n"
+    "ld1w { z29.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+    "mov z19.d, z23.d\n"
+    "ld1w { z28.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+    "mov z18.d, z23.d\n"
+    "ld1w { z27.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+    "addvl %x[params], %x[params], #6\n"
+    "mov z17.d, z23.d\n"
+    "mov z16.d, z23.d\n"
+    "1:"  // Output channel complete vector loop
+    "mov z0.d, z8.d\n"
+    "mov p0.b, p1.b\n"
+    "mov z1.d, z9.d\n"
+    "incw x27\n"
+    "fmla z23.s, z31.s, z2.s[0]\n"
+    "whilelt p1.s, x27, %x[channel_multiplier]\n"
+    "fmla z22.s, z31.s, z2.s[1]\n"
+    "fmla z21.s, z31.s, z2.s[2]\n"
+    "fmla z20.s, z31.s, z2.s[3]\n"
+    "fmla z19.s, z31.s, z4.s[0]\n"
+    "fmla z18.s, z31.s, z4.s[1]\n"
+    "fmla z17.s, z31.s, z4.s[2]\n"
+    "fmla z16.s, z31.s, z4.s[3]\n"
+    "ld1w { z31.s }, p2/Z, [%x[params]]\n"
+    "fmla z23.s, z30.s, z2.s[1]\n"
+    "fmla z22.s, z30.s, z2.s[2]\n"
+    "fmla z21.s, z30.s, z2.s[3]\n"
+    "fmla z20.s, z30.s, z3.s[0]\n"
+    "fmla z19.s, z30.s, z4.s[1]\n"
+    "fmla z18.s, z30.s, z4.s[2]\n"
+    "fmla z17.s, z30.s, z4.s[3]\n"
+    "fmla z16.s, z30.s, z5.s[0]\n"
+    "ld1w { z30.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "fmla z23.s, z29.s, z2.s[2]\n"
+    "fmla z22.s, z29.s, z2.s[3]\n"
+    "fmla z21.s, z29.s, z3.s[0]\n"
+    "fmla z20.s, z29.s, z3.s[1]\n"
+    "fmla z19.s, z29.s, z4.s[2]\n"
+    "fmla z18.s, z29.s, z4.s[3]\n"
+    "fmla z17.s, z29.s, z5.s[0]\n"
+    "fmla z16.s, z29.s, z5.s[1]\n"
+    "ld1w { z29.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "fmla z23.s, z28.s, z2.s[3]\n"
+    "fmla z22.s, z28.s, z3.s[0]\n"
+    "fmla z21.s, z28.s, z3.s[1]\n"
+    "fmla z20.s, z28.s, z3.s[2]\n"
+    "fmla z19.s, z28.s, z4.s[3]\n"
+    "fmla z18.s, z28.s, z5.s[0]\n"
+    "fmla z17.s, z28.s, z5.s[1]\n"
+    "fmla z16.s, z28.s, z5.s[2]\n"
+    "ld1w { z28.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "fmla z23.s, z27.s, z3.s[0]\n"
+    "fmla z22.s, z27.s, z3.s[1]\n"
+    "fmla z21.s, z27.s, z3.s[2]\n"
+    "fmla z20.s, z27.s, z3.s[3]\n"
+    "fmla z19.s, z27.s, z5.s[0]\n"
+    "fmla z18.s, z27.s, z5.s[1]\n"
+    "fmla z17.s, z27.s, z5.s[2]\n"
+    "fmla z16.s, z27.s, z5.s[3]\n"
+    "ld1w { z27.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "fmla z23.s, z31.s, z4.s[0]\n"
+    "fmla z22.s, z31.s, z4.s[1]\n"
+    "fmla z21.s, z31.s, z4.s[2]\n"
+    "fmla z20.s, z31.s, z4.s[3]\n"
+    "fmla z19.s, z31.s, z6.s[0]\n"
+    "fmla z18.s, z31.s, z6.s[1]\n"
+    "fmla z17.s, z31.s, z6.s[2]\n"
+    "fmla z16.s, z31.s, z6.s[3]\n"
+    "ld1w { z31.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "fmla z23.s, z30.s, z4.s[1]\n"
+    "fmla z22.s, z30.s, z4.s[2]\n"
+    "fmla z21.s, z30.s, z4.s[3]\n"
+    "fmla z20.s, z30.s, z5.s[0]\n"
+    "fmla z19.s, z30.s, z6.s[1]\n"
+    "fmla z18.s, z30.s, z6.s[2]\n"
+    "fmla z17.s, z30.s, z6.s[3]\n"
+    "fmla z16.s, z30.s, z7.s[0]\n"
+    "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "fmla z23.s, z29.s, z4.s[2]\n"
+    "fmla z22.s, z29.s, z4.s[3]\n"
+    "fmla z21.s, z29.s, z5.s[0]\n"
+    "fmla z20.s, z29.s, z5.s[1]\n"
+    "fmla z19.s, z29.s, z6.s[2]\n"
+    "fmla z18.s, z29.s, z6.s[3]\n"
+    "fmla z17.s, z29.s, z7.s[0]\n"
+    "fmla z16.s, z29.s, z7.s[1]\n"
+    "ld1w { z29.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "fmla z23.s, z28.s, z4.s[3]\n"
+    "fmla z22.s, z28.s, z5.s[0]\n"
+    "fmla z21.s, z28.s, z5.s[1]\n"
+    "fmla z20.s, z28.s, z5.s[2]\n"
+    "fmla z19.s, z28.s, z6.s[3]\n"
+    "fmla z18.s, z28.s, z7.s[0]\n"
+    "fmla z17.s, z28.s, z7.s[1]\n"
+    "fmla z16.s, z28.s, z7.s[2]\n"
+    "ld1w { z28.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+    "fmla z23.s, z27.s, z5.s[0]\n"
+    "fmla z22.s, z27.s, z5.s[1]\n"
+    "fmla z21.s, z27.s, z5.s[2]\n"
+    "fmla z20.s, z27.s, z5.s[3]\n"
+    "fmla z19.s, z27.s, z7.s[0]\n"
+    "fmla z18.s, z27.s, z7.s[1]\n"
+    "fmla z17.s, z27.s, z7.s[2]\n"
+    "fmla z16.s, z27.s, z7.s[3]\n"
+    "ld1w { z27.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+    "fmla z23.s, z31.s, z6.s[0]\n"
+    "fmla z22.s, z31.s, z6.s[1]\n"
+    "fmla z21.s, z31.s, z6.s[2]\n"
+    "fmla z20.s, z31.s, z6.s[3]\n"
+    "fmla z19.s, z31.s, z0.s[0]\n"
+    "fmla z18.s, z31.s, z0.s[1]\n"
+    "fmla z17.s, z31.s, z0.s[2]\n"
+    "fmla z16.s, z31.s, z0.s[3]\n"
+    "ld1w { z31.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
+    "fmla z23.s, z30.s, z6.s[1]\n"
+    "fmla z22.s, z30.s, z6.s[2]\n"
+    "fmla z21.s, z30.s, z6.s[3]\n"
+    "fmla z20.s, z30.s, z7.s[0]\n"
+    "fmla z19.s, z30.s, z0.s[1]\n"
+    "fmla z18.s, z30.s, z0.s[2]\n"
+    "fmla z17.s, z30.s, z0.s[3]\n"
+    "fmla z16.s, z30.s, z1.s[0]\n"
+    "ld1w { z30.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
+    "fmla z23.s, z29.s, z6.s[2]\n"
+    "fmla z22.s, z29.s, z6.s[3]\n"
+    "fmla z21.s, z29.s, z7.s[0]\n"
+    "fmla z20.s, z29.s, z7.s[1]\n"
+    "fmla z19.s, z29.s, z0.s[2]\n"
+    "fmla z18.s, z29.s, z0.s[3]\n"
+    "fmla z17.s, z29.s, z1.s[0]\n"
+    "fmla z16.s, z29.s, z1.s[1]\n"
+    "ld1w { z29.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+    "fmla z23.s, z28.s, z6.s[3]\n"
+    "fmla z22.s, z28.s, z7.s[0]\n"
+    "fmla z21.s, z28.s, z7.s[1]\n"
+    "fmla z20.s, z28.s, z7.s[2]\n"
+    "fmla z19.s, z28.s, z0.s[3]\n"
+    "fmla z18.s, z28.s, z1.s[0]\n"
+    "fmla z17.s, z28.s, z1.s[1]\n"
+    "fmla z16.s, z28.s, z1.s[2]\n"
+    "ld1w { z28.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+    "fmla z23.s, z27.s, z7.s[0]\n"
+    "fmla z22.s, z27.s, z7.s[1]\n"
+    "fmla z21.s, z27.s, z7.s[2]\n"
+    "fmla z20.s, z27.s, z7.s[3]\n"
+    "fmla z19.s, z27.s, z1.s[0]\n"
+    "fmla z18.s, z27.s, z1.s[1]\n"
+    "fmla z17.s, z27.s, z1.s[2]\n"
+    "fmla z16.s, z27.s, z1.s[3]\n"
+    "ld1w { z27.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+    "fmla z23.s, z31.s, z0.s[0]\n"
+    "fmla z22.s, z31.s, z0.s[1]\n"
+    "fmla z21.s, z31.s, z0.s[2]\n"
+    "fmla z20.s, z31.s, z0.s[3]\n"
+    "mov z0.d, z10.d\n"
+    "fmla z19.s, z31.s, z0.s[0]\n"
+    "fmla z18.s, z31.s, z0.s[1]\n"
+    "fmla z17.s, z31.s, z0.s[2]\n"
+    "fmla z16.s, z31.s, z0.s[3]\n"
+    "ld1w { z31.s }, p2/Z, [%x[params], #-1, MUL VL]\n"
+    "mov z0.d, z8.d\n"
+    "fmla z23.s, z30.s, z0.s[1]\n"
+    "fmla z22.s, z30.s, z0.s[2]\n"
+    "fmla z21.s, z30.s, z0.s[3]\n"
+    "mov z0.d, z10.d\n"
+    "fmla z20.s, z30.s, z1.s[0]\n"
+    "mov z1.d, z11.d\n"
+    "fmla z19.s, z30.s, z0.s[1]\n"
+    "fmla z18.s, z30.s, z0.s[2]\n"
+    "fmla z17.s, z30.s, z0.s[3]\n"
+    "mov z0.d, z8.d\n"
+    "fmla z16.s, z30.s, z1.s[0]\n"
+    "ld1w { z30.s }, p2/Z, [%x[params]]\n"
+    "mov z1.d, z9.d\n"
+    "fmla z23.s, z29.s, z0.s[2]\n"
+    "fmla z22.s, z29.s, z0.s[3]\n"
+    "mov z0.d, z10.d\n"
+    "fmla z21.s, z29.s, z1.s[0]\n"
+    "fmla z20.s, z29.s, z1.s[1]\n"
+    "mov z1.d, z11.d\n"
+    "fmla z19.s, z29.s, z0.s[2]\n"
+    "fmla z18.s, z29.s, z0.s[3]\n"
+    "mov z0.d, z8.d\n"
+    "fmla z17.s, z29.s, z1.s[0]\n"
+    "fmla z16.s, z29.s, z1.s[1]\n"
+    "ld1w { z29.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "mov z1.d, z9.d\n"
+    "fmla z23.s, z28.s, z0.s[3]\n"
+    "mov z0.d, z10.d\n"
+    "fmla z22.s, z28.s, z1.s[0]\n"
+    "fmla z21.s, z28.s, z1.s[1]\n"
+    "fmla z20.s, z28.s, z1.s[2]\n"
+    "mov z1.d, z11.d\n"
+    "fmla z19.s, z28.s, z0.s[3]\n"
+    "fmla z18.s, z28.s, z1.s[0]\n"
+    "fmla z17.s, z28.s, z1.s[1]\n"
+    "fmla z16.s, z28.s, z1.s[2]\n"
+    "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "mov z1.d, z9.d\n"
+    "fmla z23.s, z27.s, z1.s[0]\n"
+    "fmla z22.s, z27.s, z1.s[1]\n"
+    "fmla z21.s, z27.s, z1.s[2]\n"
+    "fmla z20.s, z27.s, z1.s[3]\n"
+    "mov z1.d, z11.d\n"
+    "fmla z19.s, z27.s, z1.s[0]\n"
+    "fmla z18.s, z27.s, z1.s[1]\n"
+    "fmla z17.s, z27.s, z1.s[2]\n"
+    "fmla z16.s, z27.s, z1.s[3]\n"
+    "ld1w { z27.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "fmla z23.s, z31.s, z0.s[0]\n"
+    "fmla z22.s, z31.s, z0.s[1]\n"
+    "fmla z21.s, z31.s, z0.s[2]\n"
+    "fmla z20.s, z31.s, z0.s[3]\n"
+    "mov z0.d, z12.d\n"
+    "fmla z19.s, z31.s, z0.s[0]\n"
+    "fmla z18.s, z31.s, z0.s[1]\n"
+    "fmla z17.s, z31.s, z0.s[2]\n"
+    "fmla z16.s, z31.s, z0.s[3]\n"
+    "ld1w { z31.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+    "mov z0.d, z10.d\n"
+    "fmla z23.s, z30.s, z0.s[1]\n"
+    "fmla z22.s, z30.s, z0.s[2]\n"
+    "fmla z21.s, z30.s, z0.s[3]\n"
+    "mov z0.d, z12.d\n"
+    "fmla z20.s, z30.s, z1.s[0]\n"
+    "mov z1.d, z13.d\n"
+    "fmla z19.s, z30.s, z0.s[1]\n"
+    "fmla z18.s, z30.s, z0.s[2]\n"
+    "fmla z17.s, z30.s, z0.s[3]\n"
+    "mov z0.d, z10.d\n"
+    "fmla z16.s, z30.s, z1.s[0]\n"
+    "ld1w { z30.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+    "mov z1.d, z11.d\n"
+    "fmla z23.s, z29.s, z0.s[2]\n"
+    "fmla z22.s, z29.s, z0.s[3]\n"
+    "mov z0.d, z12.d\n"
+    "fmla z21.s, z29.s, z1.s[0]\n"
+    "fmla z20.s, z29.s, z1.s[1]\n"
+    "mov z1.d, z13.d\n"
+    "fmla z19.s, z29.s, z0.s[2]\n"
+    "fmla z18.s, z29.s, z0.s[3]\n"
+    "mov z0.d, z10.d\n"
+    "fmla z17.s, z29.s, z1.s[0]\n"
+    "fmla z16.s, z29.s, z1.s[1]\n"
+    "ld1w { z29.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+    "mov z1.d, z11.d\n"
+    "fmla z23.s, z28.s, z0.s[3]\n"
+    "mov z0.d, z12.d\n"
+    "fmla z22.s, z28.s, z1.s[0]\n"
+    "fmla z21.s, z28.s, z1.s[1]\n"
+    "fmla z20.s, z28.s, z1.s[2]\n"
+    "mov z1.d, z13.d\n"
+    "fmla z19.s, z28.s, z0.s[3]\n"
+    "fmla z18.s, z28.s, z1.s[0]\n"
+    "fmla z17.s, z28.s, z1.s[1]\n"
+    "fmla z16.s, z28.s, z1.s[2]\n"
+    "mov z1.d, z11.d\n"
+    "fmla z23.s, z27.s, z1.s[0]\n"
+    "fmla z22.s, z27.s, z1.s[1]\n"
+    "fmla z21.s, z27.s, z1.s[2]\n"
+    "fmla z20.s, z27.s, z1.s[3]\n"
+    "mov z1.d, z13.d\n"
+    "fmla z19.s, z27.s, z1.s[0]\n"
+    "fmla z18.s, z27.s, z1.s[1]\n"
+    "fmla z17.s, z27.s, z1.s[2]\n"
+    "fmla z16.s, z27.s, z1.s[3]\n"
+    "fmin z23.s, p2/M, z23.s, z24.s\n"
+    "fmin z22.s, p2/M, z22.s, z24.s\n"
+    "fmin z21.s, p2/M, z21.s, z24.s\n"
+    "fmin z20.s, p2/M, z20.s, z24.s\n"
+    "fmax z23.s, p2/M, z23.s, z25.s\n"
+    "st1w { z23.s }, p0, [x11, x24, LSL #2]\n"
+    "fmax z22.s, p2/M, z22.s, z25.s\n"
+    "fmax z21.s, p2/M, z21.s, z25.s\n"
+    "ld1w { z23.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "fmax z20.s, p2/M, z20.s, z25.s\n"
+    "ld1w { z28.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+    "fmin z19.s, p2/M, z19.s, z24.s\n"
+    "ld1w { z27.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+    "addvl %x[params], %x[params], #-6\n"
+    "fmin z18.s, p2/M, z18.s, z24.s\n"
+    "st1w { z22.s }, p0, [x10, x24, LSL #2]\n"
+    "mov z22.d, z23.d\n"
+    "st1w { z21.s }, p0, [x9, x24, LSL #2]\n"
+    "mov z21.d, z23.d\n"
+    "st1w { z20.s }, p0, [x28, x24, LSL #2]\n"
+    "mov z20.d, z23.d\n"
+    "fmax z19.s, p2/M, z19.s, z25.s\n"
+    "st1w { z19.s }, p0, [x26, x24, LSL #2]\n"
+    "mov z19.d, z23.d\n"
+    "fmax z18.s, p2/M, z18.s, z25.s\n"
+    "st1w { z18.s }, p0, [x25, x24, LSL #2]\n"
+    "mov z18.d, z23.d\n"
+    "fmin z17.s, p2/M, z17.s, z24.s\n"
+    "fmin z16.s, p2/M, z16.s, z24.s\n"
+    "fmax z17.s, p2/M, z17.s, z25.s\n"
+    "st1w { z17.s }, p0, [x23, x24, LSL #2]\n"
+    "mov z17.d, z23.d\n"
+    "fmax z16.s, p2/M, z16.s, z25.s\n"
+    "st1w { z16.s }, p0, [x22, x24, LSL #2]\n"
+    "mov z16.d, z23.d\n"
+    "incw x24\n"
+    "b.any 1b\n"
+    : [params] "+&r" (params)
+    : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+    : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
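
The epilogue above clamps each accumulator with paired fmin/fmax against the
broadcast activation bounds before storing. A minimal scalar sketch of that
step (a hypothetical helper, not part of the generated kernels):

    #include <algorithm>

    // Scalar equivalent of the fmin/fmax epilogue: clamp each accumulator to
    // [activation_min, activation_max] before the store.
    static inline float clamp_activation(float v, float activation_min, float activation_max)
    {
        return std::max(activation_min, std::min(v, activation_max));
    }
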
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000..7a4bd1d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+struct sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*kern_type)(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int output_rows(void) { return 2; }
+  constexpr static unsigned int output_cols(void) { return 8; }
+
+  constexpr static unsigned int output_col_regs(void) { return 2; }
+
+  kern_type kernel = sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+  sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
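
A hedged sketch of how a caller might drive this strategy through its
kern_type member; the variable names (cpu_info, packed_weights, and so on)
are hypothetical, and the pointer-array shapes follow the 2x8 output tile
implied by output_rows()/output_cols():

    // Hypothetical call site; argument roles inferred from the typedef above.
    sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst strat(cpu_info);
    strat.kernel(inptrs,            // input row pointers, consumed two per kernel point
                 outptrs,           // 16 output pointers (2 rows x 8 columns)
                 packed_weights,    // one vector of weights per kernel point
                 bias,              // may be nullptr; the kernel zero-initialises
                 kernel_points, n_output_channels,
                 activation_min, activation_max);
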
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..0124370
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,454 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+  const float *const *const inptrs,
+  float *const *const outptrs,
+  const float *weights,
+  const float *bias,
+  const unsigned int kernel_points,
+  const unsigned int n_output_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  const float minmax_vals[2] = { activation_min, activation_max };
+
+  __asm__ __volatile__(
+    "ptrue p1.b\n"
+    "ld1rw { z11.s }, p1/Z, [%x[minmax_vals]]\n"
+    "mov x28, #0x0\n"
+    "ld1rw { z10.s }, p1/Z, [%x[minmax_vals], #4]\n"
+    "whilelt p0.s, x28, %x[n_output_channels]\n"
+    "1:"  // Output channel loop
+    "mov z16.b, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "ld1w { z16.s }, p0/Z, [%x[bias], x28, LSL #2]\n"
+    "2:"  // Output channel loop: Load bias: Done
+    "mov z9.d, z16.d\n"
+    "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+    "mov x20, %x[inptrs]\n"
+    "mov z31.d, z16.d\n"
+    "ldp x24, x27, [x20], #0x10\n"
+    "lsr x19, %x[kernel_points], #0x1\n"
+    "mov z30.d, z16.d\n"
+    "ld1rqw { z7.s }, p1/Z, [x24]\n"
+    "mov z29.d, z16.d\n"
+    "addvl %x[weights], %x[weights], #1\n"
+    "mov z28.d, z16.d\n"
+    "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
+    "mov z27.d, z16.d\n"
+    "ld1rqw { z5.s }, p1/Z, [x27]\n"
+    "mov z26.d, z16.d\n"
+    "ld1rqw { z4.s }, p1/Z, [x27, #16]\n"
+    "mov z25.d, z16.d\n"
+    "mov z24.d, z16.d\n"
+    "mov z23.d, z16.d\n"
+    "mov z22.d, z16.d\n"
+    "mov z21.d, z16.d\n"
+    "mov z20.d, z16.d\n"
+    "mov z19.d, z16.d\n"
+    "mov z18.d, z16.d\n"
+    "mov z17.d, z16.d\n"
+    "cbz x19, 6f\n"
+    "ldp x24, x27, [x20], #0x10\n"
+    "ld1w { z16.s }, p1/Z, [%x[weights]]\n"
+    "subs x19, x19, #0x1\n"
+    "addvl %x[weights], %x[weights], #1\n"
+    "ld1rqw { z3.s }, p1/Z, [x24]\n"
+    "ld1rqw { z2.s }, p1/Z, [x24, #16]\n"
+    "ld1rqw { z1.s }, p1/Z, [x27]\n"
+    "ld1rqw { z0.s }, p1/Z, [x27, #16]\n"
+    "beq 4f\n"
+    "3:"  // Output channel loop: Kernel loop
+    "fmla z9.s, z8.s, z7.s[0]\n"
+    "ldp x24, x27, [x20], #0x10\n"
+    "subs x19, x19, #0x1\n"
+    "fmla z31.s, z8.s, z7.s[1]\n"
+    "fmla z30.s, z8.s, z7.s[2]\n"
+    "fmla z29.s, z8.s, z7.s[3]\n"
+    "ld1rqw { z7.s }, p1/Z, [x24]\n"
+    "fmla z28.s, z8.s, z6.s[0]\n"
+    "fmla z27.s, z8.s, z6.s[1]\n"
+    "fmla z26.s, z8.s, z6.s[2]\n"
+    "fmla z25.s, z8.s, z6.s[3]\n"
+    "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
+    "fmla z24.s, z8.s, z5.s[0]\n"
+    "fmla z23.s, z8.s, z5.s[1]\n"
+    "fmla z22.s, z8.s, z5.s[2]\n"
+    "fmla z21.s, z8.s, z5.s[3]\n"
+    "ld1rqw { z5.s }, p1/Z, [x27]\n"
+    "fmla z20.s, z8.s, z4.s[0]\n"
+    "fmla z19.s, z8.s, z4.s[1]\n"
+    "fmla z18.s, z8.s, z4.s[2]\n"
+    "fmla z17.s, z8.s, z4.s[3]\n"
+    "ld1rqw { z4.s }, p1/Z, [x27, #16]\n"
+    "fmla z9.s, z16.s, z3.s[0]\n"
+    "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+    "fmla z31.s, z16.s, z3.s[1]\n"
+    "ldp x24, x27, [x20], #0x10\n"
+    "fmla z30.s, z16.s, z3.s[2]\n"
+    "fmla z29.s, z16.s, z3.s[3]\n"
+    "ld1rqw { z3.s }, p1/Z, [x24]\n"
+    "fmla z28.s, z16.s, z2.s[0]\n"
+    "fmla z27.s, z16.s, z2.s[1]\n"
+    "fmla z26.s, z16.s, z2.s[2]\n"
+    "fmla z25.s, z16.s, z2.s[3]\n"
+    "ld1rqw { z2.s }, p1/Z, [x24, #16]\n"
+    "fmla z24.s, z16.s, z1.s[0]\n"
+    "fmla z23.s, z16.s, z1.s[1]\n"
+    "fmla z22.s, z16.s, z1.s[2]\n"
+    "fmla z21.s, z16.s, z1.s[3]\n"
+    "ld1rqw { z1.s }, p1/Z, [x27]\n"
+    "fmla z20.s, z16.s, z0.s[0]\n"
+    "fmla z19.s, z16.s, z0.s[1]\n"
+    "fmla z18.s, z16.s, z0.s[2]\n"
+    "fmla z17.s, z16.s, z0.s[3]\n"
+    "ld1rqw { z0.s }, p1/Z, [x27, #16]\n"
+    "ld1w { z16.s }, p1/Z, [%x[weights], #1, MUL VL]\n"
+    "addvl %x[weights], %x[weights], #2\n"
+    "bgt 3b\n"
+    "4:"  // Output channel loop: Kernel loop tail
+    "tbnz %x[kernel_points], #0, 5f\n"
+    "fmla z9.s, z8.s, z7.s[0]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "fmla z31.s, z8.s, z7.s[1]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "fmla z30.s, z8.s, z7.s[2]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "fmla z29.s, z8.s, z7.s[3]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "fmla z28.s, z8.s, z6.s[0]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "fmla z27.s, z8.s, z6.s[1]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "fmla z26.s, z8.s, z6.s[2]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "fmla z25.s, z8.s, z6.s[3]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "fmla z24.s, z8.s, z5.s[0]\n"
+    "fmla z23.s, z8.s, z5.s[1]\n"
+    "fmla z22.s, z8.s, z5.s[2]\n"
+    "fmla z21.s, z8.s, z5.s[3]\n"
+    "fmla z20.s, z8.s, z4.s[0]\n"
+    "fmla z19.s, z8.s, z4.s[1]\n"
+    "fmla z18.s, z8.s, z4.s[2]\n"
+    "fmla z17.s, z8.s, z4.s[3]\n"
+    "fmla z9.s, z16.s, z3.s[0]\n"
+    "fmla z31.s, z16.s, z3.s[1]\n"
+    "fmla z30.s, z16.s, z3.s[2]\n"
+    "fmla z29.s, z16.s, z3.s[3]\n"
+    "fmla z28.s, z16.s, z2.s[0]\n"
+    "fmla z27.s, z16.s, z2.s[1]\n"
+    "fmla z26.s, z16.s, z2.s[2]\n"
+    "fmla z25.s, z16.s, z2.s[3]\n"
+    "fmla z24.s, z16.s, z1.s[0]\n"
+    "fmla z23.s, z16.s, z1.s[1]\n"
+    "fmla z22.s, z16.s, z1.s[2]\n"
+    "fmla z21.s, z16.s, z1.s[3]\n"
+    "fmla z20.s, z16.s, z0.s[0]\n"
+    "fmla z19.s, z16.s, z0.s[1]\n"
+    "fmla z18.s, z16.s, z0.s[2]\n"
+    "fmla z17.s, z16.s, z0.s[3]\n"
+    "fmin z9.s, p1/M, z9.s, z10.s\n"
+    "fmin z31.s, p1/M, z31.s, z10.s\n"
+    "fmin z30.s, p1/M, z30.s, z10.s\n"
+    "fmin z29.s, p1/M, z29.s, z10.s\n"
+    "fmax z9.s, p1/M, z9.s, z11.s\n"
+    "st1w { z9.s }, p0, [x19, x28, LSL #2]\n"
+    "fmax z31.s, p1/M, z31.s, z11.s\n"
+    "fmax z30.s, p1/M, z30.s, z11.s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "fmax z29.s, p1/M, z29.s, z11.s\n"
+    "st1w { z31.s }, p0, [x20, x28, LSL #2]\n"
+    "fmin z28.s, p1/M, z28.s, z10.s\n"
+    "fmin z27.s, p1/M, z27.s, z10.s\n"
+    "st1w { z30.s }, p0, [x21, x28, LSL #2]\n"
+    "fmin z26.s, p1/M, z26.s, z10.s\n"
+    "st1w { z29.s }, p0, [x22, x28, LSL #2]\n"
+    "fmin z25.s, p1/M, z25.s, z10.s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "fmin z24.s, p1/M, z24.s, z10.s\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "fmax z28.s, p1/M, z28.s, z11.s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "fmax z27.s, p1/M, z27.s, z11.s\n"
+    "st1w { z28.s }, p0, [x23, x28, LSL #2]\n"
+    "fmax z26.s, p1/M, z26.s, z11.s\n"
+    "fmax z25.s, p1/M, z25.s, z11.s\n"
+    "st1w { z27.s }, p0, [x24, x28, LSL #2]\n"
+    "fmax z24.s, p1/M, z24.s, z11.s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "fmin z23.s, p1/M, z23.s, z10.s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "fmin z22.s, p1/M, z22.s, z10.s\n"
+    "st1w { z26.s }, p0, [x25, x28, LSL #2]\n"
+    "fmin z21.s, p1/M, z21.s, z10.s\n"
+    "st1w { z25.s }, p0, [x26, x28, LSL #2]\n"
+    "fmin z20.s, p1/M, z20.s, z10.s\n"
+    "st1w { z24.s }, p0, [x19, x28, LSL #2]\n"
+    "fmax z23.s, p1/M, z23.s, z11.s\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "fmax z22.s, p1/M, z22.s, z11.s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "fmax z21.s, p1/M, z21.s, z11.s\n"
+    "st1w { z23.s }, p0, [x20, x28, LSL #2]\n"
+    "fmax z20.s, p1/M, z20.s, z11.s\n"
+    "fmin z19.s, p1/M, z19.s, z10.s\n"
+    "st1w { z22.s }, p0, [x21, x28, LSL #2]\n"
+    "fmin z18.s, p1/M, z18.s, z10.s\n"
+    "st1w { z21.s }, p0, [x22, x28, LSL #2]\n"
+    "fmin z17.s, p1/M, z17.s, z10.s\n"
+    "st1w { z20.s }, p0, [x23, x28, LSL #2]\n"
+    "fmax z19.s, p1/M, z19.s, z11.s\n"
+    "fmax z18.s, p1/M, z18.s, z11.s\n"
+    "st1w { z19.s }, p0, [x24, x28, LSL #2]\n"
+    "fmax z17.s, p1/M, z17.s, z11.s\n"
+    "st1w { z18.s }, p0, [x25, x28, LSL #2]\n"
+    "st1w { z17.s }, p0, [x26, x28, LSL #2]\n"
+    "b 7f\n"
+    "5:"  // Output channel loop: Odd tail
+    "fmla z9.s, z8.s, z7.s[0]\n"
+    "ldp x24, x27, [x20], #0x10\n"
+    "fmla z31.s, z8.s, z7.s[1]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "fmla z30.s, z8.s, z7.s[2]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "fmla z29.s, z8.s, z7.s[3]\n"
+    "ld1rqw { z7.s }, p1/Z, [x24]\n"
+    "fmla z28.s, z8.s, z6.s[0]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "fmla z27.s, z8.s, z6.s[1]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "fmla z26.s, z8.s, z6.s[2]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "fmla z25.s, z8.s, z6.s[3]\n"
+    "ld1rqw { z6.s }, p1/Z, [x24, #16]\n"
+    "fmla z24.s, z8.s, z5.s[0]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "fmla z23.s, z8.s, z5.s[1]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "fmla z22.s, z8.s, z5.s[2]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "fmla z21.s, z8.s, z5.s[3]\n"
+    "ld1rqw { z5.s }, p1/Z, [x27]\n"
+    "fmla z20.s, z8.s, z4.s[0]\n"
+    "fmla z19.s, z8.s, z4.s[1]\n"
+    "fmla z18.s, z8.s, z4.s[2]\n"
+    "fmla z17.s, z8.s, z4.s[3]\n"
+    "ld1rqw { z4.s }, p1/Z, [x27, #16]\n"
+    "fmla z9.s, z16.s, z3.s[0]\n"
+    "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+    "addvl %x[weights], %x[weights], #1\n"
+    "fmla z31.s, z16.s, z3.s[1]\n"
+    "fmla z30.s, z16.s, z3.s[2]\n"
+    "fmla z29.s, z16.s, z3.s[3]\n"
+    "fmla z28.s, z16.s, z2.s[0]\n"
+    "fmla z27.s, z16.s, z2.s[1]\n"
+    "fmla z26.s, z16.s, z2.s[2]\n"
+    "fmla z25.s, z16.s, z2.s[3]\n"
+    "fmla z24.s, z16.s, z1.s[0]\n"
+    "fmla z23.s, z16.s, z1.s[1]\n"
+    "fmla z22.s, z16.s, z1.s[2]\n"
+    "fmla z21.s, z16.s, z1.s[3]\n"
+    "fmla z20.s, z16.s, z0.s[0]\n"
+    "fmla z19.s, z16.s, z0.s[1]\n"
+    "fmla z18.s, z16.s, z0.s[2]\n"
+    "fmla z17.s, z16.s, z0.s[3]\n"
+    "fmla z9.s, z8.s, z7.s[0]\n"
+    "fmla z31.s, z8.s, z7.s[1]\n"
+    "fmla z30.s, z8.s, z7.s[2]\n"
+    "fmla z29.s, z8.s, z7.s[3]\n"
+    "fmla z28.s, z8.s, z6.s[0]\n"
+    "fmla z27.s, z8.s, z6.s[1]\n"
+    "fmla z26.s, z8.s, z6.s[2]\n"
+    "fmla z25.s, z8.s, z6.s[3]\n"
+    "fmla z24.s, z8.s, z5.s[0]\n"
+    "fmla z23.s, z8.s, z5.s[1]\n"
+    "fmla z22.s, z8.s, z5.s[2]\n"
+    "fmla z21.s, z8.s, z5.s[3]\n"
+    "fmla z20.s, z8.s, z4.s[0]\n"
+    "fmla z19.s, z8.s, z4.s[1]\n"
+    "fmla z18.s, z8.s, z4.s[2]\n"
+    "fmla z17.s, z8.s, z4.s[3]\n"
+    "fmin z9.s, p1/M, z9.s, z10.s\n"
+    "fmin z31.s, p1/M, z31.s, z10.s\n"
+    "fmin z30.s, p1/M, z30.s, z10.s\n"
+    "fmin z29.s, p1/M, z29.s, z10.s\n"
+    "fmax z9.s, p1/M, z9.s, z11.s\n"
+    "st1w { z9.s }, p0, [x19, x28, LSL #2]\n"
+    "fmax z31.s, p1/M, z31.s, z11.s\n"
+    "fmax z30.s, p1/M, z30.s, z11.s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "fmax z29.s, p1/M, z29.s, z11.s\n"
+    "st1w { z31.s }, p0, [x20, x28, LSL #2]\n"
+    "fmin z28.s, p1/M, z28.s, z10.s\n"
+    "fmin z27.s, p1/M, z27.s, z10.s\n"
+    "st1w { z30.s }, p0, [x21, x28, LSL #2]\n"
+    "fmin z26.s, p1/M, z26.s, z10.s\n"
+    "st1w { z29.s }, p0, [x22, x28, LSL #2]\n"
+    "fmin z25.s, p1/M, z25.s, z10.s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "fmin z24.s, p1/M, z24.s, z10.s\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "fmax z28.s, p1/M, z28.s, z11.s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "fmax z27.s, p1/M, z27.s, z11.s\n"
+    "st1w { z28.s }, p0, [x23, x28, LSL #2]\n"
+    "fmax z26.s, p1/M, z26.s, z11.s\n"
+    "fmax z25.s, p1/M, z25.s, z11.s\n"
+    "st1w { z27.s }, p0, [x24, x28, LSL #2]\n"
+    "fmax z24.s, p1/M, z24.s, z11.s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "fmin z23.s, p1/M, z23.s, z10.s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "fmin z22.s, p1/M, z22.s, z10.s\n"
+    "st1w { z26.s }, p0, [x25, x28, LSL #2]\n"
+    "fmin z21.s, p1/M, z21.s, z10.s\n"
+    "st1w { z25.s }, p0, [x26, x28, LSL #2]\n"
+    "fmin z20.s, p1/M, z20.s, z10.s\n"
+    "st1w { z24.s }, p0, [x19, x28, LSL #2]\n"
+    "fmax z23.s, p1/M, z23.s, z11.s\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "fmax z22.s, p1/M, z22.s, z11.s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "fmax z21.s, p1/M, z21.s, z11.s\n"
+    "st1w { z23.s }, p0, [x20, x28, LSL #2]\n"
+    "fmax z20.s, p1/M, z20.s, z11.s\n"
+    "fmin z19.s, p1/M, z19.s, z10.s\n"
+    "st1w { z22.s }, p0, [x21, x28, LSL #2]\n"
+    "fmin z18.s, p1/M, z18.s, z10.s\n"
+    "st1w { z21.s }, p0, [x22, x28, LSL #2]\n"
+    "fmin z17.s, p1/M, z17.s, z10.s\n"
+    "st1w { z20.s }, p0, [x23, x28, LSL #2]\n"
+    "fmax z19.s, p1/M, z19.s, z11.s\n"
+    "fmax z18.s, p1/M, z18.s, z11.s\n"
+    "st1w { z19.s }, p0, [x24, x28, LSL #2]\n"
+    "fmax z17.s, p1/M, z17.s, z11.s\n"
+    "st1w { z18.s }, p0, [x25, x28, LSL #2]\n"
+    "st1w { z17.s }, p0, [x26, x28, LSL #2]\n"
+    "b 7f\n"
+    "6:"  // Output channel loop: Single kernel point
+    "fmla z9.s, z8.s, z7.s[0]\n"
+    "ldr x19, [%x[outptrs], #0x0]\n"
+    "fmla z31.s, z8.s, z7.s[1]\n"
+    "ldr x20, [%x[outptrs], #0x8]\n"
+    "fmla z30.s, z8.s, z7.s[2]\n"
+    "ldr x21, [%x[outptrs], #0x10]\n"
+    "fmla z29.s, z8.s, z7.s[3]\n"
+    "ldr x22, [%x[outptrs], #0x18]\n"
+    "fmla z28.s, z8.s, z6.s[0]\n"
+    "ldr x23, [%x[outptrs], #0x20]\n"
+    "fmla z27.s, z8.s, z6.s[1]\n"
+    "ldr x24, [%x[outptrs], #0x28]\n"
+    "fmla z26.s, z8.s, z6.s[2]\n"
+    "ldr x25, [%x[outptrs], #0x30]\n"
+    "fmla z25.s, z8.s, z6.s[3]\n"
+    "ldr x26, [%x[outptrs], #0x38]\n"
+    "fmla z24.s, z8.s, z5.s[0]\n"
+    "fmla z23.s, z8.s, z5.s[1]\n"
+    "fmla z22.s, z8.s, z5.s[2]\n"
+    "fmla z21.s, z8.s, z5.s[3]\n"
+    "fmla z20.s, z8.s, z4.s[0]\n"
+    "fmla z19.s, z8.s, z4.s[1]\n"
+    "fmla z18.s, z8.s, z4.s[2]\n"
+    "fmla z17.s, z8.s, z4.s[3]\n"
+    "fmin z9.s, p1/M, z9.s, z10.s\n"
+    "fmin z31.s, p1/M, z31.s, z10.s\n"
+    "fmin z30.s, p1/M, z30.s, z10.s\n"
+    "fmin z29.s, p1/M, z29.s, z10.s\n"
+    "fmax z9.s, p1/M, z9.s, z11.s\n"
+    "st1w { z9.s }, p0, [x19, x28, LSL #2]\n"
+    "fmax z31.s, p1/M, z31.s, z11.s\n"
+    "fmax z30.s, p1/M, z30.s, z11.s\n"
+    "ldr x19, [%x[outptrs], #0x40]\n"
+    "fmax z29.s, p1/M, z29.s, z11.s\n"
+    "st1w { z31.s }, p0, [x20, x28, LSL #2]\n"
+    "fmin z28.s, p1/M, z28.s, z10.s\n"
+    "fmin z27.s, p1/M, z27.s, z10.s\n"
+    "st1w { z30.s }, p0, [x21, x28, LSL #2]\n"
+    "fmin z26.s, p1/M, z26.s, z10.s\n"
+    "st1w { z29.s }, p0, [x22, x28, LSL #2]\n"
+    "fmin z25.s, p1/M, z25.s, z10.s\n"
+    "ldr x20, [%x[outptrs], #0x48]\n"
+    "fmin z24.s, p1/M, z24.s, z10.s\n"
+    "ldr x21, [%x[outptrs], #0x50]\n"
+    "fmax z28.s, p1/M, z28.s, z11.s\n"
+    "ldr x22, [%x[outptrs], #0x58]\n"
+    "fmax z27.s, p1/M, z27.s, z11.s\n"
+    "st1w { z28.s }, p0, [x23, x28, LSL #2]\n"
+    "fmax z26.s, p1/M, z26.s, z11.s\n"
+    "fmax z25.s, p1/M, z25.s, z11.s\n"
+    "st1w { z27.s }, p0, [x24, x28, LSL #2]\n"
+    "fmax z24.s, p1/M, z24.s, z11.s\n"
+    "ldr x23, [%x[outptrs], #0x60]\n"
+    "fmin z23.s, p1/M, z23.s, z10.s\n"
+    "ldr x24, [%x[outptrs], #0x68]\n"
+    "fmin z22.s, p1/M, z22.s, z10.s\n"
+    "st1w { z26.s }, p0, [x25, x28, LSL #2]\n"
+    "fmin z21.s, p1/M, z21.s, z10.s\n"
+    "st1w { z25.s }, p0, [x26, x28, LSL #2]\n"
+    "fmin z20.s, p1/M, z20.s, z10.s\n"
+    "st1w { z24.s }, p0, [x19, x28, LSL #2]\n"
+    "fmax z23.s, p1/M, z23.s, z11.s\n"
+    "ldr x25, [%x[outptrs], #0x70]\n"
+    "fmax z22.s, p1/M, z22.s, z11.s\n"
+    "ldr x26, [%x[outptrs], #0x78]\n"
+    "fmax z21.s, p1/M, z21.s, z11.s\n"
+    "st1w { z23.s }, p0, [x20, x28, LSL #2]\n"
+    "fmax z20.s, p1/M, z20.s, z11.s\n"
+    "fmin z19.s, p1/M, z19.s, z10.s\n"
+    "st1w { z22.s }, p0, [x21, x28, LSL #2]\n"
+    "fmin z18.s, p1/M, z18.s, z10.s\n"
+    "st1w { z21.s }, p0, [x22, x28, LSL #2]\n"
+    "fmin z17.s, p1/M, z17.s, z10.s\n"
+    "st1w { z20.s }, p0, [x23, x28, LSL #2]\n"
+    "fmax z19.s, p1/M, z19.s, z11.s\n"
+    "fmax z18.s, p1/M, z18.s, z11.s\n"
+    "st1w { z19.s }, p0, [x24, x28, LSL #2]\n"
+    "fmax z17.s, p1/M, z17.s, z11.s\n"
+    "st1w { z18.s }, p0, [x25, x28, LSL #2]\n"
+    "st1w { z17.s }, p0, [x26, x28, LSL #2]\n"
+    "7:"  // Output channel loop: Done
+    "incw x28\n"
+    "whilelt p0.s, x28, %x[n_output_channels]\n"
+    "b.any 1b\n"
+    : [weights] "+&r" (weights)
+    : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+    : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
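
For reference, a hedged scalar model of what the assembly above computes; the
input and weight indexing is an assumption made for illustration (each kernel
point reads two row pointers of eight input scalars, matching the ldp/ld1rqw
pairs, and weights advance by one vector of output channels per point):

    #include <algorithm>

    void reference_impl(const float *const *inptrs, float *const *outptrs,
                        const float *weights, const float *bias,
                        unsigned int kernel_points, unsigned int n_output_channels,
                        float activation_min, float activation_max)
    {
        for (unsigned int c = 0; c < n_output_channels; c++)
        {
            for (unsigned int o = 0; o < 16; o++)  // 2x8 output tile
            {
                float acc = (bias != nullptr) ? bias[c] : 0.0f;
                for (unsigned int k = 0; k < kernel_points; k++)
                {
                    const float in = inptrs[2 * k + o / 8][o % 8];    // assumed layout
                    acc += weights[k * n_output_channels + c] * in;   // assumed layout
                }
                outptrs[o][c] = std::max(activation_min, std::min(acc, activation_max));
            }
        }
    }
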
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000..295e1f6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_dot::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_dot::get_packed_size;
+
+  kern_type kernel = sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+  sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
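
A hedged sketch of the intended calling sequence, with names and argument
roles inferred from the typedefs above (ld_weight_col/ld_weight_row are
assumed to be weight strides; all variables are hypothetical):

    using Strategy = sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst;

    const size_t packed_bytes = Strategy::get_packed_size(args);  // size the buffer
    Strategy::pack_parameters(n_channels, packed_buffer, bias, weights, qp,
                              ld_weight_col, ld_weight_row);      // interleave once

    Strategy strat(cpu_info);
    strat.kernel(inptrs, outptrs, packed_buffer, n_channels, qp); // run per tile
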
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..90f924a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstddef>  // offsetof, used in the inline assembly operands below
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
+  __asm__ __volatile__(
+    "ldp x11, x10, [%x[inptrs], #0x0]\n"
+    "ptrue p2.b\n"
+    "ldp x9, x28, [%x[inptrs], #0x10]\n"
+    "addvl SP, SP, #-8\n"
+    "ldp x27, x26, [%x[inptrs], #0x20]\n"
+    "mov x19, #0x1\n"
+    "ldp x25, x24, [%x[inptrs], #0x30]\n"
+    "orr x19, x19, #0x100\n"
+    "ldp x23, x22, [%x[outptrs], #0x0]\n"
+    "orr x19, x19, #0x10000\n"
+    "dup z12.s, w19\n"
+    "ldp x21, x20, [%x[outptrs], #0x10]\n"
+    "mov x19, #0x0\n"
+    "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+    "whilelt p1.b, x19, %x[n_channels]\n"
+    "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+    "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+    "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+    "1:"  // Loop
+    "mov z7.s, #0x0\n"
+    "ld1b { z19.b }, p1/Z, [x11, x19]\n"
+    "whilelt p0.s, x19, %x[n_channels]\n"
+    "mov z6.s, #0x0\n"
+    "ld1b { z18.b }, p1/Z, [x10, x19]\n"
+    "ldp x11, x10, [%x[inptrs], #0x40]\n"
+    "ld1b { z16.b }, p1/Z, [x9, x19]\n"
+    "zip1 z21.b, z19.b, z16.b\n"
+    "ld1b { z17.b }, p1/Z, [x28, x19]\n"
+    "zip2 z19.b, z19.b, z16.b\n"
+    "ldp x9, x28, [%x[inptrs], #0x50]\n"
+    "ld1b { z23.b }, p1/Z, [x27, x19]\n"
+    "zip1 z16.b, z18.b, z17.b\n"
+    "ld1b { z20.b }, p1/Z, [x26, x19]\n"
+    "zip2 z18.b, z18.b, z17.b\n"
+    "ldp x27, x26, [%x[inptrs], #0x60]\n"
+    "zip1 z5.b, z21.b, z16.b\n"
+    "ld1b { z17.b }, p1/Z, [x25, x19]\n"
+    "zip2 z4.b, z21.b, z16.b\n"
+    "ld1b { z16.b }, p1/Z, [x24, x19]\n"
+    "zip1 z29.b, z19.b, z18.b\n"
+    "ldp x25, x24, [%x[inptrs], #0x70]\n"
+    "zip2 z28.b, z19.b, z18.b\n"
+    "ld1b { z22.b }, p1/Z, [x11, x19]\n"
+    "zip1 z19.b, z23.b, z17.b\n"
+    "ld1b { z21.b }, p1/Z, [x10, x19]\n"
+    "zip2 z27.b, z23.b, z17.b\n"
+    "ldp x11, x10, [%x[inptrs], #0x0]\n"
+    "zip1 z18.b, z20.b, z16.b\n"
+    "ld1b { z17.b }, p1/Z, [x9, x19]\n"
+    "zip2 z20.b, z20.b, z16.b\n"
+    "ld1b { z16.b }, p1/Z, [x28, x19]\n"
+    "zip1 z3.b, z19.b, z18.b\n"
+    "ldp x9, x28, [%x[inptrs], #0x10]\n"
+    "zip2 z2.b, z19.b, z18.b\n"
+    "ld1b { z19.b }, p1/Z, [x27, x19]\n"
+    "zip1 z26.b, z22.b, z17.b\n"
+    "ld1b { z25.b }, p1/Z, [x26, x19]\n"
+    "zip2 z24.b, z22.b, z17.b\n"
+    "ldp x27, x26, [%x[inptrs], #0x20]\n"
+    "zip1 z23.b, z21.b, z16.b\n"
+    "ld1b { z18.b }, p1/Z, [x25, x19]\n"
+    "zip2 z22.b, z21.b, z16.b\n"
+    "ld1b { z21.b }, p1/Z, [x24, x19]\n"
+    "zip1 z17.b, z27.b, z20.b\n"
+    "ldp x25, x24, [%x[inptrs], #0x30]\n"
+    "zip2 z16.b, z27.b, z20.b\n"
+    "st1b { z29.b }, p2, [SP]\n"
+    "zip1 z20.b, z19.b, z18.b\n"
+    "st1b { z28.b }, p2, [SP, #1, MUL VL]\n"
+    "zip2 z19.b, z19.b, z18.b\n"
+    "st1b { z17.b }, p2, [SP, #2, MUL VL]\n"
+    "zip1 z18.b, z25.b, z21.b\n"
+    "st1b { z16.b }, p2, [SP, #3, MUL VL]\n"
+    "zip2 z17.b, z25.b, z21.b\n"
+    "ld1w { z1.s }, p2/Z, [%x[params]]\n"
+    "zip1 z0.b, z26.b, z23.b\n"
+    "ld1b { z31.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "zip2 z30.b, z26.b, z23.b\n"
+    "ld1b { z29.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "zip1 z16.b, z24.b, z22.b\n"
+    "st1b { z16.b }, p2, [SP, #4, MUL VL]\n"
+    "zip2 z16.b, z24.b, z22.b\n"
+    "st1b { z16.b }, p2, [SP, #5, MUL VL]\n"
+    "zip1 z28.b, z20.b, z18.b\n"
+    "ld1b { z27.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "zip2 z26.b, z20.b, z18.b\n"
+    "ld1w { z25.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "zip1 z16.b, z19.b, z17.b\n"
+    "st1b { z16.b }, p2, [SP, #6, MUL VL]\n"
+    "zip2 z16.b, z19.b, z17.b\n"
+    "st1b { z16.b }, p2, [SP, #7, MUL VL]\n"
+    "mov z24.d, z1.d\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "mov z22.d, z1.d\n"
+    "mov z21.d, z1.d\n"
+    "sdot z1.s, z31.b, z5.b\n"
+    "sdot z22.s, z31.b, z3.b\n"
+    "sdot z7.s, z12.b, z3.b\n"
+    "sdot z1.s, z29.b, z3.b\n"
+    "ext z3.b, z3.b, z3.b, #0x1\n"
+    "sdot z22.s, z29.b, z0.b\n"
+    "sdot z7.s, z12.b, z0.b\n"
+    "sdot z1.s, z27.b, z0.b\n"
+    "ext z0.b, z0.b, z0.b, #0x1\n"
+    "sdot z22.s, z27.b, z28.b\n"
+    "mov z20.d, z7.d\n"
+    "sdot z7.s, z12.b, z5.b\n"
+    "sdot z20.s, z12.b, z28.b\n"
+    "ext z5.b, z5.b, z5.b, #0x1\n"
+    "ext z28.b, z28.b, z28.b, #0x1\n"
+    "sdot z21.s, z31.b, z3.b\n"
+    "sdot z6.s, z12.b, z3.b\n"
+    "sdot z24.s, z31.b, z5.b\n"
+    "ld1b { z31.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "mls z1.s, p2/M, z7.s, z9.s\n"
+    "sdot z21.s, z29.b, z0.b\n"
+    "sdot z6.s, z12.b, z0.b\n"
+    "sdot z24.s, z29.b, z3.b\n"
+    "ld1b { z3.b }, p2/Z, [SP, #2, MUL VL]\n"
+    ".inst 0x04b97421  // sqrdmulh z1.s, z1.s, z25.s\n"
+    "sdot z21.s, z27.b, z28.b\n"
+    "mov z19.d, z6.d\n"
+    "sdot z24.s, z27.b, z0.b\n"
+    "ld1b { z0.b }, p2/Z, [SP, #4, MUL VL]\n"
+    "sdot z6.s, z12.b, z5.b\n"
+    "ld1b { z5.b }, p2/Z, [SP]\n"
+    "sdot z19.s, z12.b, z28.b\n"
+    "ld1b { z28.b }, p2/Z, [SP, #6, MUL VL]\n"
+    "and z16.d, z1.d, z23.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "mov z7.s, #0x0\n"
+    "mls z24.s, p2/M, z6.s, z9.s\n"
+    "sdot z7.s, z12.b, z2.b\n"
+    "mov z6.s, #0x0\n"
+    "mls z22.s, p2/M, z20.s, z9.s\n"
+    ".inst 0x04b97718  // sqrdmulh z24.s, z24.s, z25.s\n"
+    "sqadd z1.s, z1.s, z16.s\n"
+    "sdot z7.s, z12.b, z30.b\n"
+    ".inst 0x04b976d6  // sqrdmulh z22.s, z22.s, z25.s\n"
+    "and z18.d, z24.d, z23.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "and z17.d, z22.d, z23.d\n"
+    "mov z20.d, z7.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "sdot z7.s, z12.b, z4.b\n"
+    "sdot z20.s, z12.b, z26.b\n"
+    "mls z21.s, p2/M, z19.s, z9.s\n"
+    "sqadd z24.s, z24.s, z18.s\n"
+    ".inst 0x44828ae1  // srshl z1.s, p2/M, z1.s, z23.s\n"
+    "sqadd z22.s, z22.s, z17.s\n"
+    ".inst 0x04b976b5  // sqrdmulh z21.s, z21.s, z25.s\n"
+    ".inst 0x44828af8  // srshl z24.s, p2/M, z24.s, z23.s\n"
+    "add z1.s, z1.s, z8.s\n"
+    "and z16.d, z21.d, z23.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "add z24.s, z24.s, z8.s\n"
+    "smax z1.s, p2/M, z1.s, z11.s\n"
+    ".inst 0x44828af6  // srshl z22.s, p2/M, z22.s, z23.s\n"
+    "smax z24.s, p2/M, z24.s, z11.s\n"
+    "smin z1.s, p2/M, z1.s, z10.s\n"
+    "st1b { z1.s }, p0, [x23, x19]\n"
+    "add z22.s, z22.s, z8.s\n"
+    "sqadd z21.s, z21.s, z16.s\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "smin z24.s, p2/M, z24.s, z10.s\n"
+    "ld1b { z29.b }, p2/Z, [%x[params], #-8, MUL VL]\n"
+    "ld1b { z27.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+    "smax z22.s, p2/M, z22.s, z11.s\n"
+    "ld1w { z25.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
+    ".inst 0x44828af5  // srshl z21.s, p2/M, z21.s, z23.s\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
+    "smin z22.s, p2/M, z22.s, z10.s\n"
+    "st1b { z24.s }, p0, [x22, x19]\n"
+    "mov z24.d, z1.d\n"
+    "st1b { z22.s }, p0, [x21, x19]\n"
+    "add z21.s, z21.s, z8.s\n"
+    "mov z22.d, z1.d\n"
+    "sdot z22.s, z31.b, z2.b\n"
+    "smax z21.s, p2/M, z21.s, z11.s\n"
+    "sdot z22.s, z29.b, z30.b\n"
+    "smin z21.s, p2/M, z21.s, z10.s\n"
+    "st1b { z21.s }, p0, [x20, x19]\n"
+    "mov z21.d, z1.d\n"
+    "incw x19\n"
+    "sdot z1.s, z31.b, z4.b\n"
+    "whilelt p0.s, x19, %x[n_channels]\n"
+    "sdot z22.s, z27.b, z26.b\n"
+    "ext z4.b, z4.b, z4.b, #0x1\n"
+    "ext z26.b, z26.b, z26.b, #0x1\n"
+    "sdot z1.s, z29.b, z2.b\n"
+    "ext z2.b, z2.b, z2.b, #0x1\n"
+    "sdot z24.s, z31.b, z4.b\n"
+    "mls z22.s, p2/M, z20.s, z9.s\n"
+    "sdot z1.s, z27.b, z30.b\n"
+    "ext z30.b, z30.b, z30.b, #0x1\n"
+    "sdot z21.s, z31.b, z2.b\n"
+    "ld1b { z31.b }, p2/Z, [%x[params], #-3, MUL VL]\n"
+    "sdot z24.s, z29.b, z2.b\n"
+    "sdot z6.s, z12.b, z2.b\n"
+    "ld1b { z2.b }, p2/Z, [SP, #3, MUL VL]\n"
+    ".inst 0x04b976d6  // sqrdmulh z22.s, z22.s, z25.s\n"
+    "sdot z21.s, z29.b, z30.b\n"
+    "ld1b { z29.b }, p2/Z, [%x[params], #-2, MUL VL]\n"
+    "sdot z24.s, z27.b, z30.b\n"
+    "sdot z6.s, z12.b, z30.b\n"
+    "ld1b { z30.b }, p2/Z, [SP, #5, MUL VL]\n"
+    "and z17.d, z22.d, z23.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "sdot z21.s, z27.b, z26.b\n"
+    "ld1b { z27.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+    "mov z19.d, z6.d\n"
+    "sdot z6.s, z12.b, z4.b\n"
+    "ld1b { z4.b }, p2/Z, [SP, #1, MUL VL]\n"
+    "sdot z19.s, z12.b, z26.b\n"
+    "ld1b { z26.b }, p2/Z, [SP, #7, MUL VL]\n"
+    "mls z1.s, p2/M, z7.s, z9.s\n"
+    "mov z7.s, #0x0\n"
+    "sqadd z22.s, z22.s, z17.s\n"
+    "sdot z7.s, z12.b, z3.b\n"
+    ".inst 0x04b97421  // sqrdmulh z1.s, z1.s, z25.s\n"
+    "mls z24.s, p2/M, z6.s, z9.s\n"
+    "mov z6.s, #0x0\n"
+    "sdot z7.s, z12.b, z0.b\n"
+    "and z16.d, z1.d, z23.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x04b97718  // sqrdmulh z24.s, z24.s, z25.s\n"
+    "mov z20.d, z7.d\n"
+    "sdot z7.s, z12.b, z5.b\n"
+    "sdot z20.s, z12.b, z28.b\n"
+    "mls z21.s, p2/M, z19.s, z9.s\n"
+    "and z18.d, z24.d, z23.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sqadd z1.s, z1.s, z16.s\n"
+    ".inst 0x04b976b5  // sqrdmulh z21.s, z21.s, z25.s\n"
+    "ld1w { z25.s }, p2/Z, [%x[params]]\n"
+    ".inst 0x44828af6  // srshl z22.s, p2/M, z22.s, z23.s\n"
+    "and z16.d, z21.d, z23.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z24.s, z24.s, z18.s\n"
+    "add z22.s, z22.s, z8.s\n"
+    ".inst 0x44828ae1  // srshl z1.s, p2/M, z1.s, z23.s\n"
+    "smax z22.s, p2/M, z22.s, z11.s\n"
+    ".inst 0x44828af8  // srshl z24.s, p2/M, z24.s, z23.s\n"
+    "add z1.s, z1.s, z8.s\n"
+    "sqadd z21.s, z21.s, z16.s\n"
+    "smin z22.s, p2/M, z22.s, z10.s\n"
+    "st1b { z22.s }, p0, [x21, x19]\n"
+    "add z24.s, z24.s, z8.s\n"
+    "smax z1.s, p2/M, z1.s, z11.s\n"
+    ".inst 0x44828af5  // srshl z21.s, p2/M, z21.s, z23.s\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "smax z24.s, p2/M, z24.s, z11.s\n"
+    "smin z1.s, p2/M, z1.s, z10.s\n"
+    "st1b { z1.s }, p0, [x23, x19]\n"
+    "add z21.s, z21.s, z8.s\n"
+    "smin z24.s, p2/M, z24.s, z10.s\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+    "smax z21.s, p2/M, z21.s, z11.s\n"
+    "st1b { z24.s }, p0, [x22, x19]\n"
+    "mov z24.d, z1.d\n"
+    "mov z22.d, z1.d\n"
+    "sdot z22.s, z31.b, z3.b\n"
+    "smin z21.s, p2/M, z21.s, z10.s\n"
+    "st1b { z21.s }, p0, [x20, x19]\n"
+    "mov z21.d, z1.d\n"
+    "incw x19\n"
+    "sdot z1.s, z31.b, z5.b\n"
+    "whilelt p0.s, x19, %x[n_channels]\n"
+    "sdot z22.s, z29.b, z0.b\n"
+    "ext z5.b, z5.b, z5.b, #0x1\n"
+    "sdot z1.s, z29.b, z3.b\n"
+    "sdot z22.s, z27.b, z28.b\n"
+    "ext z3.b, z3.b, z3.b, #0x1\n"
+    "ext z28.b, z28.b, z28.b, #0x1\n"
+    "sdot z24.s, z31.b, z5.b\n"
+    "sdot z1.s, z27.b, z0.b\n"
+    "ext z0.b, z0.b, z0.b, #0x1\n"
+    "sdot z21.s, z31.b, z3.b\n"
+    "ld1b { z31.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "sdot z24.s, z29.b, z3.b\n"
+    "sdot z6.s, z12.b, z3.b\n"
+    "mls z1.s, p2/M, z7.s, z9.s\n"
+    "sdot z21.s, z29.b, z0.b\n"
+    "ld1b { z29.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "sdot z24.s, z27.b, z0.b\n"
+    "sdot z6.s, z12.b, z0.b\n"
+    ".inst 0x04b97421  // sqrdmulh z1.s, z1.s, z25.s\n"
+    "sdot z21.s, z27.b, z28.b\n"
+    "ld1b { z27.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "mov z7.s, #0x0\n"
+    "mov z19.d, z6.d\n"
+    "sdot z6.s, z12.b, z5.b\n"
+    "sdot z19.s, z12.b, z28.b\n"
+    "and z16.d, z1.d, z23.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sdot z7.s, z12.b, z2.b\n"
+    "mls z24.s, p2/M, z6.s, z9.s\n"
+    "mov z6.s, #0x0\n"
+    "mls z22.s, p2/M, z20.s, z9.s\n"
+    "mls z21.s, p2/M, z19.s, z9.s\n"
+    ".inst 0x04b97718  // sqrdmulh z24.s, z24.s, z25.s\n"
+    "sqadd z1.s, z1.s, z16.s\n"
+    ".inst 0x04b976d6  // sqrdmulh z22.s, z22.s, z25.s\n"
+    ".inst 0x04b976b5  // sqrdmulh z21.s, z21.s, z25.s\n"
+    "ld1w { z25.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "and z18.d, z24.d, z23.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "and z17.d, z22.d, z23.d\n"
+    "and z16.d, z21.d, z23.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "sdot z7.s, z12.b, z30.b\n"
+    ".inst 0x44828ae1  // srshl z1.s, p2/M, z1.s, z23.s\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z24.s, z24.s, z18.s\n"
+    "add z1.s, z1.s, z8.s\n"
+    "mov z20.d, z7.d\n"
+    "sqadd z22.s, z22.s, z17.s\n"
+    "sqadd z21.s, z21.s, z16.s\n"
+    "sdot z7.s, z12.b, z4.b\n"
+    "sdot z20.s, z12.b, z26.b\n"
+    "smax z1.s, p2/M, z1.s, z11.s\n"
+    ".inst 0x44828af8  // srshl z24.s, p2/M, z24.s, z23.s\n"
+    ".inst 0x44828af6  // srshl z22.s, p2/M, z22.s, z23.s\n"
+    ".inst 0x44828af5  // srshl z21.s, p2/M, z21.s, z23.s\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "smin z1.s, p2/M, z1.s, z10.s\n"
+    "st1b { z1.s }, p0, [x23, x19]\n"
+    "add z24.s, z24.s, z8.s\n"
+    "add z22.s, z22.s, z8.s\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "addvl %x[params], %x[params], #8\n"
+    "add z21.s, z21.s, z8.s\n"
+    "smax z24.s, p2/M, z24.s, z11.s\n"
+    "smax z22.s, p2/M, z22.s, z11.s\n"
+    "smax z21.s, p2/M, z21.s, z11.s\n"
+    "smin z24.s, p2/M, z24.s, z10.s\n"
+    "st1b { z24.s }, p0, [x22, x19]\n"
+    "mov z24.d, z1.d\n"
+    "smin z22.s, p2/M, z22.s, z10.s\n"
+    "st1b { z22.s }, p0, [x21, x19]\n"
+    "mov z22.d, z1.d\n"
+    "smin z21.s, p2/M, z21.s, z10.s\n"
+    "st1b { z21.s }, p0, [x20, x19]\n"
+    "mov z21.d, z1.d\n"
+    "incw x19\n"
+    "sdot z1.s, z31.b, z4.b\n"
+    "whilelt p0.s, x19, %x[n_channels]\n"
+    "sdot z22.s, z31.b, z2.b\n"
+    "ext z4.b, z4.b, z4.b, #0x1\n"
+    "sdot z1.s, z29.b, z2.b\n"
+    "sdot z22.s, z29.b, z30.b\n"
+    "ext z2.b, z2.b, z2.b, #0x1\n"
+    "sdot z24.s, z31.b, z4.b\n"
+    "sdot z1.s, z27.b, z30.b\n"
+    "sdot z22.s, z27.b, z26.b\n"
+    "ext z30.b, z30.b, z30.b, #0x1\n"
+    "ext z26.b, z26.b, z26.b, #0x1\n"
+    "sdot z21.s, z31.b, z2.b\n"
+    "sdot z24.s, z29.b, z2.b\n"
+    "sdot z6.s, z12.b, z2.b\n"
+    "mls z1.s, p2/M, z7.s, z9.s\n"
+    "sdot z21.s, z29.b, z30.b\n"
+    "sdot z24.s, z27.b, z30.b\n"
+    "sdot z6.s, z12.b, z30.b\n"
+    ".inst 0x04b97421  // sqrdmulh z1.s, z1.s, z25.s\n"
+    "sdot z21.s, z27.b, z26.b\n"
+    "mls z22.s, p2/M, z20.s, z9.s\n"
+    "mov z19.d, z6.d\n"
+    "sdot z6.s, z12.b, z4.b\n"
+    "sdot z19.s, z12.b, z26.b\n"
+    "and z16.d, z1.d, z23.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x04b976d6  // sqrdmulh z22.s, z22.s, z25.s\n"
+    "mls z24.s, p2/M, z6.s, z9.s\n"
+    "mls z21.s, p2/M, z19.s, z9.s\n"
+    ".inst 0x04b97718  // sqrdmulh z24.s, z24.s, z25.s\n"
+    "and z17.d, z22.d, z23.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "sqadd z1.s, z1.s, z16.s\n"
+    ".inst 0x04b976b5  // sqrdmulh z21.s, z21.s, z25.s\n"
+    "and z18.d, z24.d, z23.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "and z16.d, z21.d, z23.d\n"
+    ".inst 0x44828ae1  // srshl z1.s, p2/M, z1.s, z23.s\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z22.s, z22.s, z17.s\n"
+    "add z1.s, z1.s, z8.s\n"
+    "sqadd z24.s, z24.s, z18.s\n"
+    "smax z1.s, p2/M, z1.s, z11.s\n"
+    ".inst 0x44828af6  // srshl z22.s, p2/M, z22.s, z23.s\n"
+    "sqadd z21.s, z21.s, z16.s\n"
+    ".inst 0x44828af8  // srshl z24.s, p2/M, z24.s, z23.s\n"
+    "add z22.s, z22.s, z8.s\n"
+    "smin z1.s, p2/M, z1.s, z10.s\n"
+    "st1b { z1.s }, p0, [x23, x19]\n"
+    "add z24.s, z24.s, z8.s\n"
+    "smax z22.s, p2/M, z22.s, z11.s\n"
+    ".inst 0x44828af5  // srshl z21.s, p2/M, z21.s, z23.s\n"
+    "smax z24.s, p2/M, z24.s, z11.s\n"
+    "smin z22.s, p2/M, z22.s, z10.s\n"
+    "st1b { z22.s }, p0, [x21, x19]\n"
+    "add z21.s, z21.s, z8.s\n"
+    "smin z24.s, p2/M, z24.s, z10.s\n"
+    "st1b { z24.s }, p0, [x22, x19]\n"
+    "smax z21.s, p2/M, z21.s, z11.s\n"
+    "smin z21.s, p2/M, z21.s, z10.s\n"
+    "st1b { z21.s }, p0, [x20, x19]\n"
+    "incw x19\n"
+    "whilelt p1.b, x19, %x[n_channels]\n"
+    "b.any 1b\n"
+    "addvl SP, SP, #8\n"
+    : [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
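
The epilogue above requantizes each 32-bit accumulator with the mls / sqrdmulh
/ srshl / add / smax / smin sequence. A hedged scalar sketch of that chain,
ignoring sqrdmulh saturation and the sign-correction term the assembly adds
via its and/asr/sqadd steps; 'mul' and 'shift' stand for the per-channel
values loaded into z25/z23, with 'shift' assumed non-positive:

    #include <algorithm>
    #include <cstdint>

    int8_t requantize(int32_t acc, int32_t input_sum, int32_t mul, int32_t shift,
                      const arm_gemm::Requantize32 &qp)
    {
        acc -= qp.b_offset * input_sum;                                    // mls with z9
        int32_t v = (int32_t)(((int64_t)acc * mul + (1LL << 30)) >> 31);   // ~sqrdmulh
        if (shift < 0)
            v = (int32_t)(((int64_t)v + (1LL << (-shift - 1))) >> -shift); // ~srshl
        v += qp.c_offset;                                                  // add z8
        v = std::max(qp.minval, std::min(v, qp.maxval));                   // smax/smin
        return (int8_t)v;
    }
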
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..7dd241a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+struct sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+  sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
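
A hedged scalar sketch of the widening arithmetic the generic.cpp below uses:
inputs and weights are widened to 16 bits with their zero-point offsets
subtracted (ssublb against z12/z18), then multiply-accumulated into 32-bit
lanes (smlalb/smlalt). Names here are illustrative only:

    #include <cstdint>

    int32_t accumulate_3x3(const int8_t input[9], const int8_t weight[9],
                           int32_t bias_value, const arm_gemm::Requantize32 &qp)
    {
        int32_t acc = bias_value;
        for (int k = 0; k < 9; k++)  // 3x3 kernel
        {
            const int16_t in = (int16_t)(input[k] - qp.a_offset);   // ssublb, z12
            const int16_t wt = (int16_t)(weight[k] - qp.b_offset);  // ssublb, z18
            acc += (int32_t)in * (int32_t)wt;                       // smlalb/smlalt
        }
        return acc;
    }
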
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..8bf5bad
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const int8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  int8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    unsigned long n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    int8_t *const *const outptrs;
+    const int8_t *inptrs[16];
+
+    Params(
+      unsigned long n_channels,
+      const int8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      int8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
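+      // Reorder the raw input-row pointers into the order in which the
+      // assembly consumes them (a hedged reading of the permutation below).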
+      inptrs[0] = inptrs_raw[5];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[3];
+      inptrs[3] = inptrs_raw[6];
+      inptrs[4] = inptrs_raw[9];
+      inptrs[5] = inptrs_raw[12];
+      inptrs[6] = inptrs_raw[15];
+      inptrs[7] = inptrs_raw[1];
+      inptrs[8] = inptrs_raw[2];
+      inptrs[9] = inptrs_raw[10];
+      inptrs[10] = inptrs_raw[4];
+      inptrs[11] = inptrs_raw[7];
+      inptrs[12] = inptrs_raw[8];
+      inptrs[13] = inptrs_raw[11];
+      inptrs[14] = inptrs_raw[13];
+      inptrs[15] = inptrs_raw[14];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "ptrue p4.b\n"
+    "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x15, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "mov x14, #0x0\n"
+    "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1rb { z12.b }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1rb { z18.b }, p4/Z, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1rw { z15.s }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1rw { z13.s }, p4/Z, [x20]\n"
+    "whilelt p3.h, x15, x17\n"
+    "ld1rw { z14.s }, p4/Z, [x19]\n"
+    "whilelt p2.s, x15, x17\n"
+    "ldp x10, x9, [x21, #0x0]\n"
+    "mov x19, x15\n"
+    "incw x19\n"
+    "ldp x28, x27, [x21, #0x10]\n"
+    "whilelt p1.s, x19, x17\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z17.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z11.s, z17.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z17.s, z17.s, z16.s\n"
+    "mov z9.d, z11.d\n"
+    "ld1sb { z0.h }, p4/Z, [x16]\n"
+    ".inst 0x45521000  // ssublb z0.h, z0.b, z18.b\n"
+    "mov z20.d, z17.d\n"
+    "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
+    "mov z24.d, z11.d\n"
+    "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
+    ".inst 0x45521021  // ssublb z1.h, z1.b, z18.b\n"
+    "mov z19.d, z17.d\n"
+    "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
+    "mov z26.d, z11.d\n"
+    "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
+    ".inst 0x45521042  // ssublb z2.h, z2.b, z18.b\n"
+    "mov z23.d, z17.d\n"
+    "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
+    ".inst 0x45521063  // ssublb z3.h, z3.b, z18.b\n"
+    "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
+    "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
+    ".inst 0x45521084  // ssublb z4.h, z4.b, z18.b\n"
+    "inch x16, ALL, MUL #8\n"
+    "ld1sb { z8.h }, p4/Z, [x16]\n"
+    "ldp x23, x22, [x12, #0x0]\n"
+    ".inst 0x455210a5  // ssublb z5.h, z5.b, z18.b\n"
+    ".inst 0x455210c6  // ssublb z6.h, z6.b, z18.b\n"
+    "ldp x21, x20, [x12, #0x10]\n"
+    ".inst 0x455210e7  // ssublb z7.h, z7.b, z18.b\n"
+    ".inst 0x45521108  // ssublb z8.h, z8.b, z18.b\n"
+    "ldr x19, [x12, #0x20]\n"
+    "ld1sb { z31.h }, p3/Z, [x23, x15]\n"
+    ".inst 0x454c13ff  // ssublb z31.h, z31.b, z12.b\n"
+    "ld1sb { z30.h }, p3/Z, [x22, x15]\n"
+    "ld1sb { z29.h }, p3/Z, [x21, x15]\n"
+    ".inst 0x454c13de  // ssublb z30.h, z30.b, z12.b\n"
+    "ld1sb { z28.h }, p3/Z, [x20, x15]\n"
+    "ld1sb { z27.h }, p3/Z, [x19, x15]\n"
+    ".inst 0x454c13bd  // ssublb z29.h, z29.b, z12.b\n"
+    ".inst 0x454c139c  // ssublb z28.h, z28.b, z12.b\n"
+    ".inst 0x454c137b  // ssublb z27.h, z27.b, z12.b\n"
+    "1:"  // Loop
+    ".inst 0x448443eb  // smlalb z11.s, p4/M, z31.h, z4.h\n"
+    "ldr x21, [x12, #0x28]\n"
+    "whilelt p0.h, x14, x17\n"
+    ".inst 0x448447f1  // smlalt z17.s, p4/M, z31.h, z4.h\n"
+    "ldr x20, [x12, #0x30]\n"
+    "inch x16\n"
+    ".inst 0x448343e9  // smlalb z9.s, p4/M, z31.h, z3.h\n"
+    "ldr x26, [x12, #0x38]\n"
+    ".inst 0x448347f4  // smlalt z20.s, p4/M, z31.h, z3.h\n"
+    "ldr x25, [x12, #0x40]\n"
+    ".inst 0x448143f8  // smlalb z24.s, p4/M, z31.h, z1.h\n"
+    "ldr x19, [x12, #0x48]\n"
+    ".inst 0x448147f3  // smlalt z19.s, p4/M, z31.h, z1.h\n"
+    "ldr x24, [x12, #0x50]\n"
+    ".inst 0x448043fa  // smlalb z26.s, p4/M, z31.h, z0.h\n"
+    "ldr x23, [x12, #0x58]\n"
+    ".inst 0x448047f7  // smlalt z23.s, p4/M, z31.h, z0.h\n"
+    "ld1sb { z31.h }, p3/Z, [x21, x15]\n"
+    ".inst 0x454c13ff  // ssublb z31.h, z31.b, z12.b\n"
+    ".inst 0x448043cb  // smlalb z11.s, p4/M, z30.h, z0.h\n"
+    "ldr x22, [x12, #0x60]\n"
+    ".inst 0x448047d1  // smlalt z17.s, p4/M, z30.h, z0.h\n"
+    "ld1sb { z30.h }, p3/Z, [x19, x15]\n"
+    ".inst 0x454c13de  // ssublb z30.h, z30.b, z12.b\n"
+    ".inst 0x448243a9  // smlalb z9.s, p4/M, z29.h, z2.h\n"
+    "ldr x21, [x12, #0x68]\n"
+    ".inst 0x448247b4  // smlalt z20.s, p4/M, z29.h, z2.h\n"
+    "ld1sb { z29.h }, p3/Z, [x20, x15]\n"
+    ".inst 0x454c13bd  // ssublb z29.h, z29.b, z12.b\n"
+    ".inst 0x4485438b  // smlalb z11.s, p4/M, z28.h, z5.h\n"
+    "ldr x20, [x12, #0x70]\n"
+    ".inst 0x44854791  // smlalt z17.s, p4/M, z28.h, z5.h\n"
+    "ldr x19, [x12, #0x78]\n"
+    ".inst 0x44844389  // smlalb z9.s, p4/M, z28.h, z4.h\n"
+    "ld1w { z25.s }, p2/Z, [x13]\n"
+    ".inst 0x44844794  // smlalt z20.s, p4/M, z28.h, z4.h\n"
+    "ld1w { z16.s }, p1/Z, [x13, #1, MUL VL]\n"
+    "addvl x13, x13, #2\n"
+    ".inst 0x44824398  // smlalb z24.s, p4/M, z28.h, z2.h\n"
+    ".inst 0x44824793  // smlalt z19.s, p4/M, z28.h, z2.h\n"
+    ".inst 0x4481439a  // smlalb z26.s, p4/M, z28.h, z1.h\n"
+    "uzp1 z10.s, z25.s, z16.s\n"
+    "uzp2 z22.s, z25.s, z16.s\n"
+    "ld1w { z25.s }, p2/Z, [x11]\n"
+    ".inst 0x44814797  // smlalt z23.s, p4/M, z28.h, z1.h\n"
+    "ld1sb { z28.h }, p3/Z, [x26, x15]\n"
+    ".inst 0x454c139c  // ssublb z28.h, z28.b, z12.b\n"
+    ".inst 0x448643f8  // smlalb z24.s, p4/M, z31.h, z6.h\n"
+    "ld1w { z16.s }, p1/Z, [x11, #1, MUL VL]\n"
+    ".inst 0x448647f3  // smlalt z19.s, p4/M, z31.h, z6.h\n"
+    "ld1sb { z31.h }, p3/Z, [x25, x15]\n"
+    "addvl x11, x11, #2\n"
+    ".inst 0x4487436b  // smlalb z11.s, p4/M, z27.h, z7.h\n"
+    ".inst 0x454c13ff  // ssublb z31.h, z31.b, z12.b\n"
+    "uzp1 z21.s, z25.s, z16.s\n"
+    "uzp2 z25.s, z25.s, z16.s\n"
+    ".inst 0x44874771  // smlalt z17.s, p4/M, z27.h, z7.h\n"
+    ".inst 0x44864369  // smlalb z9.s, p4/M, z27.h, z6.h\n"
+    ".inst 0x44864774  // smlalt z20.s, p4/M, z27.h, z6.h\n"
+    ".inst 0x44844378  // smlalb z24.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x44844773  // smlalt z19.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x4483437a  // smlalb z26.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x44834777  // smlalt z23.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x4481438b  // smlalb z11.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x44814791  // smlalt z17.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x448843ba  // smlalb z26.s, p4/M, z29.h, z8.h\n"
+    ".inst 0x448847b7  // smlalt z23.s, p4/M, z29.h, z8.h\n"
+    "ld1sb { z29.h }, p3/Z, [x24, x15]\n"
+    ".inst 0x454c13bd  // ssublb z29.h, z29.b, z12.b\n"
+    ".inst 0x44804389  // smlalb z9.s, p4/M, z28.h, z0.h\n"
+    ".inst 0x44804794  // smlalt z20.s, p4/M, z28.h, z0.h\n"
+    "ld1sb { z28.h }, p3/Z, [x23, x15]\n"
+    ".inst 0x454c139c  // ssublb z28.h, z28.b, z12.b\n"
+    ".inst 0x448243eb  // smlalb z11.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448247f1  // smlalt z17.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448143e9  // smlalb z9.s, p4/M, z31.h, z1.h\n"
+    ".inst 0x448147f4  // smlalt z20.s, p4/M, z31.h, z1.h\n"
+    "ld1sb { z31.h }, p3/Z, [x22, x15]\n"
+    ".inst 0x454c13ff  // ssublb z31.h, z31.b, z12.b\n"
+    ".inst 0x448843cb  // smlalb z11.s, p4/M, z30.h, z8.h\n"
+    ".inst 0x448847d1  // smlalt z17.s, p4/M, z30.h, z8.h\n"
+    ".inst 0x448743c9  // smlalb z9.s, p4/M, z30.h, z7.h\n"
+    ".inst 0x448747d4  // smlalt z20.s, p4/M, z30.h, z7.h\n"
+    ".inst 0x448543d8  // smlalb z24.s, p4/M, z30.h, z5.h\n"
+    ".inst 0x448547d3  // smlalt z19.s, p4/M, z30.h, z5.h\n"
+    ".inst 0x448443da  // smlalb z26.s, p4/M, z30.h, z4.h\n"
+    ".inst 0x448447d7  // smlalt z23.s, p4/M, z30.h, z4.h\n"
+    "ld1sb { z30.h }, p3/Z, [x21, x15]\n"
+    ".inst 0x454c13de  // ssublb z30.h, z30.b, z12.b\n"
+    ".inst 0x448343ab  // smlalb z11.s, p4/M, z29.h, z3.h\n"
+    ".inst 0x448347b1  // smlalt z17.s, p4/M, z29.h, z3.h\n"
+    ".inst 0x448043b8  // smlalb z24.s, p4/M, z29.h, z0.h\n"
+    ".inst 0x448047b3  // smlalt z19.s, p4/M, z29.h, z0.h\n"
+    "ld1sb { z29.h }, p3/Z, [x20, x15]\n"
+    ".inst 0x454c13bd  // ssublb z29.h, z29.b, z12.b\n"
+    ".inst 0x44854389  // smlalb z9.s, p4/M, z28.h, z5.h\n"
+    ".inst 0x44854794  // smlalt z20.s, p4/M, z28.h, z5.h\n"
+    ".inst 0x4482439a  // smlalb z26.s, p4/M, z28.h, z2.h\n"
+    ".inst 0x44824797  // smlalt z23.s, p4/M, z28.h, z2.h\n"
+    "ld1sb { z28.h }, p3/Z, [x19, x15]\n"
+    "inch x15\n"
+    ".inst 0x448643eb  // smlalb z11.s, p4/M, z31.h, z6.h\n"
+    "whilelt p2.s, x15, x17\n"
+    ".inst 0x448647f1  // smlalt z17.s, p4/M, z31.h, z6.h\n"
+    "mov x19, x15\n"
+    ".inst 0x448343f8  // smlalb z24.s, p4/M, z31.h, z3.h\n"
+    ".inst 0x454c139c  // ssublb z28.h, z28.b, z12.b\n"
+    ".inst 0x448347f3  // smlalt z19.s, p4/M, z31.h, z3.h\n"
+    "incw x19\n"
+    ".inst 0x448843c9  // smlalb z9.s, p4/M, z30.h, z8.h\n"
+    "whilelt p1.s, x19, x17\n"
+    ".inst 0x04aa756b  // sqrdmulh z11.s, z11.s, z10.s\n"
+    "whilelt p3.h, x15, x17\n"
+    ".inst 0x04b67631  // sqrdmulh z17.s, z17.s, z22.s\n"
+    ".inst 0x448847d4  // smlalt z20.s, p4/M, z30.h, z8.h\n"
+    ".inst 0x04aa7529  // sqrdmulh z9.s, z9.s, z10.s\n"
+    "and z16.d, z11.d, z21.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "and z1.d, z17.d, z25.d\n"
+    "and z27.d, z9.d, z21.d\n"
+    "asr z1.s, z1.s, #0x1f\n"
+    ".inst 0x04b67694  // sqrdmulh z20.s, z20.s, z22.s\n"
+    ".inst 0x448543da  // smlalb z26.s, p4/M, z30.h, z5.h\n"
+    "asr z27.s, z27.s, #0x1f\n"
+    ".inst 0x448547d7  // smlalt z23.s, p4/M, z30.h, z5.h\n"
+    "sqadd z11.s, z11.s, z16.s\n"
+    ".inst 0x448743b8  // smlalb z24.s, p4/M, z29.h, z7.h\n"
+    "and z16.d, z20.d, z25.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z17.s, z17.s, z1.s\n"
+    "sqadd z9.s, z9.s, z27.s\n"
+    ".inst 0x448747b3  // smlalt z19.s, p4/M, z29.h, z7.h\n"
+    ".inst 0x448643ba  // smlalb z26.s, p4/M, z29.h, z6.h\n"
+    ".inst 0x448647b7  // smlalt z23.s, p4/M, z29.h, z6.h\n"
+    ".inst 0x44884398  // smlalb z24.s, p4/M, z28.h, z8.h\n"
+    "sqadd z20.s, z20.s, z16.s\n"
+    ".inst 0x44884793  // smlalt z19.s, p4/M, z28.h, z8.h\n"
+    ".inst 0x4487439a  // smlalb z26.s, p4/M, z28.h, z7.h\n"
+    ".inst 0x04aa7718  // sqrdmulh z24.s, z24.s, z10.s\n"
+    ".inst 0x44874797  // smlalt z23.s, p4/M, z28.h, z7.h\n"
+    ".inst 0x04b67673  // sqrdmulh z19.s, z19.s, z22.s\n"
+    ".inst 0x04aa775a  // sqrdmulh z26.s, z26.s, z10.s\n"
+    "and z16.d, z24.d, z21.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "and z7.d, z19.d, z25.d\n"
+    "and z3.d, z26.d, z21.d\n"
+    "asr z7.s, z7.s, #0x1f\n"
+    ".inst 0x04b676f7  // sqrdmulh z23.s, z23.s, z22.s\n"
+    ".inst 0x448292ab  // srshl z11.s, p4/M, z11.s, z21.s\n"
+    "asr z3.s, z3.s, #0x1f\n"
+    ".inst 0x44829331  // srshl z17.s, p4/M, z17.s, z25.s\n"
+    "sqadd z24.s, z24.s, z16.s\n"
+    ".inst 0x448292a9  // srshl z9.s, p4/M, z9.s, z21.s\n"
+    "add z11.s, z11.s, z15.s\n"
+    "add z17.s, z17.s, z15.s\n"
+    "sqadd z19.s, z19.s, z7.s\n"
+    "add z9.s, z9.s, z15.s\n"
+    "sqadd z26.s, z26.s, z3.s\n"
+    "and z16.d, z23.d, z25.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "smin z11.s, p4/M, z11.s, z14.s\n"
+    "smin z17.s, p4/M, z17.s, z14.s\n"
+    "smin z9.s, p4/M, z9.s, z14.s\n"
+    ".inst 0x44829334  // srshl z20.s, p4/M, z20.s, z25.s\n"
+    ".inst 0x448292b8  // srshl z24.s, p4/M, z24.s, z21.s\n"
+    "smax z11.s, p4/M, z11.s, z13.s\n"
+    "sqadd z23.s, z23.s, z16.s\n"
+    "add z20.s, z20.s, z15.s\n"
+    "add z24.s, z24.s, z15.s\n"
+    "smax z17.s, p4/M, z17.s, z13.s\n"
+    "smax z9.s, p4/M, z9.s, z13.s\n"
+    "smin z20.s, p4/M, z20.s, z14.s\n"
+    "smin z24.s, p4/M, z24.s, z14.s\n"
+    "trn1 z11.h, z11.h, z17.h\n"
+    "st1b { z11.h }, p0, [x10, x14]\n"
+    "smax z20.s, p4/M, z20.s, z13.s\n"
+    ".inst 0x44829333  // srshl z19.s, p4/M, z19.s, z25.s\n"
+    "smax z24.s, p4/M, z24.s, z13.s\n"
+    ".inst 0x448292ba  // srshl z26.s, p4/M, z26.s, z21.s\n"
+    ".inst 0x44829337  // srshl z23.s, p4/M, z23.s, z25.s\n"
+    "trn1 z9.h, z9.h, z20.h\n"
+    "st1b { z9.h }, p0, [x9, x14]\n"
+    "add z19.s, z19.s, z15.s\n"
+    "add z26.s, z26.s, z15.s\n"
+    "add z23.s, z23.s, z15.s\n"
+    "smin z19.s, p4/M, z19.s, z14.s\n"
+    "smin z26.s, p4/M, z26.s, z14.s\n"
+    "smin z23.s, p4/M, z23.s, z14.s\n"
+    "smax z19.s, p4/M, z19.s, z13.s\n"
+    "smax z26.s, p4/M, z26.s, z13.s\n"
+    "smax z23.s, p4/M, z23.s, z13.s\n"
+    "trn1 z24.h, z24.h, z19.h\n"
+    "st1b { z24.h }, p0, [x28, x14]\n"
+    "trn1 z26.h, z26.h, z23.h\n"
+    "st1b { z26.h }, p0, [x27, x14]\n"
+    "inch x14\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z17.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z11.s, z17.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z17.s, z17.s, z16.s\n"
+    "mov z9.d, z11.d\n"
+    "ld1sb { z0.h }, p4/Z, [x16]\n"
+    ".inst 0x45521000  // ssublb z0.h, z0.b, z18.b\n"
+    "mov z20.d, z17.d\n"
+    "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
+    "mov z24.d, z11.d\n"
+    "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
+    ".inst 0x45521021  // ssublb z1.h, z1.b, z18.b\n"
+    "mov z19.d, z17.d\n"
+    "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
+    "mov z26.d, z11.d\n"
+    "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
+    ".inst 0x45521042  // ssublb z2.h, z2.b, z18.b\n"
+    "mov z23.d, z17.d\n"
+    "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
+    ".inst 0x45521063  // ssublb z3.h, z3.b, z18.b\n"
+    "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
+    "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
+    ".inst 0x45521084  // ssublb z4.h, z4.b, z18.b\n"
+    "inch x16, ALL, MUL #8\n"
+    "ld1sb { z8.h }, p4/Z, [x16]\n"
+    "ldp x23, x22, [x12, #0x0]\n"
+    ".inst 0x455210a5  // ssublb z5.h, z5.b, z18.b\n"
+    ".inst 0x455210c6  // ssublb z6.h, z6.b, z18.b\n"
+    "ldp x21, x20, [x12, #0x10]\n"
+    ".inst 0x455210e7  // ssublb z7.h, z7.b, z18.b\n"
+    ".inst 0x45521108  // ssublb z8.h, z8.b, z18.b\n"
+    "ldr x19, [x12, #0x20]\n"
+    "ld1sb { z31.h }, p3/Z, [x23, x15]\n"
+    ".inst 0x454c13ff  // ssublb z31.h, z31.b, z12.b\n"
+    "ld1sb { z30.h }, p3/Z, [x22, x15]\n"
+    "ld1sb { z29.h }, p3/Z, [x21, x15]\n"
+    ".inst 0x454c13de  // ssublb z30.h, z30.b, z12.b\n"
+    "ld1sb { z28.h }, p3/Z, [x20, x15]\n"
+    "ld1sb { z27.h }, p3/Z, [x19, x15]\n"
+    ".inst 0x454c13bd  // ssublb z29.h, z29.b, z12.b\n"
+    ".inst 0x454c139c  // ssublb z28.h, z28.b, z12.b\n"
+    ".inst 0x454c137b  // ssublb z27.h, z27.b, z12.b\n"
+    "b.any 1b\n"
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..89507ef
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+struct sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
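+  // The input tile size follows from (output - 1) * stride + kernel,
+  // i.e. (2 - 1) * 2 + 3 = 5 rows and columns for this tile.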
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+  sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..b773ca1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const int8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  int8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    int8_t *const *const outptrs;
+    const int8_t *inptrs[25];
+
+    Params(
+      long unsigned int n_channels,
+      const int8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      int8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
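+      // The raw pointers describe the 5x5 input tile in row-major order;
+      // they are permuted here into the order in which the assembly
+      // below consumes them.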
+      inptrs[0] = inptrs_raw[12];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[1];
+      inptrs[3] = inptrs_raw[3];
+      inptrs[4] = inptrs_raw[4];
+      inptrs[5] = inptrs_raw[5];
+      inptrs[6] = inptrs_raw[6];
+      inptrs[7] = inptrs_raw[2];
+      inptrs[8] = inptrs_raw[8];
+      inptrs[9] = inptrs_raw[9];
+      inptrs[10] = inptrs_raw[7];
+      inptrs[11] = inptrs_raw[15];
+      inptrs[12] = inptrs_raw[10];
+      inptrs[13] = inptrs_raw[16];
+      inptrs[14] = inptrs_raw[11];
+      inptrs[15] = inptrs_raw[18];
+      inptrs[16] = inptrs_raw[13];
+      inptrs[17] = inptrs_raw[19];
+      inptrs[18] = inptrs_raw[20];
+      inptrs[19] = inptrs_raw[14];
+      inptrs[20] = inptrs_raw[21];
+      inptrs[21] = inptrs_raw[17];
+      inptrs[22] = inptrs_raw[23];
+      inptrs[23] = inptrs_raw[22];
+      inptrs[24] = inptrs_raw[24];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x5, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "ptrue p4.b\n"
+    "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x7, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "mov x8, #0x0\n"
+    "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "add x16, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1rb { z19.b }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1rb { z12.b }, p4/Z, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1rw { z14.s }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1rw { z20.s }, p4/Z, [x20]\n"
+    "whilelt p3.h, x7, x5\n"
+    "ld1rw { z15.s }, p4/Z, [x19]\n"
+    "whilelt p2.s, x7, x5\n"
+    "ldp x14, x13, [x21, #0x0]\n"
+    "mov x19, x7\n"
+    "incw x19\n"
+    "ldp x12, x11, [x21, #0x10]\n"
+    "whilelt p1.s, x19, x5\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z18.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z13.s, z18.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z16.s, z18.s, z16.s\n"
+    "mov z11.d, z13.d\n"
+    "ld1sb { z0.h }, p4/Z, [x6]\n"
+    ".inst 0x454c1000  // ssublb z0.h, z0.b, z12.b\n"
+    "mov z9.d, z16.d\n"
+    "ld1sb { z1.h }, p4/Z, [x6, #1, MUL VL]\n"
+    "mov z18.d, z13.d\n"
+    "ld1sb { z2.h }, p4/Z, [x6, #2, MUL VL]\n"
+    ".inst 0x454c1021  // ssublb z1.h, z1.b, z12.b\n"
+    "mov z10.d, z16.d\n"
+    "ld1sb { z3.h }, p4/Z, [x6, #3, MUL VL]\n"
+    "mov z22.d, z13.d\n"
+    "ld1sb { z4.h }, p4/Z, [x6, #4, MUL VL]\n"
+    ".inst 0x454c1042  // ssublb z2.h, z2.b, z12.b\n"
+    "mov z23.d, z16.d\n"
+    "ld1sb { z5.h }, p4/Z, [x6, #5, MUL VL]\n"
+    ".inst 0x454c1063  // ssublb z3.h, z3.b, z12.b\n"
+    "ld1sb { z6.h }, p4/Z, [x6, #6, MUL VL]\n"
+    "ld1sb { z7.h }, p4/Z, [x6, #7, MUL VL]\n"
+    ".inst 0x454c1084  // ssublb z4.h, z4.b, z12.b\n"
+    "inch x6, ALL, MUL #8\n"
+    "ld1sb { z8.h }, p4/Z, [x6]\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    ".inst 0x454c10a5  // ssublb z5.h, z5.b, z12.b\n"
+    ".inst 0x454c10c6  // ssublb z6.h, z6.b, z12.b\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    ".inst 0x454c10e7  // ssublb z7.h, z7.b, z12.b\n"
+    ".inst 0x454c1108  // ssublb z8.h, z8.b, z12.b\n"
+    "ldp x22, x21, [x16, #0x20]\n"
+    "ldp x20, x19, [x16, #0x30]\n"
+    "ld1sb { z31.h }, p3/Z, [x26, x7]\n"
+    ".inst 0x455313ff  // ssublb z31.h, z31.b, z19.b\n"
+    "ld1sb { z30.h }, p3/Z, [x25, x7]\n"
+    "ld1sb { z29.h }, p3/Z, [x24, x7]\n"
+    ".inst 0x455313de  // ssublb z30.h, z30.b, z19.b\n"
+    "ld1sb { z28.h }, p3/Z, [x23, x7]\n"
+    "ld1sb { z27.h }, p3/Z, [x22, x7]\n"
+    ".inst 0x455313bd  // ssublb z29.h, z29.b, z19.b\n"
+    "ld1sb { z26.h }, p3/Z, [x21, x7]\n"
+    ".inst 0x4553139c  // ssublb z28.h, z28.b, z19.b\n"
+    "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
+    "ld1sb { z24.h }, p3/Z, [x19, x7]\n"
+    ".inst 0x4553137b  // ssublb z27.h, z27.b, z19.b\n"
+    ".inst 0x4553135a  // ssublb z26.h, z26.b, z19.b\n"
+    ".inst 0x45531339  // ssublb z25.h, z25.b, z19.b\n"
+    ".inst 0x45531318  // ssublb z24.h, z24.b, z19.b\n"
+    "1:"  // Loop
+    ".inst 0x448843ed  // smlalb z13.s, p4/M, z31.h, z8.h\n"
+    "ldr x23, [x16, #0x40]\n"
+    "whilelt p0.h, x8, x5\n"
+    ".inst 0x448847f0  // smlalt z16.s, p4/M, z31.h, z8.h\n"
+    "ldr x22, [x16, #0x48]\n"
+    "inch x6\n"
+    ".inst 0x448643eb  // smlalb z11.s, p4/M, z31.h, z6.h\n"
+    "ldr x21, [x16, #0x50]\n"
+    ".inst 0x448647e9  // smlalt z9.s, p4/M, z31.h, z6.h\n"
+    "ldr x20, [x16, #0x58]\n"
+    ".inst 0x448243f2  // smlalb z18.s, p4/M, z31.h, z2.h\n"
+    "ldr x19, [x16, #0x60]\n"
+    ".inst 0x448247ea  // smlalt z10.s, p4/M, z31.h, z2.h\n"
+    "ldr x10, [x16, #0x68]\n"
+    ".inst 0x448043f6  // smlalb z22.s, p4/M, z31.h, z0.h\n"
+    "ldr x9, [x16, #0x70]\n"
+    ".inst 0x448047f7  // smlalt z23.s, p4/M, z31.h, z0.h\n"
+    "ldr x28, [x16, #0x78]\n"
+    ".inst 0x448043cd  // smlalb z13.s, p4/M, z30.h, z0.h\n"
+    "ldr x27, [x16, #0x80]\n"
+    ".inst 0x448047d0  // smlalt z16.s, p4/M, z30.h, z0.h\n"
+    "ldr x26, [x16, #0x88]\n"
+    ".inst 0x4481438b  // smlalb z11.s, p4/M, z28.h, z1.h\n"
+    "ldr x25, [x16, #0x90]\n"
+    ".inst 0x44814789  // smlalt z9.s, p4/M, z28.h, z1.h\n"
+    "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+    ".inst 0x4553139c  // ssublb z28.h, z28.b, z19.b\n"
+    ".inst 0x448143ad  // smlalb z13.s, p4/M, z29.h, z1.h\n"
+    "ldr x24, [x16, #0x98]\n"
+    ".inst 0x448147b0  // smlalt z16.s, p4/M, z29.h, z1.h\n"
+    "ld1sb { z29.h }, p3/Z, [x23, x7]\n"
+    ".inst 0x455313bd  // ssublb z29.h, z29.b, z19.b\n"
+    ".inst 0x4482436b  // smlalb z11.s, p4/M, z27.h, z2.h\n"
+    "ldr x23, [x16, #0xa0]\n"
+    ".inst 0x44824769  // smlalt z9.s, p4/M, z27.h, z2.h\n"
+    "ld1sb { z27.h }, p3/Z, [x21, x7]\n"
+    ".inst 0x4553137b  // ssublb z27.h, z27.b, z19.b\n"
+    ".inst 0x4483434d  // smlalb z13.s, p4/M, z26.h, z3.h\n"
+    "ldr x22, [x16, #0xa8]\n"
+    ".inst 0x44834750  // smlalt z16.s, p4/M, z26.h, z3.h\n"
+    "ld1sb { z26.h }, p3/Z, [x20, x7]\n"
+    ".inst 0x4553135a  // ssublb z26.h, z26.b, z19.b\n"
+    ".inst 0x4484432d  // smlalb z13.s, p4/M, z25.h, z4.h\n"
+    "ldr x21, [x16, #0xb0]\n"
+    ".inst 0x44844730  // smlalt z16.s, p4/M, z25.h, z4.h\n"
+    "ld1sb { z25.h }, p3/Z, [x19, x7]\n"
+    ".inst 0x45531339  // ssublb z25.h, z25.b, z19.b\n"
+    ".inst 0x4482430d  // smlalb z13.s, p4/M, z24.h, z2.h\n"
+    "ldr x20, [x16, #0xb8]\n"
+    ".inst 0x44824710  // smlalt z16.s, p4/M, z24.h, z2.h\n"
+    "ldr x19, [x16, #0xc0]\n"
+    ".inst 0x4480430b  // smlalb z11.s, p4/M, z24.h, z0.h\n"
+    "ld1w { z21.s }, p2/Z, [x17]\n"
+    ".inst 0x44804709  // smlalt z9.s, p4/M, z24.h, z0.h\n"
+    "ld1sb { z24.h }, p3/Z, [x9, x7]\n"
+    ".inst 0x45531318  // ssublb z24.h, z24.b, z19.b\n"
+    ".inst 0x448443ab  // smlalb z11.s, p4/M, z29.h, z4.h\n"
+    "ld1w { z17.s }, p1/Z, [x17, #1, MUL VL]\n"
+    ".inst 0x448447a9  // smlalt z9.s, p4/M, z29.h, z4.h\n"
+    "ld1sb { z29.h }, p3/Z, [x10, x7]\n"
+    "addvl x17, x17, #2\n"
+    ".inst 0x4485436d  // smlalb z13.s, p4/M, z27.h, z5.h\n"
+    ".inst 0x455313bd  // ssublb z29.h, z29.b, z19.b\n"
+    "uzp1 z30.s, z21.s, z17.s\n"
+    "uzp2 z31.s, z21.s, z17.s\n"
+    "ld1w { z21.s }, p2/Z, [x15]\n"
+    ".inst 0x4485438b  // smlalb z11.s, p4/M, z28.h, z5.h\n"
+    "ld1w { z17.s }, p1/Z, [x15, #1, MUL VL]\n"
+    "addvl x15, x15, #2\n"
+    ".inst 0x44854789  // smlalt z9.s, p4/M, z28.h, z5.h\n"
+    "ld1sb { z28.h }, p3/Z, [x27, x7]\n"
+    ".inst 0x4553139c  // ssublb z28.h, z28.b, z19.b\n"
+    ".inst 0x44854770  // smlalt z16.s, p4/M, z27.h, z5.h\n"
+    ".inst 0x4483436b  // smlalb z11.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x44834769  // smlalt z9.s, p4/M, z27.h, z3.h\n"
+    "ld1sb { z27.h }, p3/Z, [x28, x7]\n"
+    ".inst 0x4553137b  // ssublb z27.h, z27.b, z19.b\n"
+    ".inst 0x44834352  // smlalb z18.s, p4/M, z26.h, z3.h\n"
+    ".inst 0x4483474a  // smlalt z10.s, p4/M, z26.h, z3.h\n"
+    "ld1sb { z26.h }, p3/Z, [x26, x7]\n"
+    ".inst 0x4553135a  // ssublb z26.h, z26.b, z19.b\n"
+    ".inst 0x4486432d  // smlalb z13.s, p4/M, z25.h, z6.h\n"
+    ".inst 0x44864730  // smlalt z16.s, p4/M, z25.h, z6.h\n"
+    ".inst 0x44804332  // smlalb z18.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x4480472a  // smlalt z10.s, p4/M, z25.h, z0.h\n"
+    "ld1sb { z25.h }, p3/Z, [x25, x7]\n"
+    ".inst 0x45531339  // ssublb z25.h, z25.b, z19.b\n"
+    "uzp1 z0.s, z21.s, z17.s\n"
+    "uzp2 z21.s, z21.s, z17.s\n"
+    ".inst 0x448443b2  // smlalb z18.s, p4/M, z29.h, z4.h\n"
+    ".inst 0x448447aa  // smlalt z10.s, p4/M, z29.h, z4.h\n"
+    "ld1sb { z29.h }, p3/Z, [x24, x7]\n"
+    ".inst 0x455313bd  // ssublb z29.h, z29.b, z19.b\n"
+    ".inst 0x4487430d  // smlalb z13.s, p4/M, z24.h, z7.h\n"
+    ".inst 0x44874710  // smlalt z16.s, p4/M, z24.h, z7.h\n"
+    ".inst 0x44814312  // smlalb z18.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x4481470a  // smlalt z10.s, p4/M, z24.h, z1.h\n"
+    "ld1sb { z24.h }, p3/Z, [x22, x7]\n"
+    ".inst 0x45531318  // ssublb z24.h, z24.b, z19.b\n"
+    ".inst 0x04be75ad  // sqrdmulh z13.s, z13.s, z30.s\n"
+    ".inst 0x04bf7610  // sqrdmulh z16.s, z16.s, z31.s\n"
+    ".inst 0x44844376  // smlalb z22.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x44844777  // smlalt z23.s, p4/M, z27.h, z4.h\n"
+    "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
+    ".inst 0x4553137b  // ssublb z27.h, z27.b, z19.b\n"
+    "and z4.d, z13.d, z0.d\n"
+    "and z17.d, z16.d, z21.d\n"
+    "asr z4.s, z4.s, #0x1f\n"
+    ".inst 0x4487438b  // smlalb z11.s, p4/M, z28.h, z7.h\n"
+    ".inst 0x44874789  // smlalt z9.s, p4/M, z28.h, z7.h\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    ".inst 0x44814396  // smlalb z22.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x44814797  // smlalt z23.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x44864332  // smlalb z18.s, p4/M, z25.h, z6.h\n"
+    ".inst 0x4486472a  // smlalt z10.s, p4/M, z25.h, z6.h\n"
+    "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
+    ".inst 0x45531339  // ssublb z25.h, z25.b, z19.b\n"
+    "sqadd z13.s, z13.s, z4.s\n"
+    "sqadd z16.s, z16.s, z17.s\n"
+    ".inst 0x44854356  // smlalb z22.s, p4/M, z26.h, z5.h\n"
+    ".inst 0x44854757  // smlalt z23.s, p4/M, z26.h, z5.h\n"
+    "ld1sb { z26.h }, p3/Z, [x21, x7]\n"
+    ".inst 0x4553135a  // ssublb z26.h, z26.b, z19.b\n"
+    ".inst 0x448843ab  // smlalb z11.s, p4/M, z29.h, z8.h\n"
+    ".inst 0x448847a9  // smlalt z9.s, p4/M, z29.h, z8.h\n"
+    ".inst 0x448243b6  // smlalb z22.s, p4/M, z29.h, z2.h\n"
+    ".inst 0x448247b7  // smlalt z23.s, p4/M, z29.h, z2.h\n"
+    "ld1sb { z29.h }, p3/Z, [x19, x7]\n"
+    "inch x7\n"
+    ".inst 0x04be756b  // sqrdmulh z11.s, z11.s, z30.s\n"
+    "whilelt p2.s, x7, x5\n"
+    ".inst 0x04bf7529  // sqrdmulh z9.s, z9.s, z31.s\n"
+    "mov x19, x7\n"
+    ".inst 0x44874372  // smlalb z18.s, p4/M, z27.h, z7.h\n"
+    ".inst 0x455313bd  // ssublb z29.h, z29.b, z19.b\n"
+    ".inst 0x4487476a  // smlalt z10.s, p4/M, z27.h, z7.h\n"
+    "incw x19\n"
+    ".inst 0x44834316  // smlalb z22.s, p4/M, z24.h, z3.h\n"
+    "whilelt p1.s, x19, x5\n"
+    "and z1.d, z11.d, z0.d\n"
+    "whilelt p3.h, x7, x5\n"
+    "and z17.d, z9.d, z21.d\n"
+    "asr z1.s, z1.s, #0x1f\n"
+    ".inst 0x44854312  // smlalb z18.s, p4/M, z24.h, z5.h\n"
+    ".inst 0x4485470a  // smlalt z10.s, p4/M, z24.h, z5.h\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    ".inst 0x44834717  // smlalt z23.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x44874356  // smlalb z22.s, p4/M, z26.h, z7.h\n"
+    ".inst 0x4482900d  // srshl z13.s, p4/M, z13.s, z0.s\n"
+    ".inst 0x44884332  // smlalb z18.s, p4/M, z25.h, z8.h\n"
+    "sqadd z11.s, z11.s, z1.s\n"
+    "sqadd z9.s, z9.s, z17.s\n"
+    "add z13.s, z13.s, z14.s\n"
+    ".inst 0x04be7652  // sqrdmulh z18.s, z18.s, z30.s\n"
+    ".inst 0x44874757  // smlalt z23.s, p4/M, z26.h, z7.h\n"
+    ".inst 0x4488472a  // smlalt z10.s, p4/M, z25.h, z8.h\n"
+    ".inst 0x44864336  // smlalb z22.s, p4/M, z25.h, z6.h\n"
+    "and z17.d, z18.d, z0.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    ".inst 0x04bf754a  // sqrdmulh z10.s, z10.s, z31.s\n"
+    ".inst 0x44864737  // smlalt z23.s, p4/M, z25.h, z6.h\n"
+    ".inst 0x448843b6  // smlalb z22.s, p4/M, z29.h, z8.h\n"
+    "smin z13.s, p4/M, z13.s, z15.s\n"
+    ".inst 0x448292b0  // srshl z16.s, p4/M, z16.s, z21.s\n"
+    "and z1.d, z10.d, z21.d\n"
+    "asr z1.s, z1.s, #0x1f\n"
+    "add z16.s, z16.s, z14.s\n"
+    "sqadd z18.s, z18.s, z17.s\n"
+    ".inst 0x04be76d6  // sqrdmulh z22.s, z22.s, z30.s\n"
+    ".inst 0x448847b7  // smlalt z23.s, p4/M, z29.h, z8.h\n"
+    "smax z13.s, p4/M, z13.s, z20.s\n"
+    "smin z16.s, p4/M, z16.s, z15.s\n"
+    "sqadd z10.s, z10.s, z1.s\n"
+    "and z2.d, z22.d, z0.d\n"
+    "asr z2.s, z2.s, #0x1f\n"
+    ".inst 0x04bf76f7  // sqrdmulh z23.s, z23.s, z31.s\n"
+    "smax z16.s, p4/M, z16.s, z20.s\n"
+    ".inst 0x4482900b  // srshl z11.s, p4/M, z11.s, z0.s\n"
+    ".inst 0x448292a9  // srshl z9.s, p4/M, z9.s, z21.s\n"
+    ".inst 0x44829012  // srshl z18.s, p4/M, z18.s, z0.s\n"
+    "trn1 z13.h, z13.h, z16.h\n"
+    "st1b { z13.h }, p0, [x14, x8]\n"
+    "add z11.s, z11.s, z14.s\n"
+    "add z9.s, z9.s, z14.s\n"
+    "add z18.s, z18.s, z14.s\n"
+    "sqadd z22.s, z22.s, z2.s\n"
+    "and z16.d, z23.d, z21.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "smin z11.s, p4/M, z11.s, z15.s\n"
+    "smin z9.s, p4/M, z9.s, z15.s\n"
+    "smin z18.s, p4/M, z18.s, z15.s\n"
+    ".inst 0x448292aa  // srshl z10.s, p4/M, z10.s, z21.s\n"
+    ".inst 0x44829016  // srshl z22.s, p4/M, z22.s, z0.s\n"
+    "smax z11.s, p4/M, z11.s, z20.s\n"
+    "sqadd z23.s, z23.s, z16.s\n"
+    "add z10.s, z10.s, z14.s\n"
+    "add z22.s, z22.s, z14.s\n"
+    "smax z9.s, p4/M, z9.s, z20.s\n"
+    "smax z18.s, p4/M, z18.s, z20.s\n"
+    "smin z10.s, p4/M, z10.s, z15.s\n"
+    "smin z22.s, p4/M, z22.s, z15.s\n"
+    "trn1 z11.h, z11.h, z9.h\n"
+    "st1b { z11.h }, p0, [x13, x8]\n"
+    "smax z10.s, p4/M, z10.s, z20.s\n"
+    ".inst 0x448292b7  // srshl z23.s, p4/M, z23.s, z21.s\n"
+    "smax z22.s, p4/M, z22.s, z20.s\n"
+    "trn1 z18.h, z18.h, z10.h\n"
+    "st1b { z18.h }, p0, [x12, x8]\n"
+    "add z23.s, z23.s, z14.s\n"
+    "smin z23.s, p4/M, z23.s, z15.s\n"
+    "smax z23.s, p4/M, z23.s, z20.s\n"
+    "trn1 z22.h, z22.h, z23.h\n"
+    "st1b { z22.h }, p0, [x11, x8]\n"
+    "inch x8\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z18.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z13.s, z18.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z16.s, z18.s, z16.s\n"
+    "mov z11.d, z13.d\n"
+    "ld1sb { z0.h }, p4/Z, [x6]\n"
+    ".inst 0x454c1000  // ssublb z0.h, z0.b, z12.b\n"
+    "mov z9.d, z16.d\n"
+    "ld1sb { z1.h }, p4/Z, [x6, #1, MUL VL]\n"
+    "mov z18.d, z13.d\n"
+    "ld1sb { z2.h }, p4/Z, [x6, #2, MUL VL]\n"
+    ".inst 0x454c1021  // ssublb z1.h, z1.b, z12.b\n"
+    "mov z10.d, z16.d\n"
+    "ld1sb { z3.h }, p4/Z, [x6, #3, MUL VL]\n"
+    "mov z22.d, z13.d\n"
+    "ld1sb { z4.h }, p4/Z, [x6, #4, MUL VL]\n"
+    ".inst 0x454c1042  // ssublb z2.h, z2.b, z12.b\n"
+    "mov z23.d, z16.d\n"
+    "ld1sb { z5.h }, p4/Z, [x6, #5, MUL VL]\n"
+    ".inst 0x454c1063  // ssublb z3.h, z3.b, z12.b\n"
+    "ld1sb { z6.h }, p4/Z, [x6, #6, MUL VL]\n"
+    "ld1sb { z7.h }, p4/Z, [x6, #7, MUL VL]\n"
+    ".inst 0x454c1084  // ssublb z4.h, z4.b, z12.b\n"
+    "inch x6, ALL, MUL #8\n"
+    "ld1sb { z8.h }, p4/Z, [x6]\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    ".inst 0x454c10a5  // ssublb z5.h, z5.b, z12.b\n"
+    ".inst 0x454c10c6  // ssublb z6.h, z6.b, z12.b\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    ".inst 0x454c10e7  // ssublb z7.h, z7.b, z12.b\n"
+    ".inst 0x454c1108  // ssublb z8.h, z8.b, z12.b\n"
+    "ldp x22, x21, [x16, #0x20]\n"
+    "ldp x20, x19, [x16, #0x30]\n"
+    "ld1sb { z31.h }, p3/Z, [x26, x7]\n"
+    ".inst 0x455313ff  // ssublb z31.h, z31.b, z19.b\n"
+    "ld1sb { z30.h }, p3/Z, [x25, x7]\n"
+    "ld1sb { z29.h }, p3/Z, [x24, x7]\n"
+    ".inst 0x455313de  // ssublb z30.h, z30.b, z19.b\n"
+    "ld1sb { z28.h }, p3/Z, [x23, x7]\n"
+    "ld1sb { z27.h }, p3/Z, [x22, x7]\n"
+    ".inst 0x455313bd  // ssublb z29.h, z29.b, z19.b\n"
+    "ld1sb { z26.h }, p3/Z, [x21, x7]\n"
+    ".inst 0x4553139c  // ssublb z28.h, z28.b, z19.b\n"
+    "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
+    "ld1sb { z24.h }, p3/Z, [x19, x7]\n"
+    ".inst 0x4553137b  // ssublb z27.h, z27.b, z19.b\n"
+    ".inst 0x4553135a  // ssublb z26.h, z26.b, z19.b\n"
+    ".inst 0x45531339  // ssublb z25.h, z25.b, z19.b\n"
+    ".inst 0x45531318  // ssublb z24.h, z24.b, z19.b\n"
+    "b.any 1b\n"
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..54ac1c2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+struct sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
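+  // The input tile size follows from (output - 1) * stride + kernel,
+  // i.e. (2 - 1) * 1 + 5 = 6 rows and columns for this tile.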
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_5x5_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_5x5_mla::get_packed_size;
+
+  kern_type kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+  sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..c02bb58
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const int8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  int8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    int8_t *const *const outptrs;
+    const int8_t *inptrs[36];
+
+    Params(
+      long unsigned int n_channels,
+      const int8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      int8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
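+      // Only the first thirteen pointers are reordered; from index 13
+      // onwards the 6x6 input tile is taken in row-major order.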
+      inptrs[0] = inptrs_raw[0];
+      inptrs[1] = inptrs_raw[1];
+      inptrs[2] = inptrs_raw[6];
+      inptrs[3] = inptrs_raw[7];
+      inptrs[4] = inptrs_raw[2];
+      inptrs[5] = inptrs_raw[8];
+      inptrs[6] = inptrs_raw[3];
+      inptrs[7] = inptrs_raw[4];
+      inptrs[8] = inptrs_raw[11];
+      inptrs[9] = inptrs_raw[12];
+      inptrs[10] = inptrs_raw[9];
+      inptrs[11] = inptrs_raw[10];
+      inptrs[12] = inptrs_raw[5];
+      inptrs[13] = inptrs_raw[13];
+      inptrs[14] = inptrs_raw[14];
+      inptrs[15] = inptrs_raw[15];
+      inptrs[16] = inptrs_raw[16];
+      inptrs[17] = inptrs_raw[17];
+      inptrs[18] = inptrs_raw[18];
+      inptrs[19] = inptrs_raw[19];
+      inptrs[20] = inptrs_raw[20];
+      inptrs[21] = inptrs_raw[21];
+      inptrs[22] = inptrs_raw[22];
+      inptrs[23] = inptrs_raw[23];
+      inptrs[24] = inptrs_raw[24];
+      inptrs[25] = inptrs_raw[25];
+      inptrs[26] = inptrs_raw[26];
+      inptrs[27] = inptrs_raw[27];
+      inptrs[28] = inptrs_raw[28];
+      inptrs[29] = inptrs_raw[29];
+      inptrs[30] = inptrs_raw[30];
+      inptrs[31] = inptrs_raw[31];
+      inptrs[32] = inptrs_raw[32];
+      inptrs[33] = inptrs_raw[33];
+      inptrs[34] = inptrs_raw[34];
+      inptrs[35] = inptrs_raw[35];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "ptrue p4.b\n"
+    "ldr x1, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x2, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "mov x3, #0x0\n"
+    "ldr x4, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1rb { z17.b }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1rb { z13.b }, p4/Z, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1rw { z14.s }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1rw { z5.s }, p4/Z, [x20]\n"
+    "whilelt p3.h, x2, x0\n"
+    "ld1rw { z15.s }, p4/Z, [x19]\n"
+    "whilelt p2.s, x2, x0\n"
+    "ldp x7, x8, [x21, #0x0]\n"
+    "mov x19, x2\n"
+    "incw x19\n"
+    "ldp x17, x16, [x21, #0x10]\n"
+    "whilelt p1.s, x19, x0\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z19.s }, p2/Z, [x19]\n"
+    "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z11.s, z19.s, z6.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z16.s, z19.s, z6.s\n"
+    "mov z19.d, z11.d\n"
+    "ld1sb { z0.h }, p4/Z, [x1]\n"
+    ".inst 0x454d1000  // ssublb z0.h, z0.b, z13.b\n"
+    "mov z9.d, z16.d\n"
+    "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
+    "mov z7.d, z11.d\n"
+    "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
+    ".inst 0x454d1021  // ssublb z1.h, z1.b, z13.b\n"
+    "mov z6.d, z16.d\n"
+    "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
+    "mov z12.d, z11.d\n"
+    "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
+    ".inst 0x454d1042  // ssublb z2.h, z2.b, z13.b\n"
+    "mov z8.d, z16.d\n"
+    "ldp x28, x27, [x5, #0x0]\n"
+    ".inst 0x454d1063  // ssublb z3.h, z3.b, z13.b\n"
+    "ldp x26, x25, [x5, #0x10]\n"
+    ".inst 0x454d1084  // ssublb z4.h, z4.b, z13.b\n"
+    "ldp x24, x23, [x5, #0x20]\n"
+    "ldp x22, x21, [x5, #0x30]\n"
+    "ldp x20, x19, [x5, #0x40]\n"
+    "ld1sb { z31.h }, p3/Z, [x28, x2]\n"
+    ".inst 0x455113ff  // ssublb z31.h, z31.b, z17.b\n"
+    "ld1sb { z30.h }, p3/Z, [x27, x2]\n"
+    "ld1sb { z29.h }, p3/Z, [x26, x2]\n"
+    ".inst 0x455113de  // ssublb z30.h, z30.b, z17.b\n"
+    "ld1sb { z28.h }, p3/Z, [x25, x2]\n"
+    "ld1sb { z27.h }, p3/Z, [x24, x2]\n"
+    ".inst 0x455113bd  // ssublb z29.h, z29.b, z17.b\n"
+    "ld1sb { z23.h }, p3/Z, [x23, x2]\n"
+    ".inst 0x4551139c  // ssublb z28.h, z28.b, z17.b\n"
+    "ld1sb { z25.h }, p3/Z, [x22, x2]\n"
+    "ld1sb { z24.h }, p3/Z, [x21, x2]\n"
+    ".inst 0x4551137b  // ssublb z27.h, z27.b, z17.b\n"
+    "ld1sb { z26.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x455112f7  // ssublb z23.h, z23.b, z17.b\n"
+    "ld1sb { z22.h }, p3/Z, [x19, x2]\n"
+    ".inst 0x45511339  // ssublb z25.h, z25.b, z17.b\n"
+    ".inst 0x45511318  // ssublb z24.h, z24.b, z17.b\n"
+    ".inst 0x4551135a  // ssublb z26.h, z26.b, z17.b\n"
+    ".inst 0x455112d6  // ssublb z22.h, z22.b, z17.b\n"
+    "1:"  // Loop
+    ".inst 0x448043eb  // smlalb z11.s, p4/M, z31.h, z0.h\n"
+    "ldr x20, [x5, #0x50]\n"
+    "whilelt p0.h, x3, x0\n"
+    ".inst 0x448047f0  // smlalt z16.s, p4/M, z31.h, z0.h\n"
+    "ldr x19, [x5, #0x58]\n"
+    ".inst 0x448043d3  // smlalb z19.s, p4/M, z30.h, z0.h\n"
+    "ldr x25, [x5, #0x60]\n"
+    ".inst 0x448047c9  // smlalt z9.s, p4/M, z30.h, z0.h\n"
+    "ld1sb { z31.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x455113ff  // ssublb z31.h, z31.b, z17.b\n"
+    ".inst 0x448043a7  // smlalb z7.s, p4/M, z29.h, z0.h\n"
+    "ldr x24, [x5, #0x68]\n"
+    ".inst 0x448047a6  // smlalt z6.s, p4/M, z29.h, z0.h\n"
+    "ldr x23, [x5, #0x70]\n"
+    ".inst 0x4480438c  // smlalb z12.s, p4/M, z28.h, z0.h\n"
+    "ldr x22, [x5, #0x78]\n"
+    ".inst 0x44804788  // smlalt z8.s, p4/M, z28.h, z0.h\n"
+    "ld1sb { z0.h }, p4/Z, [x1, #5, MUL VL]\n"
+    ".inst 0x454d1000  // ssublb z0.h, z0.b, z13.b\n"
+    ".inst 0x448143cb  // smlalb z11.s, p4/M, z30.h, z1.h\n"
+    "ldr x15, [x5, #0x80]\n"
+    ".inst 0x448147d0  // smlalt z16.s, p4/M, z30.h, z1.h\n"
+    "ld1sb { z30.h }, p3/Z, [x19, x2]\n"
+    ".inst 0x455113de  // ssublb z30.h, z30.b, z17.b\n"
+    ".inst 0x44814373  // smlalb z19.s, p4/M, z27.h, z1.h\n"
+    "ldr x21, [x5, #0x88]\n"
+    ".inst 0x44814769  // smlalt z9.s, p4/M, z27.h, z1.h\n"
+    "ldr x20, [x5, #0x90]\n"
+    ".inst 0x44814387  // smlalb z7.s, p4/M, z28.h, z1.h\n"
+    "ldr x19, [x5, #0x98]\n"
+    ".inst 0x44814786  // smlalt z6.s, p4/M, z28.h, z1.h\n"
+    "ldr x14, [x5, #0xa0]\n"
+    ".inst 0x448142ec  // smlalb z12.s, p4/M, z23.h, z1.h\n"
+    "ldr x13, [x5, #0xa8]\n"
+    ".inst 0x448146e8  // smlalt z8.s, p4/M, z23.h, z1.h\n"
+    "ld1sb { z1.h }, p4/Z, [x1, #6, MUL VL]\n"
+    ".inst 0x454d1021  // ssublb z1.h, z1.b, z13.b\n"
+    ".inst 0x4482436b  // smlalb z11.s, p4/M, z27.h, z2.h\n"
+    "ldr x12, [x5, #0xb0]\n"
+    ".inst 0x44824770  // smlalt z16.s, p4/M, z27.h, z2.h\n"
+    "ld1sb { z27.h }, p3/Z, [x25, x2]\n"
+    ".inst 0x4551137b  // ssublb z27.h, z27.b, z17.b\n"
+    ".inst 0x44824333  // smlalb z19.s, p4/M, z25.h, z2.h\n"
+    "ldr x11, [x5, #0xb8]\n"
+    ".inst 0x44824729  // smlalt z9.s, p4/M, z25.h, z2.h\n"
+    "ldr x10, [x5, #0xc0]\n"
+    ".inst 0x448242e7  // smlalb z7.s, p4/M, z23.h, z2.h\n"
+    "ldr x9, [x5, #0xc8]\n"
+    ".inst 0x448246e6  // smlalt z6.s, p4/M, z23.h, z2.h\n"
+    "ldr x28, [x5, #0xd0]\n"
+    ".inst 0x448243ec  // smlalb z12.s, p4/M, z31.h, z2.h\n"
+    "ldr x27, [x5, #0xd8]\n"
+    ".inst 0x448247e8  // smlalt z8.s, p4/M, z31.h, z2.h\n"
+    "ld1sb { z2.h }, p4/Z, [x1, #7, MUL VL]\n"
+    "inch x1, ALL, MUL #8\n"
+    ".inst 0x4483432b  // smlalb z11.s, p4/M, z25.h, z3.h\n"
+    "ldr x26, [x5, #0xe0]\n"
+    ".inst 0x454d1042  // ssublb z2.h, z2.b, z13.b\n"
+    ".inst 0x44834730  // smlalt z16.s, p4/M, z25.h, z3.h\n"
+    "ld1sb { z25.h }, p3/Z, [x24, x2]\n"
+    ".inst 0x44834313  // smlalb z19.s, p4/M, z24.h, z3.h\n"
+    "ldr x25, [x5, #0xe8]\n"
+    ".inst 0x45511339  // ssublb z25.h, z25.b, z17.b\n"
+    ".inst 0x44834709  // smlalt z9.s, p4/M, z24.h, z3.h\n"
+    "ld1w { z18.s }, p2/Z, [x4]\n"
+    ".inst 0x448343e7  // smlalb z7.s, p4/M, z31.h, z3.h\n"
+    "ld1w { z20.s }, p1/Z, [x4, #1, MUL VL]\n"
+    "addvl x4, x4, #2\n"
+    ".inst 0x448347e6  // smlalt z6.s, p4/M, z31.h, z3.h\n"
+    ".inst 0x448343cc  // smlalb z12.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x448347c8  // smlalt z8.s, p4/M, z30.h, z3.h\n"
+    "ld1sb { z3.h }, p4/Z, [x1]\n"
+    ".inst 0x454d1063  // ssublb z3.h, z3.b, z13.b\n"
+    "uzp1 z21.s, z18.s, z20.s\n"
+    "uzp2 z10.s, z18.s, z20.s\n"
+    "ld1w { z18.s }, p2/Z, [x6]\n"
+    ".inst 0x4484430b  // smlalb z11.s, p4/M, z24.h, z4.h\n"
+    "ld1w { z20.s }, p1/Z, [x6, #1, MUL VL]\n"
+    "addvl x6, x6, #2\n"
+    ".inst 0x44844710  // smlalt z16.s, p4/M, z24.h, z4.h\n"
+    "ld1sb { z24.h }, p3/Z, [x23, x2]\n"
+    ".inst 0x45511318  // ssublb z24.h, z24.b, z17.b\n"
+    ".inst 0x44844373  // smlalb z19.s, p4/M, z27.h, z4.h\n"
+    "ldr x24, [x5, #0xf0]\n"
+    ".inst 0x44844769  // smlalt z9.s, p4/M, z27.h, z4.h\n"
+    "ld1sb { z27.h }, p3/Z, [x22, x2]\n"
+    ".inst 0x4551137b  // ssublb z27.h, z27.b, z17.b\n"
+    ".inst 0x448443c7  // smlalb z7.s, p4/M, z30.h, z4.h\n"
+    "ldr x23, [x5, #0xf8]\n"
+    ".inst 0x448447c6  // smlalt z6.s, p4/M, z30.h, z4.h\n"
+    ".inst 0x4484434c  // smlalb z12.s, p4/M, z26.h, z4.h\n"
+    ".inst 0x44844748  // smlalt z8.s, p4/M, z26.h, z4.h\n"
+    "ld1sb { z4.h }, p4/Z, [x1, #1, MUL VL]\n"
+    ".inst 0x454d1084  // ssublb z4.h, z4.b, z13.b\n"
+    ".inst 0x448043ab  // smlalb z11.s, p4/M, z29.h, z0.h\n"
+    ".inst 0x448047b0  // smlalt z16.s, p4/M, z29.h, z0.h\n"
+    "uzp1 z29.s, z18.s, z20.s\n"
+    "uzp2 z20.s, z18.s, z20.s\n"
+    ".inst 0x44804393  // smlalb z19.s, p4/M, z28.h, z0.h\n"
+    ".inst 0x44804789  // smlalt z9.s, p4/M, z28.h, z0.h\n"
+    ".inst 0x448042c7  // smlalb z7.s, p4/M, z22.h, z0.h\n"
+    ".inst 0x448046c6  // smlalt z6.s, p4/M, z22.h, z0.h\n"
+    ".inst 0x4480432c  // smlalb z12.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x44804728  // smlalt z8.s, p4/M, z25.h, z0.h\n"
+    "ld1sb { z0.h }, p4/Z, [x1, #2, MUL VL]\n"
+    ".inst 0x454d1000  // ssublb z0.h, z0.b, z13.b\n"
+    ".inst 0x4481438b  // smlalb z11.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x44814790  // smlalt z16.s, p4/M, z28.h, z1.h\n"
+    "ld1sb { z28.h }, p3/Z, [x21, x2]\n"
+    ".inst 0x4551139c  // ssublb z28.h, z28.b, z17.b\n"
+    ".inst 0x448142f3  // smlalb z19.s, p4/M, z23.h, z1.h\n"
+    "ldr x22, [x5, #0x100]\n"
+    ".inst 0x448146e9  // smlalt z9.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x44814327  // smlalb z7.s, p4/M, z25.h, z1.h\n"
+    ".inst 0x44814726  // smlalt z6.s, p4/M, z25.h, z1.h\n"
+    ".inst 0x4481430c  // smlalb z12.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x44814708  // smlalt z8.s, p4/M, z24.h, z1.h\n"
+    "ld1sb { z1.h }, p4/Z, [x1, #3, MUL VL]\n"
+    ".inst 0x454d1021  // ssublb z1.h, z1.b, z13.b\n"
+    ".inst 0x448242eb  // smlalb z11.s, p4/M, z23.h, z2.h\n"
+    ".inst 0x448246f0  // smlalt z16.s, p4/M, z23.h, z2.h\n"
+    "ld1sb { z23.h }, p3/Z, [x15, x2]\n"
+    ".inst 0x455112f7  // ssublb z23.h, z23.b, z17.b\n"
+    ".inst 0x448243f3  // smlalb z19.s, p4/M, z31.h, z2.h\n"
+    "ldr x21, [x5, #0x108]\n"
+    ".inst 0x448247e9  // smlalt z9.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x44824307  // smlalb z7.s, p4/M, z24.h, z2.h\n"
+    ".inst 0x44824706  // smlalt z6.s, p4/M, z24.h, z2.h\n"
+    ".inst 0x4482436c  // smlalb z12.s, p4/M, z27.h, z2.h\n"
+    ".inst 0x44824768  // smlalt z8.s, p4/M, z27.h, z2.h\n"
+    "ld1sb { z2.h }, p4/Z, [x1, #4, MUL VL]\n"
+    ".inst 0x454d1042  // ssublb z2.h, z2.b, z13.b\n"
+    ".inst 0x448343eb  // smlalb z11.s, p4/M, z31.h, z3.h\n"
+    ".inst 0x448347f0  // smlalt z16.s, p4/M, z31.h, z3.h\n"
+    "ld1sb { z31.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x455113ff  // ssublb z31.h, z31.b, z17.b\n"
+    ".inst 0x448343d3  // smlalb z19.s, p4/M, z30.h, z3.h\n"
+    "ldr x20, [x5, #0x110]\n"
+    ".inst 0x448347c9  // smlalt z9.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x44834367  // smlalb z7.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x44834766  // smlalt z6.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x448342ec  // smlalb z12.s, p4/M, z23.h, z3.h\n"
+    ".inst 0x448346e8  // smlalt z8.s, p4/M, z23.h, z3.h\n"
+    "ld1sb { z3.h }, p4/Z, [x1, #5, MUL VL]\n"
+    ".inst 0x454d1063  // ssublb z3.h, z3.b, z13.b\n"
+    ".inst 0x448443cb  // smlalb z11.s, p4/M, z30.h, z4.h\n"
+    ".inst 0x448447d0  // smlalt z16.s, p4/M, z30.h, z4.h\n"
+    "ld1sb { z30.h }, p3/Z, [x19, x2]\n"
+    ".inst 0x455113de  // ssublb z30.h, z30.b, z17.b\n"
+    ".inst 0x44844353  // smlalb z19.s, p4/M, z26.h, z4.h\n"
+    "ldr x19, [x5, #0x118]\n"
+    ".inst 0x44844749  // smlalt z9.s, p4/M, z26.h, z4.h\n"
+    "ld1sb { z26.h }, p3/Z, [x14, x2]\n"
+    ".inst 0x4551135a  // ssublb z26.h, z26.b, z17.b\n"
+    ".inst 0x448442e7  // smlalb z7.s, p4/M, z23.h, z4.h\n"
+    ".inst 0x448446e6  // smlalt z6.s, p4/M, z23.h, z4.h\n"
+    ".inst 0x4484438c  // smlalb z12.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844788  // smlalt z8.s, p4/M, z28.h, z4.h\n"
+    "ld1sb { z4.h }, p4/Z, [x1, #6, MUL VL]\n"
+    ".inst 0x454d1084  // ssublb z4.h, z4.b, z13.b\n"
+    ".inst 0x448042cb  // smlalb z11.s, p4/M, z22.h, z0.h\n"
+    ".inst 0x448046d0  // smlalt z16.s, p4/M, z22.h, z0.h\n"
+    "ld1sb { z22.h }, p3/Z, [x11, x2]\n"
+    ".inst 0x455112d6  // ssublb z22.h, z22.b, z17.b\n"
+    ".inst 0x44804333  // smlalb z19.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x44804729  // smlalt z9.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x448043e7  // smlalb z7.s, p4/M, z31.h, z0.h\n"
+    ".inst 0x448047e6  // smlalt z6.s, p4/M, z31.h, z0.h\n"
+    ".inst 0x448043cc  // smlalb z12.s, p4/M, z30.h, z0.h\n"
+    ".inst 0x448047c8  // smlalt z8.s, p4/M, z30.h, z0.h\n"
+    "ld1sb { z0.h }, p4/Z, [x1, #7, MUL VL]\n"
+    "inch x1, ALL, MUL #8\n"
+    ".inst 0x4481432b  // smlalb z11.s, p4/M, z25.h, z1.h\n"
+    ".inst 0x454d1000  // ssublb z0.h, z0.b, z13.b\n"
+    ".inst 0x44814730  // smlalt z16.s, p4/M, z25.h, z1.h\n"
+    "ld1sb { z25.h }, p3/Z, [x13, x2]\n"
+    ".inst 0x44814313  // smlalb z19.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x45511339  // ssublb z25.h, z25.b, z17.b\n"
+    ".inst 0x44814709  // smlalt z9.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x448143c7  // smlalb z7.s, p4/M, z30.h, z1.h\n"
+    ".inst 0x448147c6  // smlalt z6.s, p4/M, z30.h, z1.h\n"
+    ".inst 0x4481434c  // smlalb z12.s, p4/M, z26.h, z1.h\n"
+    ".inst 0x44814748  // smlalt z8.s, p4/M, z26.h, z1.h\n"
+    "ld1sb { z1.h }, p4/Z, [x1]\n"
+    ".inst 0x454d1021  // ssublb z1.h, z1.b, z13.b\n"
+    ".inst 0x4482430b  // smlalb z11.s, p4/M, z24.h, z2.h\n"
+    ".inst 0x44824710  // smlalt z16.s, p4/M, z24.h, z2.h\n"
+    "ld1sb { z24.h }, p3/Z, [x12, x2]\n"
+    ".inst 0x45511318  // ssublb z24.h, z24.b, z17.b\n"
+    ".inst 0x44824373  // smlalb z19.s, p4/M, z27.h, z2.h\n"
+    ".inst 0x44824769  // smlalt z9.s, p4/M, z27.h, z2.h\n"
+    ".inst 0x44824347  // smlalb z7.s, p4/M, z26.h, z2.h\n"
+    ".inst 0x44824746  // smlalt z6.s, p4/M, z26.h, z2.h\n"
+    ".inst 0x4482432c  // smlalb z12.s, p4/M, z25.h, z2.h\n"
+    ".inst 0x44824728  // smlalt z8.s, p4/M, z25.h, z2.h\n"
+    "ld1sb { z2.h }, p4/Z, [x1, #1, MUL VL]\n"
+    ".inst 0x454d1042  // ssublb z2.h, z2.b, z13.b\n"
+    ".inst 0x4483436b  // smlalb z11.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x44834770  // smlalt z16.s, p4/M, z27.h, z3.h\n"
+    "ld1sb { z27.h }, p3/Z, [x10, x2]\n"
+    ".inst 0x4551137b  // ssublb z27.h, z27.b, z17.b\n"
+    ".inst 0x448342f3  // smlalb z19.s, p4/M, z23.h, z3.h\n"
+    ".inst 0x448346e9  // smlalt z9.s, p4/M, z23.h, z3.h\n"
+    ".inst 0x44834327  // smlalb z7.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x44834726  // smlalt z6.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x4483430c  // smlalb z12.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x44834708  // smlalt z8.s, p4/M, z24.h, z3.h\n"
+    "ld1sb { z3.h }, p4/Z, [x1, #2, MUL VL]\n"
+    ".inst 0x454d1063  // ssublb z3.h, z3.b, z13.b\n"
+    ".inst 0x448442eb  // smlalb z11.s, p4/M, z23.h, z4.h\n"
+    ".inst 0x448446f0  // smlalt z16.s, p4/M, z23.h, z4.h\n"
+    "ld1sb { z23.h }, p3/Z, [x9, x2]\n"
+    ".inst 0x455112f7  // ssublb z23.h, z23.b, z17.b\n"
+    ".inst 0x44844393  // smlalb z19.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844789  // smlalt z9.s, p4/M, z28.h, z4.h\n"
+    "ld1sb { z28.h }, p3/Z, [x26, x2]\n"
+    ".inst 0x4551139c  // ssublb z28.h, z28.b, z17.b\n"
+    ".inst 0x44844307  // smlalb z7.s, p4/M, z24.h, z4.h\n"
+    ".inst 0x44844706  // smlalt z6.s, p4/M, z24.h, z4.h\n"
+    ".inst 0x448442cc  // smlalb z12.s, p4/M, z22.h, z4.h\n"
+    ".inst 0x448446c8  // smlalt z8.s, p4/M, z22.h, z4.h\n"
+    "ld1sb { z4.h }, p4/Z, [x1, #3, MUL VL]\n"
+    ".inst 0x454d1084  // ssublb z4.h, z4.b, z13.b\n"
+    ".inst 0x448043eb  // smlalb z11.s, p4/M, z31.h, z0.h\n"
+    ".inst 0x448047f0  // smlalt z16.s, p4/M, z31.h, z0.h\n"
+    "ld1sb { z31.h }, p3/Z, [x28, x2]\n"
+    ".inst 0x455113ff  // ssublb z31.h, z31.b, z17.b\n"
+    ".inst 0x448043d3  // smlalb z19.s, p4/M, z30.h, z0.h\n"
+    ".inst 0x448047c9  // smlalt z9.s, p4/M, z30.h, z0.h\n"
+    ".inst 0x44804367  // smlalb z7.s, p4/M, z27.h, z0.h\n"
+    ".inst 0x44804766  // smlalt z6.s, p4/M, z27.h, z0.h\n"
+    ".inst 0x448042ec  // smlalb z12.s, p4/M, z23.h, z0.h\n"
+    ".inst 0x448046e8  // smlalt z8.s, p4/M, z23.h, z0.h\n"
+    "ld1sb { z0.h }, p4/Z, [x1, #4, MUL VL]\n"
+    ".inst 0x454d1000  // ssublb z0.h, z0.b, z13.b\n"
+    ".inst 0x448143cb  // smlalb z11.s, p4/M, z30.h, z1.h\n"
+    ".inst 0x448147d0  // smlalt z16.s, p4/M, z30.h, z1.h\n"
+    "ld1sb { z30.h }, p3/Z, [x27, x2]\n"
+    ".inst 0x455113de  // ssublb z30.h, z30.b, z17.b\n"
+    ".inst 0x44814353  // smlalb z19.s, p4/M, z26.h, z1.h\n"
+    ".inst 0x44814749  // smlalt z9.s, p4/M, z26.h, z1.h\n"
+    ".inst 0x448142e7  // smlalb z7.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x448146e6  // smlalt z6.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x448143ec  // smlalb z12.s, p4/M, z31.h, z1.h\n"
+    ".inst 0x448147e8  // smlalt z8.s, p4/M, z31.h, z1.h\n"
+    "ld1sb { z1.h }, p4/Z, [x1, #5, MUL VL]\n"
+    ".inst 0x454d1021  // ssublb z1.h, z1.b, z13.b\n"
+    ".inst 0x4482434b  // smlalb z11.s, p4/M, z26.h, z2.h\n"
+    ".inst 0x44824750  // smlalt z16.s, p4/M, z26.h, z2.h\n"
+    "ld1sb { z26.h }, p3/Z, [x25, x2]\n"
+    ".inst 0x4551135a  // ssublb z26.h, z26.b, z17.b\n"
+    ".inst 0x44824333  // smlalb z19.s, p4/M, z25.h, z2.h\n"
+    ".inst 0x44824729  // smlalt z9.s, p4/M, z25.h, z2.h\n"
+    ".inst 0x448243e7  // smlalb z7.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448247e6  // smlalt z6.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448243cc  // smlalb z12.s, p4/M, z30.h, z2.h\n"
+    ".inst 0x448247c8  // smlalt z8.s, p4/M, z30.h, z2.h\n"
+    "ld1sb { z2.h }, p4/Z, [x1, #6, MUL VL]\n"
+    ".inst 0x454d1042  // ssublb z2.h, z2.b, z13.b\n"
+    ".inst 0x4483432b  // smlalb z11.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x44834730  // smlalt z16.s, p4/M, z25.h, z3.h\n"
+    "ld1sb { z25.h }, p3/Z, [x24, x2]\n"
+    ".inst 0x45511339  // ssublb z25.h, z25.b, z17.b\n"
+    ".inst 0x44834313  // smlalb z19.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x44834709  // smlalt z9.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x448343c7  // smlalb z7.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x448347c6  // smlalt z6.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x4483438c  // smlalb z12.s, p4/M, z28.h, z3.h\n"
+    ".inst 0x44834788  // smlalt z8.s, p4/M, z28.h, z3.h\n"
+    "ld1sb { z3.h }, p4/Z, [x1, #7, MUL VL]\n"
+    "inch x1, ALL, MUL #8\n"
+    ".inst 0x4484430b  // smlalb z11.s, p4/M, z24.h, z4.h\n"
+    ".inst 0x454d1063  // ssublb z3.h, z3.b, z13.b\n"
+    ".inst 0x44844710  // smlalt z16.s, p4/M, z24.h, z4.h\n"
+    "ld1sb { z24.h }, p3/Z, [x23, x2]\n"
+    ".inst 0x448442d3  // smlalb z19.s, p4/M, z22.h, z4.h\n"
+    ".inst 0x45511318  // ssublb z24.h, z24.b, z17.b\n"
+    ".inst 0x448446c9  // smlalt z9.s, p4/M, z22.h, z4.h\n"
+    ".inst 0x44844387  // smlalb z7.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844786  // smlalt z6.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x4484434c  // smlalb z12.s, p4/M, z26.h, z4.h\n"
+    ".inst 0x44844748  // smlalt z8.s, p4/M, z26.h, z4.h\n"
+    "ld1sb { z4.h }, p4/Z, [x1]\n"
+    "inch x1\n"
+    ".inst 0x4480436b  // smlalb z11.s, p4/M, z27.h, z0.h\n"
+    ".inst 0x454d1084  // ssublb z4.h, z4.b, z13.b\n"
+    ".inst 0x44804770  // smlalt z16.s, p4/M, z27.h, z0.h\n"
+    "ld1sb { z27.h }, p3/Z, [x22, x2]\n"
+    ".inst 0x448042f3  // smlalb z19.s, p4/M, z23.h, z0.h\n"
+    ".inst 0x4551137b  // ssublb z27.h, z27.b, z17.b\n"
+    ".inst 0x448046e9  // smlalt z9.s, p4/M, z23.h, z0.h\n"
+    ".inst 0x44804327  // smlalb z7.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x44804726  // smlalt z6.s, p4/M, z25.h, z0.h\n"
+    "ld1sb { z25.h }, p3/Z, [x21, x2]\n"
+    ".inst 0x45511339  // ssublb z25.h, z25.b, z17.b\n"
+    ".inst 0x4480430c  // smlalb z12.s, p4/M, z24.h, z0.h\n"
+    ".inst 0x44804708  // smlalt z8.s, p4/M, z24.h, z0.h\n"
+    ".inst 0x448142eb  // smlalb z11.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x448146f0  // smlalt z16.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x448143f3  // smlalb z19.s, p4/M, z31.h, z1.h\n"
+    ".inst 0x448147e9  // smlalt z9.s, p4/M, z31.h, z1.h\n"
+    ".inst 0x44814307  // smlalb z7.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x44814706  // smlalt z6.s, p4/M, z24.h, z1.h\n"
+    "ld1sb { z24.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x45511318  // ssublb z24.h, z24.b, z17.b\n"
+    ".inst 0x4481436c  // smlalb z12.s, p4/M, z27.h, z1.h\n"
+    ".inst 0x44814768  // smlalt z8.s, p4/M, z27.h, z1.h\n"
+    ".inst 0x448243eb  // smlalb z11.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448247f0  // smlalt z16.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448243d3  // smlalb z19.s, p4/M, z30.h, z2.h\n"
+    ".inst 0x448247c9  // smlalt z9.s, p4/M, z30.h, z2.h\n"
+    ".inst 0x44824367  // smlalb z7.s, p4/M, z27.h, z2.h\n"
+    ".inst 0x44824766  // smlalt z6.s, p4/M, z27.h, z2.h\n"
+    "ld1sb { z27.h }, p3/Z, [x19, x2]\n"
+    "inch x2\n"
+    ".inst 0x4482432c  // smlalb z12.s, p4/M, z25.h, z2.h\n"
+    "whilelt p2.s, x2, x0\n"
+    ".inst 0x44824728  // smlalt z8.s, p4/M, z25.h, z2.h\n"
+    "mov x19, x2\n"
+    ".inst 0x448343cb  // smlalb z11.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x4551137b  // ssublb z27.h, z27.b, z17.b\n"
+    ".inst 0x448347d0  // smlalt z16.s, p4/M, z30.h, z3.h\n"
+    "incw x19\n"
+    ".inst 0x44834393  // smlalb z19.s, p4/M, z28.h, z3.h\n"
+    "whilelt p1.s, x19, x0\n"
+    ".inst 0x44834789  // smlalt z9.s, p4/M, z28.h, z3.h\n"
+    "whilelt p3.h, x2, x0\n"
+    ".inst 0x44834327  // smlalb z7.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x44834726  // smlalt z6.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x4483430c  // smlalb z12.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x44834708  // smlalt z8.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x4484438b  // smlalb z11.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844790  // smlalt z16.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844353  // smlalb z19.s, p4/M, z26.h, z4.h\n"
+    ".inst 0x44844749  // smlalt z9.s, p4/M, z26.h, z4.h\n"
+    ".inst 0x04b5756b  // sqrdmulh z11.s, z11.s, z21.s\n"
+    ".inst 0x04aa7610  // sqrdmulh z16.s, z16.s, z10.s\n"
+    ".inst 0x04b57673  // sqrdmulh z19.s, z19.s, z21.s\n"
+    ".inst 0x04aa7529  // sqrdmulh z9.s, z9.s, z10.s\n"
+    "and z31.d, z11.d, z29.d\n"
+    "asr z31.s, z31.s, #0x1f\n"
+    "and z23.d, z16.d, z20.d\n"
+    "and z25.d, z19.d, z29.d\n"
+    "asr z23.s, z23.s, #0x1f\n"
+    "and z18.d, z9.d, z20.d\n"
+    ".inst 0x44844307  // smlalb z7.s, p4/M, z24.h, z4.h\n"
+    "asr z25.s, z25.s, #0x1f\n"
+    ".inst 0x44844706  // smlalt z6.s, p4/M, z24.h, z4.h\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sqadd z11.s, z11.s, z31.s\n"
+    ".inst 0x4484436c  // smlalb z12.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x04b574e7  // sqrdmulh z7.s, z7.s, z21.s\n"
+    "sqadd z16.s, z16.s, z23.s\n"
+    "sqadd z19.s, z19.s, z25.s\n"
+    ".inst 0x04aa74c6  // sqrdmulh z6.s, z6.s, z10.s\n"
+    "sqadd z9.s, z9.s, z18.s\n"
+    "and z1.d, z7.d, z29.d\n"
+    "asr z1.s, z1.s, #0x1f\n"
+    "and z18.d, z6.d, z20.d\n"
+    ".inst 0x04b5758c  // sqrdmulh z12.s, z12.s, z21.s\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    ".inst 0x44844768  // smlalt z8.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x448293ab  // srshl z11.s, p4/M, z11.s, z29.s\n"
+    "and z30.d, z12.d, z29.d\n"
+    "asr z30.s, z30.s, #0x1f\n"
+    "add z11.s, z11.s, z14.s\n"
+    "sqadd z7.s, z7.s, z1.s\n"
+    "sqadd z6.s, z6.s, z18.s\n"
+    ".inst 0x04aa7508  // sqrdmulh z8.s, z8.s, z10.s\n"
+    "smin z11.s, p4/M, z11.s, z15.s\n"
+    ".inst 0x44829290  // srshl z16.s, p4/M, z16.s, z20.s\n"
+    "sqadd z12.s, z12.s, z30.s\n"
+    "and z3.d, z8.d, z20.d\n"
+    "asr z3.s, z3.s, #0x1f\n"
+    "add z16.s, z16.s, z14.s\n"
+    "smax z11.s, p4/M, z11.s, z5.s\n"
+    ".inst 0x448293b3  // srshl z19.s, p4/M, z19.s, z29.s\n"
+    ".inst 0x44829289  // srshl z9.s, p4/M, z9.s, z20.s\n"
+    "smin z16.s, p4/M, z16.s, z15.s\n"
+    ".inst 0x448293a7  // srshl z7.s, p4/M, z7.s, z29.s\n"
+    "add z19.s, z19.s, z14.s\n"
+    "add z9.s, z9.s, z14.s\n"
+    "sqadd z8.s, z8.s, z3.s\n"
+    "add z7.s, z7.s, z14.s\n"
+    "smax z16.s, p4/M, z16.s, z5.s\n"
+    "smin z19.s, p4/M, z19.s, z15.s\n"
+    "smin z9.s, p4/M, z9.s, z15.s\n"
+    "smin z7.s, p4/M, z7.s, z15.s\n"
+    "trn1 z11.h, z11.h, z16.h\n"
+    "st1b { z11.h }, p0, [x7, x3]\n"
+    "smax z19.s, p4/M, z19.s, z5.s\n"
+    "smax z9.s, p4/M, z9.s, z5.s\n"
+    "smax z7.s, p4/M, z7.s, z5.s\n"
+    ".inst 0x44829286  // srshl z6.s, p4/M, z6.s, z20.s\n"
+    ".inst 0x448293ac  // srshl z12.s, p4/M, z12.s, z29.s\n"
+    "trn1 z19.h, z19.h, z9.h\n"
+    "st1b { z19.h }, p0, [x8, x3]\n"
+    "add z6.s, z6.s, z14.s\n"
+    ".inst 0x44829288  // srshl z8.s, p4/M, z8.s, z20.s\n"
+    "add z12.s, z12.s, z14.s\n"
+    "smin z6.s, p4/M, z6.s, z15.s\n"
+    "add z8.s, z8.s, z14.s\n"
+    "smin z12.s, p4/M, z12.s, z15.s\n"
+    "smax z6.s, p4/M, z6.s, z5.s\n"
+    "smin z8.s, p4/M, z8.s, z15.s\n"
+    "smax z12.s, p4/M, z12.s, z5.s\n"
+    "trn1 z7.h, z7.h, z6.h\n"
+    "st1b { z7.h }, p0, [x17, x3]\n"
+    "smax z8.s, p4/M, z8.s, z5.s\n"
+    "trn1 z12.h, z12.h, z8.h\n"
+    "st1b { z12.h }, p0, [x16, x3]\n"
+    "inch x3\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z19.s }, p2/Z, [x19]\n"
+    "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z11.s, z19.s, z6.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z16.s, z19.s, z6.s\n"
+    "mov z19.d, z11.d\n"
+    "ld1sb { z0.h }, p4/Z, [x1]\n"
+    ".inst 0x454d1000  // ssublb z0.h, z0.b, z13.b\n"
+    "mov z9.d, z16.d\n"
+    "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
+    "mov z7.d, z11.d\n"
+    "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
+    ".inst 0x454d1021  // ssublb z1.h, z1.b, z13.b\n"
+    "mov z6.d, z16.d\n"
+    "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
+    "mov z12.d, z11.d\n"
+    "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
+    ".inst 0x454d1042  // ssublb z2.h, z2.b, z13.b\n"
+    "mov z8.d, z16.d\n"
+    "ldp x28, x27, [x5, #0x0]\n"
+    ".inst 0x454d1063  // ssublb z3.h, z3.b, z13.b\n"
+    "ldp x26, x25, [x5, #0x10]\n"
+    ".inst 0x454d1084  // ssublb z4.h, z4.b, z13.b\n"
+    "ldp x24, x23, [x5, #0x20]\n"
+    "ldp x22, x21, [x5, #0x30]\n"
+    "ldp x20, x19, [x5, #0x40]\n"
+    "ld1sb { z31.h }, p3/Z, [x28, x2]\n"
+    ".inst 0x455113ff  // ssublb z31.h, z31.b, z17.b\n"
+    "ld1sb { z30.h }, p3/Z, [x27, x2]\n"
+    "ld1sb { z29.h }, p3/Z, [x26, x2]\n"
+    ".inst 0x455113de  // ssublb z30.h, z30.b, z17.b\n"
+    "ld1sb { z28.h }, p3/Z, [x25, x2]\n"
+    "ld1sb { z27.h }, p3/Z, [x24, x2]\n"
+    ".inst 0x455113bd  // ssublb z29.h, z29.b, z17.b\n"
+    "ld1sb { z23.h }, p3/Z, [x23, x2]\n"
+    ".inst 0x4551139c  // ssublb z28.h, z28.b, z17.b\n"
+    "ld1sb { z25.h }, p3/Z, [x22, x2]\n"
+    "ld1sb { z24.h }, p3/Z, [x21, x2]\n"
+    ".inst 0x4551137b  // ssublb z27.h, z27.b, z17.b\n"
+    "ld1sb { z26.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x455112f7  // ssublb z23.h, z23.b, z17.b\n"
+    "ld1sb { z22.h }, p3/Z, [x19, x2]\n"
+    ".inst 0x45511339  // ssublb z25.h, z25.b, z17.b\n"
+    ".inst 0x45511318  // ssublb z24.h, z24.b, z17.b\n"
+    ".inst 0x4551135a  // ssublb z26.h, z26.b, z17.b\n"
+    ".inst 0x455112d6  // ssublb z22.h, z22.b, z17.b\n"
+    "b.any 1b\n"
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
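
Note on the requantization epilogue above: each 32-bit accumulator is scaled by a per-channel multiplier with sqrdmulh, rounding-right-shifted with srshl (the shift vector holds negative values), offset by the output zero point (z14 = c_offset), clamped to [minval, maxval], and narrowed to int8 via trn1 + st1b. The and/asr/sqadd triplets before each srshl appear to subtract one from negative accumulators so that ties round away from zero rather than toward positive infinity. A minimal scalar sketch of the sequence, assuming a negative `shift` encodes the right shift as in arm_gemm::Requantize32; the helper name is ours, not library API:

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the epilogue: sqrdmulh + srshl + c_offset + smin/smax.
    // Saturation corner cases of the real instructions are glossed over;
    // this is a sketch, not the kernel.
    inline int8_t requantize(int32_t acc, int32_t multiplier, int32_t shift,
                             int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // SQRDMULH: doubling, rounding, high-half multiply.
        const int64_t prod = (int64_t)acc * (int64_t)multiplier;
        int32_t v = (int32_t)((prod + (INT64_C(1) << 30)) >> 31);
        // SRSHL by a negative amount: rounding right shift.
        if (shift < 0)
        {
            v = (v + (1 << (-shift - 1))) >> -shift;
        }
        return (int8_t)std::clamp(v + c_offset, minval, maxval);
    }
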
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000..7ab83e8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 4;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 9;
+  constexpr static unsigned int input_col_quads = 1;
+
+  kern_type kernel = sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+
+  sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
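
The tile constants in this strategy are not independent: the input patch must cover the 3x3 kernel placed at every stride-2 output position, i.e. input extent = kernel extent + (output extent - 1) * stride. A quick check of that relation against the constants above (the helper is ours, not library code):

    constexpr unsigned int input_extent(unsigned int kernel,
                                        unsigned int output,
                                        unsigned int stride)
    {
        return kernel + (output - 1) * stride;
    }
    static_assert(input_extent(3, 2, 2) == 5, "input_rows");
    static_assert(input_extent(3, 4, 2) == 9, "input_cols");
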
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..f531912
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+  const int8_t *const *const inptrs,
+  int8_t *const *const outptrs,
+  const void *params,
+  unsigned int n_output_channels,
+  const arm_gemm::Requantize32& qp
+)
+{
+  __asm__ __volatile__(
+    "mov z31.s, #0x0\n"
+    "ldr x24, [%x[inptrs], #0x0]\n"
+    "ptrue p2.b\n"
+    "mov z18.s, #0x0\n"
+    "ldr x23, [%x[inptrs], #0x8]\n"
+    "lsl x9, %x[n_channels], #0x2\n"
+    "mov z29.s, #0x0\n"
+    "ldr x22, [%x[inptrs], #0x10]\n"
+    "addvl SP, SP, #-8\n"
+    "mov z28.s, #0x0\n"
+    "ldr x21, [%x[inptrs], #0x18]\n"
+    "mov x19, #0x9\n"
+    "mov z13.s, #0x0\n"
+    "ldr x20, [%x[inptrs], #0x20]\n"
+    "whilelt p1.b, XZR, x19\n"
+    "mov z14.s, #0x0\n"
+    "ld1b { z7.b }, p1/Z, [x24]\n"
+    "mov x19, #0x3\n"
+    "mov z15.s, #0x0\n"
+    "ld1b { z3.b }, p1/Z, [x23]\n"
+    "whilelt p0.b, XZR, x19\n"
+    "mov z11.b, p0/z, #0x1\n"
+    "ld1b { z4.b }, p1/Z, [x22]\n"
+    "mov x28, #0x0\n"
+    "mov z10.d, z7.d\n"
+    "ld1b { z6.b }, p1/Z, [x21]\n"
+    "mov x27, #0x0\n"
+    "ext z10.b, z10.b, z10.b, #0x2\n"
+    "ld1b { z5.b }, p1/Z, [x20]\n"
+    "whilelt p1.b, x28, x9\n"
+    "mov z17.d, z7.d\n"
+    "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+    "mov z26.d, z7.d\n"
+    "ldp x26, x25, [%x[outptrs], #0x0]\n"
+    "ext z17.b, z17.b, z17.b, #0x4\n"
+    "ldp x24, x23, [%x[outptrs], #0x10]\n"
+    "ext z26.b, z26.b, z26.b, #0x6\n"
+    "ldp x22, x21, [%x[outptrs], #0x20]\n"
+    "mov z19.d, z3.d\n"
+    "ldp x20, x19, [%x[outptrs], #0x30]\n"
+    "ext z19.b, z19.b, z19.b, #0x2\n"
+    "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+    "zip1 z7.s, z7.s, z17.s\n"
+    "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+    "zip1 z10.s, z10.s, z26.s\n"
+    "ld1rw { z0.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+    "zip1 z7.s, z7.s, z10.s\n"
+    "ld1w { z1.s }, p1/Z, [%x[params]]\n"
+    "mov z7.q, z7.q[0]\n"
+    "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+    "mov z17.d, z3.d\n"
+    "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+    "ext z17.b, z17.b, z17.b, #0x4\n"
+    "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+    "addvl %x[params], %x[params], #4\n"
+    "mov z2.d, z3.d\n"
+    "mov z20.d, z4.d\n"
+    "ext z2.b, z2.b, z2.b, #0x6\n"
+    "zip1 z3.s, z3.s, z17.s\n"
+    "ext z20.b, z20.b, z20.b, #0x2\n"
+    "mov z17.d, z4.d\n"
+    "zip1 z19.s, z19.s, z2.s\n"
+    "zip1 z3.s, z3.s, z19.s\n"
+    "mov z3.q, z3.q[0]\n"
+    "ext z17.b, z17.b, z17.b, #0x4\n"
+    "mov z26.d, z4.d\n"
+    "ext z26.b, z26.b, z26.b, #0x6\n"
+    "mov z21.d, z6.d\n"
+    "zip1 z4.s, z4.s, z17.s\n"
+    "ext z21.b, z21.b, z21.b, #0x2\n"
+    "zip1 z20.s, z20.s, z26.s\n"
+    "zip1 z4.s, z4.s, z20.s\n"
+    "mov z4.q, z4.q[0]\n"
+    "mov z17.d, z6.d\n"
+    "ext z17.b, z17.b, z17.b, #0x4\n"
+    "mov z20.d, z6.d\n"
+    "ext z20.b, z20.b, z20.b, #0x6\n"
+    "mov z19.d, z5.d\n"
+    "zip1 z6.s, z6.s, z17.s\n"
+    "ext z19.b, z19.b, z19.b, #0x2\n"
+    "zip1 z21.s, z21.s, z20.s\n"
+    "zip1 z6.s, z6.s, z21.s\n"
+    "mov z6.q, z6.q[0]\n"
+    "mov z17.d, z5.d\n"
+    "ext z17.b, z17.b, z17.b, #0x4\n"
+    "mov z20.d, z5.d\n"
+    "ext z20.b, z20.b, z20.b, #0x6\n"
+    "mov z11.s, z11.s[0]\n"
+    "zip1 z5.s, z5.s, z17.s\n"
+    "mov z25.s, #0x0\n"
+    "zip1 z19.s, z19.s, z20.s\n"
+    "zip1 z5.s, z5.s, z19.s\n"
+    "mov z5.q, z5.q[0]\n"
+    "mov z26.s, #0x0\n"
+    "mov z27.s, #0x0\n"
+    "mov z24.s, #0x0\n"
+    "mov z23.s, #0x0\n"
+    "mov z22.s, #0x0\n"
+    "mov z21.s, #0x0\n"
+    "mov z17.s, #0x0\n"
+    "mov z20.s, #0x0\n"
+    "mov z2.s, #0x0\n"
+    "mov z19.s, #0x0\n"
+    "sdot z31.s, z11.b, z7.b[0]\n"
+    "sdot z18.s, z11.b, z7.b[1]\n"
+    "sdot z29.s, z11.b, z7.b[2]\n"
+    "sdot z28.s, z11.b, z7.b[3]\n"
+    "sdot z13.s, z11.b, z3.b[0]\n"
+    "sdot z14.s, z11.b, z3.b[1]\n"
+    "sdot z15.s, z11.b, z3.b[2]\n"
+    "sdot z25.s, z11.b, z3.b[3]\n"
+    "sdot z26.s, z11.b, z4.b[0]\n"
+    "sdot z27.s, z11.b, z4.b[1]\n"
+    "sdot z24.s, z11.b, z4.b[2]\n"
+    "sdot z23.s, z11.b, z4.b[3]\n"
+    "sdot z22.s, z11.b, z6.b[0]\n"
+    "sdot z21.s, z11.b, z6.b[1]\n"
+    "sdot z17.s, z11.b, z6.b[2]\n"
+    "sdot z20.s, z11.b, z6.b[3]\n"
+    "sdot z2.s, z11.b, z5.b[0]\n"
+    "sdot z19.s, z11.b, z5.b[1]\n"
+    "mov z31.d, z31.d\n"
+    "mov z18.d, z18.d\n"
+    "mov z29.d, z29.d\n"
+    "mov z28.d, z28.d\n"
+    "add z31.s, z31.s, z13.s\n"
+    "mov z13.s, #0x0\n"
+    "sdot z13.s, z11.b, z5.b[2]\n"
+    "add z18.s, z18.s, z14.s\n"
+    "mov z14.s, #0x0\n"
+    "sdot z14.s, z11.b, z5.b[3]\n"
+    "add z29.s, z29.s, z15.s\n"
+    "add z28.s, z28.s, z25.s\n"
+    "add z31.s, z31.s, z26.s\n"
+    "add z18.s, z18.s, z27.s\n"
+    "add z29.s, z29.s, z24.s\n"
+    "add z28.s, z28.s, z23.s\n"
+    "mov z26.d, z26.d\n"
+    "mov z25.d, z27.d\n"
+    "mov z24.d, z24.d\n"
+    "mov z23.d, z23.d\n"
+    "add z26.s, z26.s, z22.s\n"
+    "add z25.s, z25.s, z21.s\n"
+    "add z24.s, z24.s, z17.s\n"
+    "add z23.s, z23.s, z20.s\n"
+    "add z26.s, z26.s, z2.s\n"
+    "add z25.s, z25.s, z19.s\n"
+    "add z24.s, z24.s, z13.s\n"
+    "add z23.s, z23.s, z14.s\n"
+    "neg z30.s, p2/M, z30.s\n"
+    "mul z31.s, p2/M, z31.s, z30.s\n"
+    "st1w { z31.s }, p2, [SP]\n"
+    "add z31.s, z31.s, z1.s\n"
+    "mul z18.s, p2/M, z18.s, z30.s\n"
+    "st1w { z18.s }, p2, [SP, #1, MUL VL]\n"
+    "add z18.s, z18.s, z1.s\n"
+    "mul z29.s, p2/M, z29.s, z30.s\n"
+    "st1w { z29.s }, p2, [SP, #2, MUL VL]\n"
+    "add z29.s, z29.s, z1.s\n"
+    "mul z28.s, p2/M, z28.s, z30.s\n"
+    "st1w { z28.s }, p2, [SP, #3, MUL VL]\n"
+    "add z28.s, z28.s, z1.s\n"
+    "mul z26.s, p2/M, z26.s, z30.s\n"
+    "st1w { z26.s }, p2, [SP, #4, MUL VL]\n"
+    "add z26.s, z26.s, z1.s\n"
+    "mul z25.s, p2/M, z25.s, z30.s\n"
+    "st1w { z25.s }, p2, [SP, #5, MUL VL]\n"
+    "add z25.s, z25.s, z1.s\n"
+    "mul z24.s, p2/M, z24.s, z30.s\n"
+    "st1w { z24.s }, p2, [SP, #6, MUL VL]\n"
+    "add z24.s, z24.s, z1.s\n"
+    "mul z23.s, p2/M, z23.s, z30.s\n"
+    "st1w { z23.s }, p2, [SP, #7, MUL VL]\n"
+    "add z23.s, z23.s, z1.s\n"
+    "1:"  // Loop
+    "sdot z31.s, z8.b, z7.b[0]\n"
+    "ld1w { z22.s }, p2/Z, [%x[params]]\n"
+    "incb x28\n"
+    "sdot z18.s, z8.b, z7.b[1]\n"
+    "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "whilelt p0.s, x27, %x[n_channels]\n"
+    "sdot z29.s, z8.b, z7.b[2]\n"
+    "whilelt p1.b, x28, x9\n"
+    "ld1w { z1.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+    "sdot z28.s, z8.b, z7.b[3]\n"
+    "sdot z26.s, z8.b, z4.b[0]\n"
+    "sdot z25.s, z8.b, z4.b[1]\n"
+    "sdot z24.s, z8.b, z4.b[2]\n"
+    "sdot z23.s, z8.b, z4.b[3]\n"
+    "ld1b { z8.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+    "sdot z31.s, z9.b, z3.b[0]\n"
+    "sdot z18.s, z9.b, z3.b[1]\n"
+    "sdot z29.s, z9.b, z3.b[2]\n"
+    "sdot z28.s, z9.b, z3.b[3]\n"
+    "sdot z26.s, z9.b, z6.b[0]\n"
+    "sdot z25.s, z9.b, z6.b[1]\n"
+    "sdot z24.s, z9.b, z6.b[2]\n"
+    "sdot z23.s, z9.b, z6.b[3]\n"
+    "ld1b { z9.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+    "sdot z31.s, z10.b, z4.b[0]\n"
+    "sdot z18.s, z10.b, z4.b[1]\n"
+    "sdot z29.s, z10.b, z4.b[2]\n"
+    "sdot z28.s, z10.b, z4.b[3]\n"
+    "sdot z26.s, z10.b, z5.b[0]\n"
+    "sdot z25.s, z10.b, z5.b[1]\n"
+    "sdot z24.s, z10.b, z5.b[2]\n"
+    "sdot z23.s, z10.b, z5.b[3]\n"
+    "ld1b { z10.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+    "addvl %x[params], %x[params], #6\n"
+    ".inst 0x04b677ff  // sqrdmulh z31.s, z31.s, z22.s\n"
+    ".inst 0x04b67652  // sqrdmulh z18.s, z18.s, z22.s\n"
+    ".inst 0x04b677bd  // sqrdmulh z29.s, z29.s, z22.s\n"
+    ".inst 0x04b6779c  // sqrdmulh z28.s, z28.s, z22.s\n"
+    ".inst 0x04b6775a  // sqrdmulh z26.s, z26.s, z22.s\n"
+    "and z20.d, z31.d, z21.d\n"
+    "asr z20.s, z20.s, #0x1f\n"
+    "and z19.d, z18.d, z21.d\n"
+    "and z14.d, z29.d, z21.d\n"
+    "asr z19.s, z19.s, #0x1f\n"
+    "and z17.d, z28.d, z21.d\n"
+    "and z2.d, z26.d, z21.d\n"
+    "asr z14.s, z14.s, #0x1f\n"
+    ".inst 0x04b67739  // sqrdmulh z25.s, z25.s, z22.s\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "sqadd z31.s, z31.s, z20.s\n"
+    ".inst 0x04b67718  // sqrdmulh z24.s, z24.s, z22.s\n"
+    "asr z2.s, z2.s, #0x1f\n"
+    ".inst 0x04b676f7  // sqrdmulh z23.s, z23.s, z22.s\n"
+    "sqadd z18.s, z18.s, z19.s\n"
+    "sqadd z29.s, z29.s, z14.s\n"
+    "and z27.d, z25.d, z21.d\n"
+    "asr z27.s, z27.s, #0x1f\n"
+    "sqadd z28.s, z28.s, z17.s\n"
+    "sqadd z26.s, z26.s, z2.s\n"
+    "and z17.d, z24.d, z21.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "and z15.d, z23.d, z21.d\n"
+    ".inst 0x44828abf  // srshl z31.s, p2/M, z31.s, z21.s\n"
+    "asr z15.s, z15.s, #0x1f\n"
+    "sqadd z25.s, z25.s, z27.s\n"
+    ".inst 0x44828ab2  // srshl z18.s, p2/M, z18.s, z21.s\n"
+    "add z31.s, z31.s, z12.s\n"
+    "sqadd z24.s, z24.s, z17.s\n"
+    ".inst 0x44828abd  // srshl z29.s, p2/M, z29.s, z21.s\n"
+    "add z18.s, z18.s, z12.s\n"
+    "sqadd z23.s, z23.s, z15.s\n"
+    "smin z31.s, p2/M, z31.s, z0.s\n"
+    "add z29.s, z29.s, z12.s\n"
+    "smin z18.s, p2/M, z18.s, z0.s\n"
+    ".inst 0x44828abc  // srshl z28.s, p2/M, z28.s, z21.s\n"
+    "smax z31.s, p2/M, z31.s, z16.s\n"
+    "st1b { z31.s }, p0, [x26, x27]\n"
+    "add z28.s, z28.s, z12.s\n"
+    "smax z18.s, p2/M, z18.s, z16.s\n"
+    "ld1w { z31.s }, p2/Z, [SP]\n"
+    "smin z29.s, p2/M, z29.s, z0.s\n"
+    "st1b { z18.s }, p0, [x25, x27]\n"
+    "add z31.s, z31.s, z1.s\n"
+    "smin z28.s, p2/M, z28.s, z0.s\n"
+    "ld1w { z18.s }, p2/Z, [SP, #1, MUL VL]\n"
+    "smax z29.s, p2/M, z29.s, z16.s\n"
+    "st1b { z29.s }, p0, [x24, x27]\n"
+    "add z18.s, z18.s, z1.s\n"
+    "smax z28.s, p2/M, z28.s, z16.s\n"
+    "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n"
+    ".inst 0x44828aba  // srshl z26.s, p2/M, z26.s, z21.s\n"
+    "st1b { z28.s }, p0, [x23, x27]\n"
+    "add z29.s, z29.s, z1.s\n"
+    ".inst 0x44828ab9  // srshl z25.s, p2/M, z25.s, z21.s\n"
+    "ld1w { z28.s }, p2/Z, [SP, #3, MUL VL]\n"
+    "add z26.s, z26.s, z12.s\n"
+    ".inst 0x44828ab8  // srshl z24.s, p2/M, z24.s, z21.s\n"
+    ".inst 0x44828ab7  // srshl z23.s, p2/M, z23.s, z21.s\n"
+    "add z25.s, z25.s, z12.s\n"
+    "add z28.s, z28.s, z1.s\n"
+    "add z24.s, z24.s, z12.s\n"
+    "add z23.s, z23.s, z12.s\n"
+    "smin z26.s, p2/M, z26.s, z0.s\n"
+    "smin z25.s, p2/M, z25.s, z0.s\n"
+    "smin z24.s, p2/M, z24.s, z0.s\n"
+    "smin z23.s, p2/M, z23.s, z0.s\n"
+    "smax z26.s, p2/M, z26.s, z16.s\n"
+    "st1b { z26.s }, p0, [x22, x27]\n"
+    "smax z25.s, p2/M, z25.s, z16.s\n"
+    "smax z24.s, p2/M, z24.s, z16.s\n"
+    "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n"
+    "smax z23.s, p2/M, z23.s, z16.s\n"
+    "st1b { z25.s }, p0, [x21, x27]\n"
+    "add z26.s, z26.s, z1.s\n"
+    "st1b { z24.s }, p0, [x20, x27]\n"
+    "st1b { z23.s }, p0, [x19, x27]\n"
+    "incw x27\n"
+    "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n"
+    "add z25.s, z25.s, z1.s\n"
+    "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n"
+    "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n"
+    "add z24.s, z24.s, z1.s\n"
+    "add z23.s, z23.s, z1.s\n"
+    "b.any 1b\n"
+    "addvl SP, SP, #8\n"
+    : [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
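
Before entering its loop, the kernel above folds the weight zero point into the accumulators: it builds a vector of ones (`mov z11.b, p0/z, #0x1`), uses sdot against it to sum the input bytes under each output point, scales each sum by the negated b_offset (`neg` then `mul`), and parks the corrections on the stack so every iteration can restart from bias plus correction. In scalar form the starting accumulator is roughly as below; the function name is illustrative:

    #include <cstdint>

    // Sketch of the prologue's zero-point folding: sum_inputs is what the
    // sdot-against-ones computes for one output point.
    inline int32_t starting_acc(int32_t bias, int32_t sum_inputs,
                                int32_t b_offset)
    {
        return bias + (-b_offset) * sum_inputs;  // neg, mul, then add bias
    }
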
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000..2c33bdc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 8;
+  constexpr static unsigned int input_cols = 6;
+  constexpr static unsigned int input_col_quads = 1;
+
+  kern_type kernel = sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+
+  sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
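
The same extent relation checked after the 3x3 strategy applies here; with this file's 5x5 stride-1 constants (reusing the input_extent helper from that note):

    static_assert(input_extent(5, 4, 1) == 8, "input_rows");
    static_assert(input_extent(5, 2, 1) == 6, "input_cols");
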
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..ffa2c6a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+  const int8_t *const *const inptrs,
+  int8_t *const *const outptrs,
+  const void *params,
+  const unsigned int n_output_channels,
+  const arm_gemm::Requantize32& qp
+)
+{
+  __asm__ __volatile__(
+    "mov z20.b, #0x1\n"
+    "ldr x24, [%x[inptrs], #0x0]\n"
+    "ptrue p2.b\n"
+    "mov z22.s, #0x1\n"
+    "ldr x23, [%x[inptrs], #0x8]\n"
+    "lsl x9, %x[n_channels], #0x2\n"
+    "mov z30.s, #0x0\n"
+    "ldr x22, [%x[inptrs], #0x10]\n"
+    "addvl SP, SP, #-8\n"
+    "mov z28.s, #0x0\n"
+    "ldr x21, [%x[inptrs], #0x18]\n"
+    "mov x20, #0x6\n"
+    "mov z29.s, #0x0\n"
+    "ldr x19, [%x[inptrs], #0x20]\n"
+    "whilelt p0.b, XZR, x20\n"
+    "mov z27.s, #0x0\n"
+    "ld1b { z0.b }, p0/Z, [x24]\n"
+    "mov x28, #0x0\n"
+    "mov z26.s, #0x0\n"
+    "ld1b { z3.b }, p0/Z, [x23]\n"
+    "mov x27, #0x0\n"
+    "mov z25.s, #0x0\n"
+    "ld1b { z5.b }, p0/Z, [x22]\n"
+    "whilelt p1.b, x28, x9\n"
+    "mov z15.d, z0.d\n"
+    "ld1b { z4.b }, p0/Z, [x21]\n"
+    "mov z24.s, #0x0\n"
+    "ld1b { z6.b }, p0/Z, [x19]\n"
+    "ext z15.b, z15.b, z15.b, #0x1\n"
+    "ldr x21, [%x[inptrs], #0x28]\n"
+    "mov z16.d, z3.d\n"
+    "ldr x20, [%x[inptrs], #0x30]\n"
+    "ext z16.b, z16.b, z16.b, #0x1\n"
+    "ldr x19, [%x[inptrs], #0x38]\n"
+    "mov z18.d, z5.d\n"
+    "ld1b { z7.b }, p0/Z, [x21]\n"
+    "zip1 z0.d, z0.d, z15.d\n"
+    "ld1b { z1.b }, p0/Z, [x20]\n"
+    "mov z0.q, z0.q[0]\n"
+    "ld1b { z2.b }, p0/Z, [x19]\n"
+    "zip1 z3.d, z3.d, z16.d\n"
+    "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+    "mov z3.q, z3.q[0]\n"
+    "ldp x26, x25, [%x[outptrs], #0x0]\n"
+    "ext z18.b, z18.b, z18.b, #0x1\n"
+    "ldp x24, x23, [%x[outptrs], #0x10]\n"
+    "mov z16.d, z4.d\n"
+    "ldp x22, x21, [%x[outptrs], #0x20]\n"
+    "ext z16.b, z16.b, z16.b, #0x1\n"
+    "ldp x20, x19, [%x[outptrs], #0x30]\n"
+    "mov z17.d, z6.d\n"
+    "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+    "zip1 z5.d, z5.d, z18.d\n"
+    "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+    "mov z5.q, z5.q[0]\n"
+    "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+    "zip1 z4.d, z4.d, z16.d\n"
+    "ld1w { z13.s }, p1/Z, [%x[params]]\n"
+    "mov z4.q, z4.q[0]\n"
+    "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+    "ext z17.b, z17.b, z17.b, #0x1\n"
+    "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+    "mov z16.d, z7.d\n"
+    "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+    "ext z16.b, z16.b, z16.b, #0x1\n"
+    "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+    "addvl %x[params], %x[params], #5\n"
+    "zip1 z6.d, z6.d, z17.d\n"
+    "mov z17.d, z1.d\n"
+    "mov z6.q, z6.q[0]\n"
+    "zip1 z7.d, z7.d, z16.d\n"
+    "mov z7.q, z7.q[0]\n"
+    "ext z17.b, z17.b, z17.b, #0x1\n"
+    "mov z16.d, z2.d\n"
+    "ext z16.b, z16.b, z16.b, #0x1\n"
+    "mov z23.s, #0x0\n"
+    "zip1 z1.d, z1.d, z17.d\n"
+    "mov z1.q, z1.q[0]\n"
+    "zip1 z2.d, z2.d, z16.d\n"
+    "mov z2.q, z2.q[0]\n"
+    "mov z18.s, #0x0\n"
+    "mov z17.s, #0x0\n"
+    "mov z16.s, #0x0\n"
+    "mov z21.s, #0x0\n"
+    "mov z19.s, #0x0\n"
+    "sdot z30.s, z20.b, z0.b[0]\n"
+    "sdot z28.s, z20.b, z0.b[2]\n"
+    "sdot z29.s, z20.b, z3.b[0]\n"
+    "sdot z27.s, z20.b, z3.b[2]\n"
+    "sdot z30.s, z22.b, z0.b[1]\n"
+    "sdot z28.s, z22.b, z0.b[3]\n"
+    "sdot z29.s, z22.b, z3.b[1]\n"
+    "sdot z27.s, z22.b, z3.b[3]\n"
+    "sdot z26.s, z20.b, z5.b[0]\n"
+    "sdot z25.s, z20.b, z5.b[2]\n"
+    "sdot z24.s, z20.b, z4.b[0]\n"
+    "sdot z23.s, z20.b, z4.b[2]\n"
+    "sdot z26.s, z22.b, z5.b[1]\n"
+    "sdot z25.s, z22.b, z5.b[3]\n"
+    "sdot z24.s, z22.b, z4.b[1]\n"
+    "sdot z23.s, z22.b, z4.b[3]\n"
+    "sdot z18.s, z20.b, z6.b[0]\n"
+    "sdot z17.s, z20.b, z6.b[2]\n"
+    "sdot z16.s, z20.b, z7.b[0]\n"
+    "sdot z21.s, z20.b, z7.b[2]\n"
+    "sdot z18.s, z22.b, z6.b[1]\n"
+    "sdot z17.s, z22.b, z6.b[3]\n"
+    "sdot z16.s, z22.b, z7.b[1]\n"
+    "sdot z21.s, z22.b, z7.b[3]\n"
+    "sdot z19.s, z20.b, z1.b[0]\n"
+    "mov z30.d, z30.d\n"
+    "mov z28.d, z28.d\n"
+    "add z30.s, z30.s, z29.s\n"
+    "sdot z19.s, z22.b, z1.b[1]\n"
+    "add z28.s, z28.s, z27.s\n"
+    "add z30.s, z30.s, z26.s\n"
+    "mov z29.d, z29.d\n"
+    "add z28.s, z28.s, z25.s\n"
+    "add z30.s, z30.s, z24.s\n"
+    "mov z27.d, z27.d\n"
+    "add z28.s, z28.s, z23.s\n"
+    "add z30.s, z30.s, z18.s\n"
+    "add z29.s, z29.s, z26.s\n"
+    "add z28.s, z28.s, z17.s\n"
+    "add z27.s, z27.s, z25.s\n"
+    "add z29.s, z29.s, z24.s\n"
+    "mov z26.d, z26.d\n"
+    "add z27.s, z27.s, z23.s\n"
+    "add z29.s, z29.s, z18.s\n"
+    "mov z25.d, z25.d\n"
+    "add z27.s, z27.s, z17.s\n"
+    "add z29.s, z29.s, z16.s\n"
+    "add z26.s, z26.s, z24.s\n"
+    "add z27.s, z27.s, z21.s\n"
+    "add z25.s, z25.s, z23.s\n"
+    "add z26.s, z26.s, z18.s\n"
+    "mov z24.d, z24.d\n"
+    "add z25.s, z25.s, z17.s\n"
+    "add z26.s, z26.s, z16.s\n"
+    "mov z23.d, z23.d\n"
+    "add z25.s, z25.s, z21.s\n"
+    "add z26.s, z26.s, z19.s\n"
+    "add z24.s, z24.s, z18.s\n"
+    "mov z18.s, #0x0\n"
+    "sdot z18.s, z20.b, z1.b[2]\n"
+    "add z23.s, z23.s, z17.s\n"
+    "mov z17.s, #0x0\n"
+    "sdot z17.s, z20.b, z2.b[0]\n"
+    "sdot z18.s, z22.b, z1.b[3]\n"
+    "add z24.s, z24.s, z16.s\n"
+    "mov z16.s, #0x0\n"
+    "sdot z17.s, z22.b, z2.b[1]\n"
+    "sdot z16.s, z20.b, z2.b[2]\n"
+    "add z25.s, z25.s, z18.s\n"
+    "add z23.s, z23.s, z21.s\n"
+    "add z24.s, z24.s, z19.s\n"
+    "sdot z16.s, z22.b, z2.b[3]\n"
+    "add z23.s, z23.s, z18.s\n"
+    "add z24.s, z24.s, z17.s\n"
+    "neg z15.s, p2/M, z15.s\n"
+    "add z23.s, z23.s, z16.s\n"
+    "mul z30.s, p2/M, z30.s, z15.s\n"
+    "st1w { z30.s }, p2, [SP]\n"
+    "add z30.s, z30.s, z13.s\n"
+    "mul z28.s, p2/M, z28.s, z15.s\n"
+    "st1w { z28.s }, p2, [SP, #1, MUL VL]\n"
+    "add z28.s, z28.s, z13.s\n"
+    "mul z29.s, p2/M, z29.s, z15.s\n"
+    "st1w { z29.s }, p2, [SP, #2, MUL VL]\n"
+    "add z29.s, z29.s, z13.s\n"
+    "mul z27.s, p2/M, z27.s, z15.s\n"
+    "st1w { z27.s }, p2, [SP, #3, MUL VL]\n"
+    "add z27.s, z27.s, z13.s\n"
+    "mul z26.s, p2/M, z26.s, z15.s\n"
+    "st1w { z26.s }, p2, [SP, #4, MUL VL]\n"
+    "add z26.s, z26.s, z13.s\n"
+    "mul z25.s, p2/M, z25.s, z15.s\n"
+    "st1w { z25.s }, p2, [SP, #5, MUL VL]\n"
+    "add z25.s, z25.s, z13.s\n"
+    "mul z24.s, p2/M, z24.s, z15.s\n"
+    "st1w { z24.s }, p2, [SP, #6, MUL VL]\n"
+    "add z24.s, z24.s, z13.s\n"
+    "mul z23.s, p2/M, z23.s, z15.s\n"
+    "st1w { z23.s }, p2, [SP, #7, MUL VL]\n"
+    "add z23.s, z23.s, z13.s\n"
+    "1:"  // Loop
+    "sdot z30.s, z8.b, z0.b[0]\n"
+    "ld1w { z22.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "incb x28\n"
+    "sdot z28.s, z8.b, z0.b[2]\n"
+    "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "whilelt p0.s, x27, %x[n_channels]\n"
+    "sdot z29.s, z8.b, z3.b[0]\n"
+    "whilelt p1.b, x28, x9\n"
+    "sdot z27.s, z8.b, z3.b[2]\n"
+    "sdot z26.s, z8.b, z5.b[0]\n"
+    "sdot z25.s, z8.b, z5.b[2]\n"
+    "sdot z24.s, z8.b, z4.b[0]\n"
+    "sdot z23.s, z8.b, z4.b[2]\n"
+    "ld1b { z8.b }, p2/Z, [%x[params]]\n"
+    "sdot z30.s, z9.b, z0.b[1]\n"
+    "sdot z28.s, z9.b, z0.b[3]\n"
+    "sdot z29.s, z9.b, z3.b[1]\n"
+    "sdot z27.s, z9.b, z3.b[3]\n"
+    "sdot z26.s, z9.b, z5.b[1]\n"
+    "sdot z25.s, z9.b, z5.b[3]\n"
+    "sdot z24.s, z9.b, z4.b[1]\n"
+    "sdot z23.s, z9.b, z4.b[3]\n"
+    "ld1b { z9.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "sdot z30.s, z10.b, z3.b[0]\n"
+    "sdot z28.s, z10.b, z3.b[2]\n"
+    "sdot z29.s, z10.b, z5.b[0]\n"
+    "sdot z27.s, z10.b, z5.b[2]\n"
+    "sdot z26.s, z10.b, z4.b[0]\n"
+    "sdot z25.s, z10.b, z4.b[2]\n"
+    "sdot z24.s, z10.b, z6.b[0]\n"
+    "sdot z23.s, z10.b, z6.b[2]\n"
+    "ld1b { z10.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "sdot z30.s, z11.b, z3.b[1]\n"
+    "sdot z28.s, z11.b, z3.b[3]\n"
+    "sdot z29.s, z11.b, z5.b[1]\n"
+    "sdot z27.s, z11.b, z5.b[3]\n"
+    "sdot z26.s, z11.b, z4.b[1]\n"
+    "sdot z25.s, z11.b, z4.b[3]\n"
+    "sdot z24.s, z11.b, z6.b[1]\n"
+    "sdot z23.s, z11.b, z6.b[3]\n"
+    "ld1b { z11.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "sdot z30.s, z8.b, z5.b[0]\n"
+    "sdot z28.s, z8.b, z5.b[2]\n"
+    "sdot z29.s, z8.b, z4.b[0]\n"
+    "sdot z27.s, z8.b, z4.b[2]\n"
+    "sdot z26.s, z8.b, z6.b[0]\n"
+    "sdot z25.s, z8.b, z6.b[2]\n"
+    "sdot z24.s, z8.b, z7.b[0]\n"
+    "sdot z23.s, z8.b, z7.b[2]\n"
+    "ld1b { z8.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "sdot z30.s, z9.b, z5.b[1]\n"
+    "sdot z28.s, z9.b, z5.b[3]\n"
+    "sdot z29.s, z9.b, z4.b[1]\n"
+    "sdot z27.s, z9.b, z4.b[3]\n"
+    "sdot z26.s, z9.b, z6.b[1]\n"
+    "sdot z25.s, z9.b, z6.b[3]\n"
+    "sdot z24.s, z9.b, z7.b[1]\n"
+    "sdot z23.s, z9.b, z7.b[3]\n"
+    "ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "sdot z30.s, z10.b, z4.b[0]\n"
+    "ld1w { z13.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+    "sdot z28.s, z10.b, z4.b[2]\n"
+    "sdot z29.s, z10.b, z6.b[0]\n"
+    "sdot z27.s, z10.b, z6.b[2]\n"
+    "sdot z26.s, z10.b, z7.b[0]\n"
+    "sdot z25.s, z10.b, z7.b[2]\n"
+    "sdot z24.s, z10.b, z1.b[0]\n"
+    "sdot z23.s, z10.b, z1.b[2]\n"
+    "ld1b { z10.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
+    "sdot z30.s, z11.b, z4.b[1]\n"
+    "sdot z28.s, z11.b, z4.b[3]\n"
+    "sdot z29.s, z11.b, z6.b[1]\n"
+    "sdot z27.s, z11.b, z6.b[3]\n"
+    "sdot z26.s, z11.b, z7.b[1]\n"
+    "sdot z25.s, z11.b, z7.b[3]\n"
+    "sdot z24.s, z11.b, z1.b[1]\n"
+    "sdot z23.s, z11.b, z1.b[3]\n"
+    "ld1b { z11.b }, p1/Z, [%x[params], #-4, MUL VL]\n"
+    "sdot z30.s, z8.b, z6.b[0]\n"
+    "sdot z28.s, z8.b, z6.b[2]\n"
+    "sdot z29.s, z8.b, z7.b[0]\n"
+    "sdot z27.s, z8.b, z7.b[2]\n"
+    "sdot z26.s, z8.b, z1.b[0]\n"
+    "sdot z25.s, z8.b, z1.b[2]\n"
+    "sdot z24.s, z8.b, z2.b[0]\n"
+    "sdot z23.s, z8.b, z2.b[2]\n"
+    "ld1b { z8.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
+    "sdot z30.s, z9.b, z6.b[1]\n"
+    "sdot z28.s, z9.b, z6.b[3]\n"
+    "sdot z29.s, z9.b, z7.b[1]\n"
+    "sdot z27.s, z9.b, z7.b[3]\n"
+    "sdot z26.s, z9.b, z1.b[1]\n"
+    "sdot z25.s, z9.b, z1.b[3]\n"
+    "sdot z24.s, z9.b, z2.b[1]\n"
+    "sdot z23.s, z9.b, z2.b[3]\n"
+    "ld1b { z9.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
+    "addvl %x[params], %x[params], #-3\n"
+    ".inst 0x04b677de  // sqrdmulh z30.s, z30.s, z22.s\n"
+    ".inst 0x04b6779c  // sqrdmulh z28.s, z28.s, z22.s\n"
+    ".inst 0x04b677bd  // sqrdmulh z29.s, z29.s, z22.s\n"
+    ".inst 0x04b6777b  // sqrdmulh z27.s, z27.s, z22.s\n"
+    ".inst 0x04b6775a  // sqrdmulh z26.s, z26.s, z22.s\n"
+    "and z20.d, z30.d, z21.d\n"
+    "asr z20.s, z20.s, #0x1f\n"
+    "and z19.d, z28.d, z21.d\n"
+    "and z18.d, z29.d, z21.d\n"
+    "asr z19.s, z19.s, #0x1f\n"
+    "and z17.d, z27.d, z21.d\n"
+    "and z16.d, z26.d, z21.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    ".inst 0x04b67739  // sqrdmulh z25.s, z25.s, z22.s\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "sqadd z30.s, z30.s, z20.s\n"
+    ".inst 0x04b67718  // sqrdmulh z24.s, z24.s, z22.s\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x04b676f7  // sqrdmulh z23.s, z23.s, z22.s\n"
+    "sqadd z28.s, z28.s, z19.s\n"
+    "sqadd z29.s, z29.s, z18.s\n"
+    "and z18.d, z25.d, z21.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sqadd z27.s, z27.s, z17.s\n"
+    "sqadd z26.s, z26.s, z16.s\n"
+    "and z17.d, z24.d, z21.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "and z16.d, z23.d, z21.d\n"
+    ".inst 0x44828abe  // srshl z30.s, p2/M, z30.s, z21.s\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z25.s, z25.s, z18.s\n"
+    ".inst 0x44828abc  // srshl z28.s, p2/M, z28.s, z21.s\n"
+    "add z30.s, z30.s, z14.s\n"
+    "sqadd z24.s, z24.s, z17.s\n"
+    ".inst 0x44828abd  // srshl z29.s, p2/M, z29.s, z21.s\n"
+    "add z28.s, z28.s, z14.s\n"
+    "sqadd z23.s, z23.s, z16.s\n"
+    "smin z30.s, p2/M, z30.s, z12.s\n"
+    "add z29.s, z29.s, z14.s\n"
+    "smin z28.s, p2/M, z28.s, z12.s\n"
+    ".inst 0x44828abb  // srshl z27.s, p2/M, z27.s, z21.s\n"
+    "smax z30.s, p2/M, z30.s, z31.s\n"
+    "st1b { z30.s }, p0, [x26, x27]\n"
+    "add z27.s, z27.s, z14.s\n"
+    "smax z28.s, p2/M, z28.s, z31.s\n"
+    "ld1w { z30.s }, p2/Z, [SP]\n"
+    "smin z29.s, p2/M, z29.s, z12.s\n"
+    "st1b { z28.s }, p0, [x25, x27]\n"
+    "add z30.s, z30.s, z13.s\n"
+    "smin z27.s, p2/M, z27.s, z12.s\n"
+    "ld1w { z28.s }, p2/Z, [SP, #1, MUL VL]\n"
+    "smax z29.s, p2/M, z29.s, z31.s\n"
+    "st1b { z29.s }, p0, [x24, x27]\n"
+    "add z28.s, z28.s, z13.s\n"
+    "smax z27.s, p2/M, z27.s, z31.s\n"
+    "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n"
+    ".inst 0x44828aba  // srshl z26.s, p2/M, z26.s, z21.s\n"
+    "st1b { z27.s }, p0, [x23, x27]\n"
+    "add z29.s, z29.s, z13.s\n"
+    ".inst 0x44828ab9  // srshl z25.s, p2/M, z25.s, z21.s\n"
+    "ld1w { z27.s }, p2/Z, [SP, #3, MUL VL]\n"
+    "add z26.s, z26.s, z14.s\n"
+    ".inst 0x44828ab8  // srshl z24.s, p2/M, z24.s, z21.s\n"
+    ".inst 0x44828ab7  // srshl z23.s, p2/M, z23.s, z21.s\n"
+    "add z25.s, z25.s, z14.s\n"
+    "add z27.s, z27.s, z13.s\n"
+    "add z24.s, z24.s, z14.s\n"
+    "add z23.s, z23.s, z14.s\n"
+    "smin z26.s, p2/M, z26.s, z12.s\n"
+    "smin z25.s, p2/M, z25.s, z12.s\n"
+    "smin z24.s, p2/M, z24.s, z12.s\n"
+    "smin z23.s, p2/M, z23.s, z12.s\n"
+    "smax z26.s, p2/M, z26.s, z31.s\n"
+    "st1b { z26.s }, p0, [x22, x27]\n"
+    "smax z25.s, p2/M, z25.s, z31.s\n"
+    "smax z24.s, p2/M, z24.s, z31.s\n"
+    "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n"
+    "smax z23.s, p2/M, z23.s, z31.s\n"
+    "st1b { z25.s }, p0, [x21, x27]\n"
+    "add z26.s, z26.s, z13.s\n"
+    "st1b { z24.s }, p0, [x20, x27]\n"
+    "st1b { z23.s }, p0, [x19, x27]\n"
+    "incw x27\n"
+    "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n"
+    "add z25.s, z25.s, z13.s\n"
+    "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n"
+    "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n"
+    "add z24.s, z24.s, z13.s\n"
+    "add z23.s, z23.s, z13.s\n"
+    "b.any 1b\n"
+    "addvl SP, SP, #8\n"
+    : [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
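
Both packed kernels walk the channel dimension with whilelt predication: one counter advances in bytes for the packed parameter loads (incb against x9 = 4 * n_channels), another in 32-bit words for the output stores (incw), the final partial vector is masked by the predicate, and the loop repeats while any lane is live (b.any). A scalar outline of that structure, with an illustrative vector length:

    #include <cstdint>

    // Outline of the whilelt / b.any channel loop; VL is illustrative.
    constexpr unsigned int VL = 4;  // 32-bit lanes per vector

    void store_channels(const int32_t *acc, int8_t *out, unsigned int n)
    {
        for (unsigned int c0 = 0; c0 < n; c0 += VL)      // b.any 1b
        {
            for (unsigned int l = 0; l < VL; ++l)
            {
                if (c0 + l < n)                          // whilelt p0.s
                {
                    out[c0 + l] = (int8_t)acc[c0 + l];   // st1b, p0-masked
                }
            }
        }
    }
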
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000..4098f6f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_dot::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_dot::get_packed_size;
+
+  kern_type kernel = sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+  sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
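
Unlike the packed-input strategies above, this one exposes packing hooks borrowed from interleave_sve_s8q_3x3_dot. A hedged sketch of how a caller might drive them; the argument list follows the parameter_packing_fn typedef above, but our reading of the two trailing size_t arguments as weight column/row strides is an assumption, not documented here:

    #include <cstddef>
    #include <cstdint>

    // Hypothetical driver for the packing hooks declared above.
    template <typename Strategy>
    void *pack_for(const arm_conv::depthwise::DepthwiseArgs &args,
                   unsigned int n_channels, const int32_t *bias,
                   const int8_t *weights, const arm_gemm::Requantize32 &qp,
                   size_t ld_weight_col, size_t ld_weight_row)
    {
        void *buffer = ::operator new(Strategy::get_packed_size(args));
        Strategy::pack_parameters(n_channels, buffer, bias, weights, qp,
                                  ld_weight_col, ld_weight_row);
        return buffer;  // caller releases with ::operator delete
    }
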
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..3345449
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
+  __asm__ __volatile__(
+    "ldp x11, x10, [%x[inptrs], #0x0]\n"
+    "ptrue p2.b\n"
+    "ldp x9, x28, [%x[inptrs], #0x10]\n"
+    "addvl SP, SP, #-8\n"
+    "ldp x27, x26, [%x[inptrs], #0x20]\n"
+    "mov x25, #0x0\n"
+    "ldp x24, x23, [%x[inptrs], #0x30]\n"
+    "whilelt p1.b, x25, %x[n_channels]\n"
+    "ldp x22, x21, [%x[outptrs], #0x0]\n"
+    "ldp x20, x19, [%x[outptrs], #0x10]\n"
+    "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+    "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+    "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+    "1:"  // Loop
+    "ld1b { z19.b }, p1/Z, [x11, x25]\n"
+    "whilelt p0.s, x25, %x[n_channels]\n"
+    "ld1b { z18.b }, p1/Z, [x10, x25]\n"
+    "ldp x11, x10, [%x[inptrs], #0x40]\n"
+    "ld1b { z16.b }, p1/Z, [x9, x25]\n"
+    "zip1 z21.b, z19.b, z16.b\n"
+    "ld1b { z17.b }, p1/Z, [x28, x25]\n"
+    "zip2 z19.b, z19.b, z16.b\n"
+    "ldp x9, x28, [%x[inptrs], #0x50]\n"
+    "ld1b { z23.b }, p1/Z, [x27, x25]\n"
+    "zip1 z16.b, z18.b, z17.b\n"
+    "ld1b { z20.b }, p1/Z, [x26, x25]\n"
+    "zip2 z18.b, z18.b, z17.b\n"
+    "ldp x27, x26, [%x[inptrs], #0x60]\n"
+    "zip1 z3.b, z21.b, z16.b\n"
+    "ld1b { z17.b }, p1/Z, [x24, x25]\n"
+    "zip2 z2.b, z21.b, z16.b\n"
+    "ld1b { z16.b }, p1/Z, [x23, x25]\n"
+    "zip1 z29.b, z19.b, z18.b\n"
+    "ldp x24, x23, [%x[inptrs], #0x70]\n"
+    "zip2 z28.b, z19.b, z18.b\n"
+    "ld1b { z22.b }, p1/Z, [x11, x25]\n"
+    "zip1 z19.b, z23.b, z17.b\n"
+    "ld1b { z21.b }, p1/Z, [x10, x25]\n"
+    "zip2 z27.b, z23.b, z17.b\n"
+    "ldp x11, x10, [%x[inptrs], #0x0]\n"
+    "zip1 z18.b, z20.b, z16.b\n"
+    "ld1b { z17.b }, p1/Z, [x9, x25]\n"
+    "zip2 z20.b, z20.b, z16.b\n"
+    "ld1b { z16.b }, p1/Z, [x28, x25]\n"
+    "zip1 z1.b, z19.b, z18.b\n"
+    "ldp x9, x28, [%x[inptrs], #0x10]\n"
+    "zip2 z0.b, z19.b, z18.b\n"
+    "ld1b { z19.b }, p1/Z, [x27, x25]\n"
+    "zip1 z26.b, z22.b, z17.b\n"
+    "ld1b { z25.b }, p1/Z, [x26, x25]\n"
+    "zip2 z24.b, z22.b, z17.b\n"
+    "ldp x27, x26, [%x[inptrs], #0x20]\n"
+    "zip1 z23.b, z21.b, z16.b\n"
+    "ld1b { z18.b }, p1/Z, [x24, x25]\n"
+    "zip2 z22.b, z21.b, z16.b\n"
+    "ld1b { z21.b }, p1/Z, [x23, x25]\n"
+    "zip1 z17.b, z27.b, z20.b\n"
+    "ldp x24, x23, [%x[inptrs], #0x30]\n"
+    "zip2 z16.b, z27.b, z20.b\n"
+    "st1b { z29.b }, p2, [SP]\n"
+    "zip1 z20.b, z19.b, z18.b\n"
+    "st1b { z28.b }, p2, [SP, #1, MUL VL]\n"
+    "zip2 z19.b, z19.b, z18.b\n"
+    "st1b { z17.b }, p2, [SP, #2, MUL VL]\n"
+    "zip1 z18.b, z25.b, z21.b\n"
+    "st1b { z16.b }, p2, [SP, #3, MUL VL]\n"
+    "zip2 z17.b, z25.b, z21.b\n"
+    "ld1w { z31.s }, p2/Z, [%x[params]]\n"
+    "zip1 z30.b, z26.b, z23.b\n"
+    "ld1b { z29.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "zip2 z28.b, z26.b, z23.b\n"
+    "ld1b { z27.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "zip1 z16.b, z24.b, z22.b\n"
+    "st1b { z16.b }, p2, [SP, #4, MUL VL]\n"
+    "zip2 z16.b, z24.b, z22.b\n"
+    "st1b { z16.b }, p2, [SP, #5, MUL VL]\n"
+    "zip1 z26.b, z20.b, z18.b\n"
+    "ld1b { z25.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "zip2 z24.b, z20.b, z18.b\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "zip1 z16.b, z19.b, z17.b\n"
+    "st1b { z16.b }, p2, [SP, #6, MUL VL]\n"
+    "zip2 z16.b, z19.b, z17.b\n"
+    "st1b { z16.b }, p2, [SP, #7, MUL VL]\n"
+    "mov z22.d, z31.d\n"
+    "ld1w { z21.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "mov z20.d, z31.d\n"
+    "mov z19.d, z31.d\n"
+    "sdot z31.s, z29.b, z3.b\n"
+    "sdot z20.s, z29.b, z1.b\n"
+    "ext z3.b, z3.b, z3.b, #0x1\n"
+    "sdot z31.s, z27.b, z1.b\n"
+    "ext z1.b, z1.b, z1.b, #0x1\n"
+    "sdot z20.s, z27.b, z30.b\n"
+    "sdot z22.s, z29.b, z3.b\n"
+    "ld1b { z3.b }, p2/Z, [SP]\n"
+    "sdot z31.s, z25.b, z30.b\n"
+    "ext z30.b, z30.b, z30.b, #0x1\n"
+    "sdot z20.s, z25.b, z26.b\n"
+    "ext z26.b, z26.b, z26.b, #0x1\n"
+    "sdot z19.s, z29.b, z1.b\n"
+    "ld1b { z29.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "sdot z22.s, z27.b, z1.b\n"
+    "ld1b { z1.b }, p2/Z, [SP, #2, MUL VL]\n"
+    ".inst 0x04b777ff  // sqrdmulh z31.s, z31.s, z23.s\n"
+    ".inst 0x04b77694  // sqrdmulh z20.s, z20.s, z23.s\n"
+    "sdot z19.s, z27.b, z30.b\n"
+    "sdot z22.s, z25.b, z30.b\n"
+    "ld1b { z30.b }, p2/Z, [SP, #4, MUL VL]\n"
+    "and z16.d, z31.d, z21.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sdot z19.s, z25.b, z26.b\n"
+    "ld1b { z26.b }, p2/Z, [SP, #6, MUL VL]\n"
+    ".inst 0x04b776d6  // sqrdmulh z22.s, z22.s, z23.s\n"
+    "and z18.d, z20.d, z21.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    ".inst 0x04b77673  // sqrdmulh z19.s, z19.s, z23.s\n"
+    "sqadd z31.s, z31.s, z16.s\n"
+    "and z17.d, z22.d, z21.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "and z16.d, z19.d, z21.d\n"
+    "sqadd z20.s, z20.s, z18.s\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x44828abf  // srshl z31.s, p2/M, z31.s, z21.s\n"
+    "sqadd z22.s, z22.s, z17.s\n"
+    ".inst 0x44828ab4  // srshl z20.s, p2/M, z20.s, z21.s\n"
+    "add z31.s, z31.s, z4.s\n"
+    "sqadd z19.s, z19.s, z16.s\n"
+    "add z20.s, z20.s, z4.s\n"
+    ".inst 0x44828ab6  // srshl z22.s, p2/M, z22.s, z21.s\n"
+    "smax z31.s, p2/M, z31.s, z6.s\n"
+    "smax z20.s, p2/M, z20.s, z6.s\n"
+    ".inst 0x44828ab3  // srshl z19.s, p2/M, z19.s, z21.s\n"
+    "add z22.s, z22.s, z4.s\n"
+    "smin z31.s, p2/M, z31.s, z5.s\n"
+    "st1b { z31.s }, p0, [x22, x25]\n"
+    "add z19.s, z19.s, z4.s\n"
+    "smax z22.s, p2/M, z22.s, z6.s\n"
+    "ld1w { z31.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "smin z20.s, p2/M, z20.s, z5.s\n"
+    "ld1b { z27.b }, p2/Z, [%x[params], #-8, MUL VL]\n"
+    "ld1b { z25.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+    "smax z19.s, p2/M, z19.s, z6.s\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
+    "smin z22.s, p2/M, z22.s, z5.s\n"
+    "ld1w { z21.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
+    "smin z19.s, p2/M, z19.s, z5.s\n"
+    "st1b { z20.s }, p0, [x20, x25]\n"
+    "mov z20.d, z31.d\n"
+    "st1b { z22.s }, p0, [x21, x25]\n"
+    "mov z22.d, z31.d\n"
+    "st1b { z19.s }, p0, [x19, x25]\n"
+    "mov z19.d, z31.d\n"
+    "incw x25\n"
+    "sdot z31.s, z29.b, z2.b\n"
+    "whilelt p0.s, x25, %x[n_channels]\n"
+    "sdot z20.s, z29.b, z0.b\n"
+    "ext z2.b, z2.b, z2.b, #0x1\n"
+    "sdot z31.s, z27.b, z0.b\n"
+    "sdot z20.s, z27.b, z28.b\n"
+    "ext z0.b, z0.b, z0.b, #0x1\n"
+    "sdot z22.s, z29.b, z2.b\n"
+    "ld1b { z2.b }, p2/Z, [SP, #1, MUL VL]\n"
+    "sdot z31.s, z25.b, z28.b\n"
+    "sdot z20.s, z25.b, z24.b\n"
+    "ext z28.b, z28.b, z28.b, #0x1\n"
+    "ext z24.b, z24.b, z24.b, #0x1\n"
+    "sdot z19.s, z29.b, z0.b\n"
+    "ld1b { z29.b }, p2/Z, [%x[params], #-3, MUL VL]\n"
+    "sdot z22.s, z27.b, z0.b\n"
+    "ld1b { z0.b }, p2/Z, [SP, #3, MUL VL]\n"
+    ".inst 0x04b777ff  // sqrdmulh z31.s, z31.s, z23.s\n"
+    ".inst 0x04b77694  // sqrdmulh z20.s, z20.s, z23.s\n"
+    "sdot z19.s, z27.b, z28.b\n"
+    "ld1b { z27.b }, p2/Z, [%x[params], #-2, MUL VL]\n"
+    "sdot z22.s, z25.b, z28.b\n"
+    "ld1b { z28.b }, p2/Z, [SP, #5, MUL VL]\n"
+    "and z16.d, z31.d, z21.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sdot z19.s, z25.b, z24.b\n"
+    "ld1b { z25.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+    ".inst 0x04b776d6  // sqrdmulh z22.s, z22.s, z23.s\n"
+    "ld1b { z24.b }, p2/Z, [SP, #7, MUL VL]\n"
+    "and z18.d, z20.d, z21.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    ".inst 0x04b77673  // sqrdmulh z19.s, z19.s, z23.s\n"
+    "ld1w { z23.s }, p2/Z, [%x[params]]\n"
+    "sqadd z31.s, z31.s, z16.s\n"
+    "and z17.d, z22.d, z21.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "and z16.d, z19.d, z21.d\n"
+    "sqadd z20.s, z20.s, z18.s\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x44828abf  // srshl z31.s, p2/M, z31.s, z21.s\n"
+    "sqadd z22.s, z22.s, z17.s\n"
+    ".inst 0x44828ab4  // srshl z20.s, p2/M, z20.s, z21.s\n"
+    "add z31.s, z31.s, z4.s\n"
+    "sqadd z19.s, z19.s, z16.s\n"
+    "add z20.s, z20.s, z4.s\n"
+    ".inst 0x44828ab6  // srshl z22.s, p2/M, z22.s, z21.s\n"
+    "smax z31.s, p2/M, z31.s, z6.s\n"
+    "smax z20.s, p2/M, z20.s, z6.s\n"
+    ".inst 0x44828ab3  // srshl z19.s, p2/M, z19.s, z21.s\n"
+    "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "add z22.s, z22.s, z4.s\n"
+    "smin z31.s, p2/M, z31.s, z5.s\n"
+    "st1b { z31.s }, p0, [x22, x25]\n"
+    "add z19.s, z19.s, z4.s\n"
+    "smax z22.s, p2/M, z22.s, z6.s\n"
+    "ld1w { z31.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+    "smin z20.s, p2/M, z20.s, z5.s\n"
+    "st1b { z20.s }, p0, [x20, x25]\n"
+    "mov z20.d, z31.d\n"
+    "smin z22.s, p2/M, z22.s, z5.s\n"
+    "st1b { z22.s }, p0, [x21, x25]\n"
+    "mov z22.d, z31.d\n"
+    "sdot z20.s, z29.b, z1.b\n"
+    "smax z19.s, p2/M, z19.s, z6.s\n"
+    "sdot z20.s, z27.b, z30.b\n"
+    "smin z19.s, p2/M, z19.s, z5.s\n"
+    "st1b { z19.s }, p0, [x19, x25]\n"
+    "mov z19.d, z31.d\n"
+    "incw x25\n"
+    "sdot z31.s, z29.b, z3.b\n"
+    "whilelt p0.s, x25, %x[n_channels]\n"
+    "sdot z20.s, z25.b, z26.b\n"
+    "ext z3.b, z3.b, z3.b, #0x1\n"
+    "ext z26.b, z26.b, z26.b, #0x1\n"
+    "sdot z31.s, z27.b, z1.b\n"
+    "ext z1.b, z1.b, z1.b, #0x1\n"
+    "sdot z22.s, z29.b, z3.b\n"
+    ".inst 0x04b77694  // sqrdmulh z20.s, z20.s, z23.s\n"
+    "sdot z31.s, z25.b, z30.b\n"
+    "ext z30.b, z30.b, z30.b, #0x1\n"
+    "sdot z19.s, z29.b, z1.b\n"
+    "ld1b { z29.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "sdot z22.s, z27.b, z1.b\n"
+    "and z18.d, z20.d, z21.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sdot z19.s, z27.b, z30.b\n"
+    "ld1b { z27.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "sdot z22.s, z25.b, z30.b\n"
+    ".inst 0x04b777ff  // sqrdmulh z31.s, z31.s, z23.s\n"
+    "sdot z19.s, z25.b, z26.b\n"
+    "ld1b { z25.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "and z16.d, z31.d, z21.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x04b776d6  // sqrdmulh z22.s, z22.s, z23.s\n"
+    "sqadd z20.s, z20.s, z18.s\n"
+    ".inst 0x04b77673  // sqrdmulh z19.s, z19.s, z23.s\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "and z17.d, z22.d, z21.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "sqadd z31.s, z31.s, z16.s\n"
+    "and z16.d, z19.d, z21.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x44828ab4  // srshl z20.s, p2/M, z20.s, z21.s\n"
+    ".inst 0x44828abf  // srshl z31.s, p2/M, z31.s, z21.s\n"
+    "sqadd z22.s, z22.s, z17.s\n"
+    "add z20.s, z20.s, z4.s\n"
+    "add z31.s, z31.s, z4.s\n"
+    "sqadd z19.s, z19.s, z16.s\n"
+    ".inst 0x44828ab6  // srshl z22.s, p2/M, z22.s, z21.s\n"
+    "smax z20.s, p2/M, z20.s, z6.s\n"
+    "smax z31.s, p2/M, z31.s, z6.s\n"
+    ".inst 0x44828ab3  // srshl z19.s, p2/M, z19.s, z21.s\n"
+    "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "add z22.s, z22.s, z4.s\n"
+    "smin z20.s, p2/M, z20.s, z5.s\n"
+    "st1b { z20.s }, p0, [x20, x25]\n"
+    "add z19.s, z19.s, z4.s\n"
+    "smin z31.s, p2/M, z31.s, z5.s\n"
+    "st1b { z31.s }, p0, [x22, x25]\n"
+    "smax z22.s, p2/M, z22.s, z6.s\n"
+    "smax z19.s, p2/M, z19.s, z6.s\n"
+    "ld1w { z31.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "addvl %x[params], %x[params], #8\n"
+    "mov z20.d, z31.d\n"
+    "smin z22.s, p2/M, z22.s, z5.s\n"
+    "st1b { z22.s }, p0, [x21, x25]\n"
+    "mov z22.d, z31.d\n"
+    "sdot z20.s, z29.b, z0.b\n"
+    "smin z19.s, p2/M, z19.s, z5.s\n"
+    "st1b { z19.s }, p0, [x19, x25]\n"
+    "mov z19.d, z31.d\n"
+    "incw x25\n"
+    "sdot z31.s, z29.b, z2.b\n"
+    "whilelt p0.s, x25, %x[n_channels]\n"
+    "sdot z20.s, z27.b, z28.b\n"
+    "ext z2.b, z2.b, z2.b, #0x1\n"
+    "sdot z31.s, z27.b, z0.b\n"
+    "sdot z20.s, z25.b, z24.b\n"
+    "ext z0.b, z0.b, z0.b, #0x1\n"
+    "ext z24.b, z24.b, z24.b, #0x1\n"
+    "sdot z22.s, z29.b, z2.b\n"
+    "sdot z31.s, z25.b, z28.b\n"
+    "ext z28.b, z28.b, z28.b, #0x1\n"
+    "sdot z19.s, z29.b, z0.b\n"
+    "sdot z22.s, z27.b, z0.b\n"
+    ".inst 0x04b777ff  // sqrdmulh z31.s, z31.s, z23.s\n"
+    ".inst 0x04b77694  // sqrdmulh z20.s, z20.s, z23.s\n"
+    "sdot z19.s, z27.b, z28.b\n"
+    "sdot z22.s, z25.b, z28.b\n"
+    "and z16.d, z31.d, z21.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sdot z19.s, z25.b, z24.b\n"
+    ".inst 0x04b776d6  // sqrdmulh z22.s, z22.s, z23.s\n"
+    "and z18.d, z20.d, z21.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "and z17.d, z22.d, z21.d\n"
+    ".inst 0x04b77673  // sqrdmulh z19.s, z19.s, z23.s\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "sqadd z31.s, z31.s, z16.s\n"
+    "and z16.d, z19.d, z21.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z20.s, z20.s, z18.s\n"
+    ".inst 0x44828abf  // srshl z31.s, p2/M, z31.s, z21.s\n"
+    "sqadd z22.s, z22.s, z17.s\n"
+    "add z31.s, z31.s, z4.s\n"
+    ".inst 0x44828ab4  // srshl z20.s, p2/M, z20.s, z21.s\n"
+    "sqadd z19.s, z19.s, z16.s\n"
+    ".inst 0x44828ab6  // srshl z22.s, p2/M, z22.s, z21.s\n"
+    "smax z31.s, p2/M, z31.s, z6.s\n"
+    "add z20.s, z20.s, z4.s\n"
+    ".inst 0x44828ab3  // srshl z19.s, p2/M, z19.s, z21.s\n"
+    "add z22.s, z22.s, z4.s\n"
+    "smin z31.s, p2/M, z31.s, z5.s\n"
+    "st1b { z31.s }, p0, [x22, x25]\n"
+    "add z19.s, z19.s, z4.s\n"
+    "smax z22.s, p2/M, z22.s, z6.s\n"
+    "smax z20.s, p2/M, z20.s, z6.s\n"
+    "smax z19.s, p2/M, z19.s, z6.s\n"
+    "smin z22.s, p2/M, z22.s, z5.s\n"
+    "st1b { z22.s }, p0, [x21, x25]\n"
+    "smin z20.s, p2/M, z20.s, z5.s\n"
+    "smin z19.s, p2/M, z19.s, z5.s\n"
+    "st1b { z20.s }, p0, [x20, x25]\n"
+    "st1b { z19.s }, p0, [x19, x25]\n"
+    "incw x25\n"
+    "whilelt p1.b, x25, %x[n_channels]\n"
+    "b.any 1b\n"
+    "addvl SP, SP, #8\n"
+    : [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
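The store path of the kernel above is the standard fixed-point requantization: `sqrdmulh` (saturating rounding doubling high multiply) against the per-channel multiplier, an `and`/`asr`/`sqadd` fix-up plus `srshl` for the rounding right shift, addition of `c_offset`, and a clamp to `[minval, maxval]`. A scalar sketch of one lane follows (helper names are hypothetical; the sign fix-up is approximated here by a plain round-half-up shift, and the shift is expressed as a positive right-shift amount where the kernel stores it negated for `srshl`):

```cpp
#include <algorithm>
#include <cstdint>

// Saturating rounding doubling multiply returning the high half:
// sat((2*a*b + 2^31) >> 32), equivalently sat((a*b + 2^30) >> 31).
static int32_t sqrdmulh32(int32_t a, int32_t b)
{
    const int64_t v = ((int64_t)a * (int64_t)b + (1LL << 30)) >> 31;
    return (int32_t)std::min<int64_t>(std::max<int64_t>(v, INT32_MIN), INT32_MAX);
}

// Scalar model of the per-lane requantization done by the
// "sqrdmulh; and/asr/sqadd; srshl; add c_offset; smax/smin" sequence.
static int8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                              int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = sqrdmulh32(acc, mul);
    if (shift > 0)
    {
        // Rounding right shift (the assembly's fix-up adjusts negative
        // values so the srshl rounds half away from zero).
        v = (int32_t)(((int64_t)v + (1LL << (shift - 1))) >> shift);
    }
    v += c_offset;           // add z4.s (c_offset)
    v = std::max(v, minval); // smax against z6.s (minval)
    v = std::min(v, maxval); // smin against z5.s (maxval)
    return (int8_t)v;
}
```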
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000..72b26a5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_dot::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_dot::get_packed_size;
+
+  kern_type kernel = sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+  sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..ca6af57
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *const inptrs, uint8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
+  __asm__ __volatile__(
+    "ldp x11, x10, [%x[inptrs], #0x0]\n"
+    "ptrue p2.b\n"
+    "ldp x9, x28, [%x[inptrs], #0x10]\n"
+    "addvl SP, SP, #-8\n"
+    "ldp x27, x26, [%x[inptrs], #0x20]\n"
+    "mov x19, #0x1\n"
+    "ldp x25, x24, [%x[inptrs], #0x30]\n"
+    "orr x19, x19, #0x100\n"
+    "ldp x23, x22, [%x[outptrs], #0x0]\n"
+    "orr x19, x19, #0x10000\n"
+    "dup z12.s, w19\n"
+    "ldp x21, x20, [%x[outptrs], #0x10]\n"
+    "mov x19, #0x0\n"
+    "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+    "whilelt p1.b, x19, %x[n_channels]\n"
+    "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+    "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+    "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+    "1:"  // Loop
+    "mov z7.s, #0x0\n"
+    "ld1b { z19.b }, p1/Z, [x11, x19]\n"
+    "whilelt p0.s, x19, %x[n_channels]\n"
+    "mov z6.s, #0x0\n"
+    "ld1b { z18.b }, p1/Z, [x10, x19]\n"
+    "ldp x11, x10, [%x[inptrs], #0x40]\n"
+    "ld1b { z16.b }, p1/Z, [x9, x19]\n"
+    "zip1 z21.b, z19.b, z16.b\n"
+    "ld1b { z17.b }, p1/Z, [x28, x19]\n"
+    "zip2 z19.b, z19.b, z16.b\n"
+    "ldp x9, x28, [%x[inptrs], #0x50]\n"
+    "ld1b { z23.b }, p1/Z, [x27, x19]\n"
+    "zip1 z16.b, z18.b, z17.b\n"
+    "ld1b { z20.b }, p1/Z, [x26, x19]\n"
+    "zip2 z18.b, z18.b, z17.b\n"
+    "ldp x27, x26, [%x[inptrs], #0x60]\n"
+    "zip1 z5.b, z21.b, z16.b\n"
+    "ld1b { z17.b }, p1/Z, [x25, x19]\n"
+    "zip2 z4.b, z21.b, z16.b\n"
+    "ld1b { z16.b }, p1/Z, [x24, x19]\n"
+    "zip1 z29.b, z19.b, z18.b\n"
+    "ldp x25, x24, [%x[inptrs], #0x70]\n"
+    "zip2 z28.b, z19.b, z18.b\n"
+    "ld1b { z22.b }, p1/Z, [x11, x19]\n"
+    "zip1 z19.b, z23.b, z17.b\n"
+    "ld1b { z21.b }, p1/Z, [x10, x19]\n"
+    "zip2 z27.b, z23.b, z17.b\n"
+    "ldp x11, x10, [%x[inptrs], #0x0]\n"
+    "zip1 z18.b, z20.b, z16.b\n"
+    "ld1b { z17.b }, p1/Z, [x9, x19]\n"
+    "zip2 z20.b, z20.b, z16.b\n"
+    "ld1b { z16.b }, p1/Z, [x28, x19]\n"
+    "zip1 z3.b, z19.b, z18.b\n"
+    "ldp x9, x28, [%x[inptrs], #0x10]\n"
+    "zip2 z2.b, z19.b, z18.b\n"
+    "ld1b { z19.b }, p1/Z, [x27, x19]\n"
+    "zip1 z26.b, z22.b, z17.b\n"
+    "ld1b { z25.b }, p1/Z, [x26, x19]\n"
+    "zip2 z24.b, z22.b, z17.b\n"
+    "ldp x27, x26, [%x[inptrs], #0x20]\n"
+    "zip1 z23.b, z21.b, z16.b\n"
+    "ld1b { z18.b }, p1/Z, [x25, x19]\n"
+    "zip2 z22.b, z21.b, z16.b\n"
+    "ld1b { z21.b }, p1/Z, [x24, x19]\n"
+    "zip1 z17.b, z27.b, z20.b\n"
+    "ldp x25, x24, [%x[inptrs], #0x30]\n"
+    "zip2 z16.b, z27.b, z20.b\n"
+    "st1b { z29.b }, p2, [SP]\n"
+    "zip1 z20.b, z19.b, z18.b\n"
+    "st1b { z28.b }, p2, [SP, #1, MUL VL]\n"
+    "zip2 z19.b, z19.b, z18.b\n"
+    "st1b { z17.b }, p2, [SP, #2, MUL VL]\n"
+    "zip1 z18.b, z25.b, z21.b\n"
+    "st1b { z16.b }, p2, [SP, #3, MUL VL]\n"
+    "zip2 z17.b, z25.b, z21.b\n"
+    "ld1w { z1.s }, p2/Z, [%x[params]]\n"
+    "zip1 z0.b, z26.b, z23.b\n"
+    "ld1b { z31.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "zip2 z30.b, z26.b, z23.b\n"
+    "ld1b { z29.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "zip1 z16.b, z24.b, z22.b\n"
+    "st1b { z16.b }, p2, [SP, #4, MUL VL]\n"
+    "zip2 z16.b, z24.b, z22.b\n"
+    "st1b { z16.b }, p2, [SP, #5, MUL VL]\n"
+    "zip1 z28.b, z20.b, z18.b\n"
+    "ld1b { z27.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "zip2 z26.b, z20.b, z18.b\n"
+    "ld1w { z25.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "zip1 z16.b, z19.b, z17.b\n"
+    "st1b { z16.b }, p2, [SP, #6, MUL VL]\n"
+    "zip2 z16.b, z19.b, z17.b\n"
+    "st1b { z16.b }, p2, [SP, #7, MUL VL]\n"
+    "mov z24.d, z1.d\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "mov z22.d, z1.d\n"
+    "mov z21.d, z1.d\n"
+    "udot z1.s, z31.b, z5.b\n"
+    "udot z22.s, z31.b, z3.b\n"
+    "udot z7.s, z12.b, z3.b\n"
+    "udot z1.s, z29.b, z3.b\n"
+    "ext z3.b, z3.b, z3.b, #0x1\n"
+    "udot z22.s, z29.b, z0.b\n"
+    "udot z7.s, z12.b, z0.b\n"
+    "udot z1.s, z27.b, z0.b\n"
+    "ext z0.b, z0.b, z0.b, #0x1\n"
+    "udot z22.s, z27.b, z28.b\n"
+    "mov z20.d, z7.d\n"
+    "udot z7.s, z12.b, z5.b\n"
+    "udot z20.s, z12.b, z28.b\n"
+    "ext z5.b, z5.b, z5.b, #0x1\n"
+    "ext z28.b, z28.b, z28.b, #0x1\n"
+    "udot z21.s, z31.b, z3.b\n"
+    "udot z6.s, z12.b, z3.b\n"
+    "udot z24.s, z31.b, z5.b\n"
+    "ld1b { z31.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "mls z1.s, p2/M, z7.s, z9.s\n"
+    "udot z21.s, z29.b, z0.b\n"
+    "udot z6.s, z12.b, z0.b\n"
+    "udot z24.s, z29.b, z3.b\n"
+    "ld1b { z3.b }, p2/Z, [SP, #2, MUL VL]\n"
+    ".inst 0x04b97421  // sqrdmulh z1.s, z1.s, z25.s\n"
+    "udot z21.s, z27.b, z28.b\n"
+    "mov z19.d, z6.d\n"
+    "udot z24.s, z27.b, z0.b\n"
+    "ld1b { z0.b }, p2/Z, [SP, #4, MUL VL]\n"
+    "udot z6.s, z12.b, z5.b\n"
+    "ld1b { z5.b }, p2/Z, [SP]\n"
+    "udot z19.s, z12.b, z28.b\n"
+    "ld1b { z28.b }, p2/Z, [SP, #6, MUL VL]\n"
+    "and z16.d, z1.d, z23.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "mov z7.s, #0x0\n"
+    "mls z24.s, p2/M, z6.s, z9.s\n"
+    "udot z7.s, z12.b, z2.b\n"
+    "mov z6.s, #0x0\n"
+    "mls z22.s, p2/M, z20.s, z9.s\n"
+    ".inst 0x04b97718  // sqrdmulh z24.s, z24.s, z25.s\n"
+    "sqadd z1.s, z1.s, z16.s\n"
+    "udot z7.s, z12.b, z30.b\n"
+    ".inst 0x04b976d6  // sqrdmulh z22.s, z22.s, z25.s\n"
+    "and z18.d, z24.d, z23.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "and z17.d, z22.d, z23.d\n"
+    "mov z20.d, z7.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "udot z7.s, z12.b, z4.b\n"
+    "udot z20.s, z12.b, z26.b\n"
+    "mls z21.s, p2/M, z19.s, z9.s\n"
+    "sqadd z24.s, z24.s, z18.s\n"
+    ".inst 0x44828ae1  // srshl z1.s, p2/M, z1.s, z23.s\n"
+    "sqadd z22.s, z22.s, z17.s\n"
+    ".inst 0x04b976b5  // sqrdmulh z21.s, z21.s, z25.s\n"
+    ".inst 0x44828af8  // srshl z24.s, p2/M, z24.s, z23.s\n"
+    "add z1.s, z1.s, z8.s\n"
+    "and z16.d, z21.d, z23.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "add z24.s, z24.s, z8.s\n"
+    "smax z1.s, p2/M, z1.s, z11.s\n"
+    ".inst 0x44828af6  // srshl z22.s, p2/M, z22.s, z23.s\n"
+    "smax z24.s, p2/M, z24.s, z11.s\n"
+    "smin z1.s, p2/M, z1.s, z10.s\n"
+    "st1b { z1.s }, p0, [x23, x19]\n"
+    "add z22.s, z22.s, z8.s\n"
+    "sqadd z21.s, z21.s, z16.s\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "smin z24.s, p2/M, z24.s, z10.s\n"
+    "ld1b { z29.b }, p2/Z, [%x[params], #-8, MUL VL]\n"
+    "ld1b { z27.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+    "smax z22.s, p2/M, z22.s, z11.s\n"
+    "ld1w { z25.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
+    ".inst 0x44828af5  // srshl z21.s, p2/M, z21.s, z23.s\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
+    "smin z22.s, p2/M, z22.s, z10.s\n"
+    "st1b { z24.s }, p0, [x22, x19]\n"
+    "mov z24.d, z1.d\n"
+    "st1b { z22.s }, p0, [x21, x19]\n"
+    "add z21.s, z21.s, z8.s\n"
+    "mov z22.d, z1.d\n"
+    "udot z22.s, z31.b, z2.b\n"
+    "smax z21.s, p2/M, z21.s, z11.s\n"
+    "udot z22.s, z29.b, z30.b\n"
+    "smin z21.s, p2/M, z21.s, z10.s\n"
+    "st1b { z21.s }, p0, [x20, x19]\n"
+    "mov z21.d, z1.d\n"
+    "incw x19\n"
+    "udot z1.s, z31.b, z4.b\n"
+    "whilelt p0.s, x19, %x[n_channels]\n"
+    "udot z22.s, z27.b, z26.b\n"
+    "ext z4.b, z4.b, z4.b, #0x1\n"
+    "ext z26.b, z26.b, z26.b, #0x1\n"
+    "udot z1.s, z29.b, z2.b\n"
+    "ext z2.b, z2.b, z2.b, #0x1\n"
+    "udot z24.s, z31.b, z4.b\n"
+    "mls z22.s, p2/M, z20.s, z9.s\n"
+    "udot z1.s, z27.b, z30.b\n"
+    "ext z30.b, z30.b, z30.b, #0x1\n"
+    "udot z21.s, z31.b, z2.b\n"
+    "ld1b { z31.b }, p2/Z, [%x[params], #-3, MUL VL]\n"
+    "udot z24.s, z29.b, z2.b\n"
+    "udot z6.s, z12.b, z2.b\n"
+    "ld1b { z2.b }, p2/Z, [SP, #3, MUL VL]\n"
+    ".inst 0x04b976d6  // sqrdmulh z22.s, z22.s, z25.s\n"
+    "udot z21.s, z29.b, z30.b\n"
+    "ld1b { z29.b }, p2/Z, [%x[params], #-2, MUL VL]\n"
+    "udot z24.s, z27.b, z30.b\n"
+    "udot z6.s, z12.b, z30.b\n"
+    "ld1b { z30.b }, p2/Z, [SP, #5, MUL VL]\n"
+    "and z17.d, z22.d, z23.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "udot z21.s, z27.b, z26.b\n"
+    "ld1b { z27.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+    "mov z19.d, z6.d\n"
+    "udot z6.s, z12.b, z4.b\n"
+    "ld1b { z4.b }, p2/Z, [SP, #1, MUL VL]\n"
+    "udot z19.s, z12.b, z26.b\n"
+    "ld1b { z26.b }, p2/Z, [SP, #7, MUL VL]\n"
+    "mls z1.s, p2/M, z7.s, z9.s\n"
+    "mov z7.s, #0x0\n"
+    "sqadd z22.s, z22.s, z17.s\n"
+    "udot z7.s, z12.b, z3.b\n"
+    ".inst 0x04b97421  // sqrdmulh z1.s, z1.s, z25.s\n"
+    "mls z24.s, p2/M, z6.s, z9.s\n"
+    "mov z6.s, #0x0\n"
+    "udot z7.s, z12.b, z0.b\n"
+    "and z16.d, z1.d, z23.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x04b97718  // sqrdmulh z24.s, z24.s, z25.s\n"
+    "mov z20.d, z7.d\n"
+    "udot z7.s, z12.b, z5.b\n"
+    "udot z20.s, z12.b, z28.b\n"
+    "mls z21.s, p2/M, z19.s, z9.s\n"
+    "and z18.d, z24.d, z23.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sqadd z1.s, z1.s, z16.s\n"
+    ".inst 0x04b976b5  // sqrdmulh z21.s, z21.s, z25.s\n"
+    "ld1w { z25.s }, p2/Z, [%x[params]]\n"
+    ".inst 0x44828af6  // srshl z22.s, p2/M, z22.s, z23.s\n"
+    "and z16.d, z21.d, z23.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z24.s, z24.s, z18.s\n"
+    "add z22.s, z22.s, z8.s\n"
+    ".inst 0x44828ae1  // srshl z1.s, p2/M, z1.s, z23.s\n"
+    "smax z22.s, p2/M, z22.s, z11.s\n"
+    ".inst 0x44828af8  // srshl z24.s, p2/M, z24.s, z23.s\n"
+    "add z1.s, z1.s, z8.s\n"
+    "sqadd z21.s, z21.s, z16.s\n"
+    "smin z22.s, p2/M, z22.s, z10.s\n"
+    "st1b { z22.s }, p0, [x21, x19]\n"
+    "add z24.s, z24.s, z8.s\n"
+    "smax z1.s, p2/M, z1.s, z11.s\n"
+    ".inst 0x44828af5  // srshl z21.s, p2/M, z21.s, z23.s\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "smax z24.s, p2/M, z24.s, z11.s\n"
+    "smin z1.s, p2/M, z1.s, z10.s\n"
+    "st1b { z1.s }, p0, [x23, x19]\n"
+    "add z21.s, z21.s, z8.s\n"
+    "smin z24.s, p2/M, z24.s, z10.s\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+    "smax z21.s, p2/M, z21.s, z11.s\n"
+    "st1b { z24.s }, p0, [x22, x19]\n"
+    "mov z24.d, z1.d\n"
+    "mov z22.d, z1.d\n"
+    "udot z22.s, z31.b, z3.b\n"
+    "smin z21.s, p2/M, z21.s, z10.s\n"
+    "st1b { z21.s }, p0, [x20, x19]\n"
+    "mov z21.d, z1.d\n"
+    "incw x19\n"
+    "udot z1.s, z31.b, z5.b\n"
+    "whilelt p0.s, x19, %x[n_channels]\n"
+    "udot z22.s, z29.b, z0.b\n"
+    "ext z5.b, z5.b, z5.b, #0x1\n"
+    "udot z1.s, z29.b, z3.b\n"
+    "udot z22.s, z27.b, z28.b\n"
+    "ext z3.b, z3.b, z3.b, #0x1\n"
+    "ext z28.b, z28.b, z28.b, #0x1\n"
+    "udot z24.s, z31.b, z5.b\n"
+    "udot z1.s, z27.b, z0.b\n"
+    "ext z0.b, z0.b, z0.b, #0x1\n"
+    "udot z21.s, z31.b, z3.b\n"
+    "ld1b { z31.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "udot z24.s, z29.b, z3.b\n"
+    "udot z6.s, z12.b, z3.b\n"
+    "mls z1.s, p2/M, z7.s, z9.s\n"
+    "udot z21.s, z29.b, z0.b\n"
+    "ld1b { z29.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "udot z24.s, z27.b, z0.b\n"
+    "udot z6.s, z12.b, z0.b\n"
+    ".inst 0x04b97421  // sqrdmulh z1.s, z1.s, z25.s\n"
+    "udot z21.s, z27.b, z28.b\n"
+    "ld1b { z27.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "mov z7.s, #0x0\n"
+    "mov z19.d, z6.d\n"
+    "udot z6.s, z12.b, z5.b\n"
+    "udot z19.s, z12.b, z28.b\n"
+    "and z16.d, z1.d, z23.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "udot z7.s, z12.b, z2.b\n"
+    "mls z24.s, p2/M, z6.s, z9.s\n"
+    "mov z6.s, #0x0\n"
+    "mls z22.s, p2/M, z20.s, z9.s\n"
+    "mls z21.s, p2/M, z19.s, z9.s\n"
+    ".inst 0x04b97718  // sqrdmulh z24.s, z24.s, z25.s\n"
+    "sqadd z1.s, z1.s, z16.s\n"
+    ".inst 0x04b976d6  // sqrdmulh z22.s, z22.s, z25.s\n"
+    ".inst 0x04b976b5  // sqrdmulh z21.s, z21.s, z25.s\n"
+    "ld1w { z25.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "and z18.d, z24.d, z23.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "and z17.d, z22.d, z23.d\n"
+    "and z16.d, z21.d, z23.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "udot z7.s, z12.b, z30.b\n"
+    ".inst 0x44828ae1  // srshl z1.s, p2/M, z1.s, z23.s\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z24.s, z24.s, z18.s\n"
+    "add z1.s, z1.s, z8.s\n"
+    "mov z20.d, z7.d\n"
+    "sqadd z22.s, z22.s, z17.s\n"
+    "sqadd z21.s, z21.s, z16.s\n"
+    "udot z7.s, z12.b, z4.b\n"
+    "udot z20.s, z12.b, z26.b\n"
+    "smax z1.s, p2/M, z1.s, z11.s\n"
+    ".inst 0x44828af8  // srshl z24.s, p2/M, z24.s, z23.s\n"
+    ".inst 0x44828af6  // srshl z22.s, p2/M, z22.s, z23.s\n"
+    ".inst 0x44828af5  // srshl z21.s, p2/M, z21.s, z23.s\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "smin z1.s, p2/M, z1.s, z10.s\n"
+    "st1b { z1.s }, p0, [x23, x19]\n"
+    "add z24.s, z24.s, z8.s\n"
+    "add z22.s, z22.s, z8.s\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "addvl %x[params], %x[params], #8\n"
+    "add z21.s, z21.s, z8.s\n"
+    "smax z24.s, p2/M, z24.s, z11.s\n"
+    "smax z22.s, p2/M, z22.s, z11.s\n"
+    "smax z21.s, p2/M, z21.s, z11.s\n"
+    "smin z24.s, p2/M, z24.s, z10.s\n"
+    "st1b { z24.s }, p0, [x22, x19]\n"
+    "mov z24.d, z1.d\n"
+    "smin z22.s, p2/M, z22.s, z10.s\n"
+    "st1b { z22.s }, p0, [x21, x19]\n"
+    "mov z22.d, z1.d\n"
+    "smin z21.s, p2/M, z21.s, z10.s\n"
+    "st1b { z21.s }, p0, [x20, x19]\n"
+    "mov z21.d, z1.d\n"
+    "incw x19\n"
+    "udot z1.s, z31.b, z4.b\n"
+    "whilelt p0.s, x19, %x[n_channels]\n"
+    "udot z22.s, z31.b, z2.b\n"
+    "ext z4.b, z4.b, z4.b, #0x1\n"
+    "udot z1.s, z29.b, z2.b\n"
+    "udot z22.s, z29.b, z30.b\n"
+    "ext z2.b, z2.b, z2.b, #0x1\n"
+    "udot z24.s, z31.b, z4.b\n"
+    "udot z1.s, z27.b, z30.b\n"
+    "udot z22.s, z27.b, z26.b\n"
+    "ext z30.b, z30.b, z30.b, #0x1\n"
+    "ext z26.b, z26.b, z26.b, #0x1\n"
+    "udot z21.s, z31.b, z2.b\n"
+    "udot z24.s, z29.b, z2.b\n"
+    "udot z6.s, z12.b, z2.b\n"
+    "mls z1.s, p2/M, z7.s, z9.s\n"
+    "udot z21.s, z29.b, z30.b\n"
+    "udot z24.s, z27.b, z30.b\n"
+    "udot z6.s, z12.b, z30.b\n"
+    ".inst 0x04b97421  // sqrdmulh z1.s, z1.s, z25.s\n"
+    "udot z21.s, z27.b, z26.b\n"
+    "mls z22.s, p2/M, z20.s, z9.s\n"
+    "mov z19.d, z6.d\n"
+    "udot z6.s, z12.b, z4.b\n"
+    "udot z19.s, z12.b, z26.b\n"
+    "and z16.d, z1.d, z23.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x04b976d6  // sqrdmulh z22.s, z22.s, z25.s\n"
+    "mls z24.s, p2/M, z6.s, z9.s\n"
+    "mls z21.s, p2/M, z19.s, z9.s\n"
+    ".inst 0x04b97718  // sqrdmulh z24.s, z24.s, z25.s\n"
+    "and z17.d, z22.d, z23.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "sqadd z1.s, z1.s, z16.s\n"
+    ".inst 0x04b976b5  // sqrdmulh z21.s, z21.s, z25.s\n"
+    "and z18.d, z24.d, z23.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "and z16.d, z21.d, z23.d\n"
+    ".inst 0x44828ae1  // srshl z1.s, p2/M, z1.s, z23.s\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z22.s, z22.s, z17.s\n"
+    "add z1.s, z1.s, z8.s\n"
+    "sqadd z24.s, z24.s, z18.s\n"
+    "smax z1.s, p2/M, z1.s, z11.s\n"
+    ".inst 0x44828af6  // srshl z22.s, p2/M, z22.s, z23.s\n"
+    "sqadd z21.s, z21.s, z16.s\n"
+    ".inst 0x44828af8  // srshl z24.s, p2/M, z24.s, z23.s\n"
+    "add z22.s, z22.s, z8.s\n"
+    "smin z1.s, p2/M, z1.s, z10.s\n"
+    "st1b { z1.s }, p0, [x23, x19]\n"
+    "add z24.s, z24.s, z8.s\n"
+    "smax z22.s, p2/M, z22.s, z11.s\n"
+    ".inst 0x44828af5  // srshl z21.s, p2/M, z21.s, z23.s\n"
+    "smax z24.s, p2/M, z24.s, z11.s\n"
+    "smin z22.s, p2/M, z22.s, z10.s\n"
+    "st1b { z22.s }, p0, [x21, x19]\n"
+    "add z21.s, z21.s, z8.s\n"
+    "smin z24.s, p2/M, z24.s, z10.s\n"
+    "st1b { z24.s }, p0, [x22, x19]\n"
+    "smax z21.s, p2/M, z21.s, z11.s\n"
+    "smin z21.s, p2/M, z21.s, z10.s\n"
+    "st1b { z21.s }, p0, [x20, x19]\n"
+    "incw x19\n"
+    "whilelt p1.b, x19, %x[n_channels]\n"
+    "b.any 1b\n"
+    "addvl SP, SP, #8\n"
+    : [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
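Unlike the s8qs kernel, this u8q variant keeps the raw `uint8` values in its `udot` products and corrects for the weight zero point at runtime: `z12` holds the byte pattern {1, 1, 1, 0}, so a `udot` against it sums the three participating input bytes of each lane, and the `mls ..., z9.s` instructions subtract `b_offset` times that sum. The input zero-point terms are constant per channel and can be folded into the packed bias. Roughly, in scalar form (an illustrative sketch; the bias-folding convention is an assumption):

```cpp
#include <cstdint>

// Zero-point decomposition for one output lane with K taps (K = 9 for 3x3):
//   sum_k (x_k - a_off) * (w_k - b_off)
//     = sum_k x_k*w_k - b_off * sum_k x_k - a_off * sum_k w_k + K*a_off*b_off
// The kernel computes sum_k x_k*w_k with raw uint8 udot products and
// subtracts b_off * sum_k x_k at runtime; the remaining terms are
// assumed folded into the packed bias at pack time.
int32_t u8q_lane(const uint8_t *x, const uint8_t *w, int K,
                 int32_t b_off, int32_t folded_bias)
{
    int32_t raw = 0, xsum = 0;
    for (int k = 0; k < K; k++)
    {
        raw  += (int32_t)x[k] * (int32_t)w[k]; // udot against weights
        xsum += (int32_t)x[k];                 // udot against {1,1,1,0}
    }
    // folded_bias = bias - a_off * sum_k w_k + K * a_off * b_off
    return folded_bias + raw - b_off * xsum;   // the mls correction
}
```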
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..6174dd0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+  sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
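Where the dot kernels correct for zero points after accumulation, this mla variant removes them up front: in the implementation that follows, `usublb` widens the `uint8` inputs and weights to signed 16-bit while subtracting `a_offset` (`z12`) and `b_offset` (`z18`) respectively, and `smlalb`/`smlalt` then perform widening signed multiply-accumulates into 32-bit lanes. One tap, in scalar form (illustrative only):

```cpp
#include <cstdint>

// Scalar model of the mla variant's accumulation step.
// usublb widens uint8 to int16 while subtracting the zero point;
// smlalb/smlalt then do a widening 16x16 -> 32-bit multiply-accumulate.
inline int32_t mla_tap(int32_t acc, uint8_t x, uint8_t w,
                       int32_t a_offset, int32_t b_offset)
{
    const int16_t xs = (int16_t)((int32_t)x - a_offset); // usublb on inputs
    const int16_t ws = (int16_t)((int32_t)w - b_offset); // usublb on weights
    return acc + (int32_t)xs * (int32_t)ws;              // smlalb/smlalt
}
```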
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..2ec7f6e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const uint8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const uint8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[16];
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const uint8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
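+      // One pointer per spatial position of the 4x4 input patch (NHWC;
+      // the channel index is added inside the assembly loop). The raw
+      // row-major order is permuted here into the order in which the
+      // kernel consumes the points.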
+      inptrs[0] = inptrs_raw[5];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[3];
+      inptrs[3] = inptrs_raw[6];
+      inptrs[4] = inptrs_raw[9];
+      inptrs[5] = inptrs_raw[12];
+      inptrs[6] = inptrs_raw[15];
+      inptrs[7] = inptrs_raw[1];
+      inptrs[8] = inptrs_raw[2];
+      inptrs[9] = inptrs_raw[10];
+      inptrs[10] = inptrs_raw[4];
+      inptrs[11] = inptrs_raw[7];
+      inptrs[12] = inptrs_raw[8];
+      inptrs[13] = inptrs_raw[11];
+      inptrs[14] = inptrs_raw[13];
+      inptrs[15] = inptrs_raw[14];
+
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "ptrue p4.b\n"
+    "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x15, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "mov x14, #0x0\n"
+    "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1rb { z12.b }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1rb { z18.b }, p4/Z, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1rw { z15.s }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1rw { z13.s }, p4/Z, [x20]\n"
+    "whilelt p3.h, x15, x17\n"
+    "ld1rw { z14.s }, p4/Z, [x19]\n"
+    "whilelt p2.s, x15, x17\n"
+    "ldp x10, x9, [x21, #0x0]\n"
+    "mov x19, x15\n"
+    "incw x19\n"
+    "ldp x28, x27, [x21, #0x10]\n"
+    "whilelt p1.s, x19, x17\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z17.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z11.s, z17.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z17.s, z17.s, z16.s\n"
+    "mov z9.d, z11.d\n"
+    "ld1b { z0.h }, p4/Z, [x16]\n"
+    ".inst 0x45521800  // usublb z0.h, z0.b, z18.b\n"
+    "mov z20.d, z17.d\n"
+    "ld1b { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
+    "mov z24.d, z11.d\n"
+    "ld1b { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
+    ".inst 0x45521821  // usublb z1.h, z1.b, z18.b\n"
+    "mov z19.d, z17.d\n"
+    "ld1b { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
+    "mov z26.d, z11.d\n"
+    "ld1b { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
+    ".inst 0x45521842  // usublb z2.h, z2.b, z18.b\n"
+    "mov z23.d, z17.d\n"
+    "ld1b { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
+    ".inst 0x45521863  // usublb z3.h, z3.b, z18.b\n"
+    "ld1b { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
+    "ld1b { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
+    ".inst 0x45521884  // usublb z4.h, z4.b, z18.b\n"
+    "inch x16, ALL, MUL #8\n"
+    "ld1b { z8.h }, p4/Z, [x16]\n"
+    "ldp x23, x22, [x12, #0x0]\n"
+    ".inst 0x455218a5  // usublb z5.h, z5.b, z18.b\n"
+    ".inst 0x455218c6  // usublb z6.h, z6.b, z18.b\n"
+    "ldp x21, x20, [x12, #0x10]\n"
+    ".inst 0x455218e7  // usublb z7.h, z7.b, z18.b\n"
+    ".inst 0x45521908  // usublb z8.h, z8.b, z18.b\n"
+    "ldr x19, [x12, #0x20]\n"
+    "ld1b { z31.h }, p3/Z, [x23, x15]\n"
+    ".inst 0x454c1bff  // usublb z31.h, z31.b, z12.b\n"
+    "ld1b { z30.h }, p3/Z, [x22, x15]\n"
+    "ld1b { z29.h }, p3/Z, [x21, x15]\n"
+    ".inst 0x454c1bde  // usublb z30.h, z30.b, z12.b\n"
+    "ld1b { z28.h }, p3/Z, [x20, x15]\n"
+    "ld1b { z27.h }, p3/Z, [x19, x15]\n"
+    ".inst 0x454c1bbd  // usublb z29.h, z29.b, z12.b\n"
+    ".inst 0x454c1b9c  // usublb z28.h, z28.b, z12.b\n"
+    ".inst 0x454c1b7b  // usublb z27.h, z27.b, z12.b\n"
+    "1:"  // Loop
+    ".inst 0x448443eb  // smlalb z11.s, p4/M, z31.h, z4.h\n"
+    "ldr x21, [x12, #0x28]\n"
+    "whilelt p0.h, x14, x17\n"
+    ".inst 0x448447f1  // smlalt z17.s, p4/M, z31.h, z4.h\n"
+    "ldr x20, [x12, #0x30]\n"
+    "inch x16\n"
+    ".inst 0x448343e9  // smlalb z9.s, p4/M, z31.h, z3.h\n"
+    "ldr x26, [x12, #0x38]\n"
+    ".inst 0x448347f4  // smlalt z20.s, p4/M, z31.h, z3.h\n"
+    "ldr x25, [x12, #0x40]\n"
+    ".inst 0x448143f8  // smlalb z24.s, p4/M, z31.h, z1.h\n"
+    "ldr x19, [x12, #0x48]\n"
+    ".inst 0x448147f3  // smlalt z19.s, p4/M, z31.h, z1.h\n"
+    "ldr x24, [x12, #0x50]\n"
+    ".inst 0x448043fa  // smlalb z26.s, p4/M, z31.h, z0.h\n"
+    "ldr x23, [x12, #0x58]\n"
+    ".inst 0x448047f7  // smlalt z23.s, p4/M, z31.h, z0.h\n"
+    "ld1b { z31.h }, p3/Z, [x21, x15]\n"
+    ".inst 0x454c1bff  // usublb z31.h, z31.b, z12.b\n"
+    ".inst 0x448043cb  // smlalb z11.s, p4/M, z30.h, z0.h\n"
+    "ldr x22, [x12, #0x60]\n"
+    ".inst 0x448047d1  // smlalt z17.s, p4/M, z30.h, z0.h\n"
+    "ld1b { z30.h }, p3/Z, [x19, x15]\n"
+    ".inst 0x454c1bde  // usublb z30.h, z30.b, z12.b\n"
+    ".inst 0x448243a9  // smlalb z9.s, p4/M, z29.h, z2.h\n"
+    "ldr x21, [x12, #0x68]\n"
+    ".inst 0x448247b4  // smlalt z20.s, p4/M, z29.h, z2.h\n"
+    "ld1b { z29.h }, p3/Z, [x20, x15]\n"
+    ".inst 0x454c1bbd  // usublb z29.h, z29.b, z12.b\n"
+    ".inst 0x4485438b  // smlalb z11.s, p4/M, z28.h, z5.h\n"
+    "ldr x20, [x12, #0x70]\n"
+    ".inst 0x44854791  // smlalt z17.s, p4/M, z28.h, z5.h\n"
+    "ldr x19, [x12, #0x78]\n"
+    ".inst 0x44844389  // smlalb z9.s, p4/M, z28.h, z4.h\n"
+    "ld1w { z25.s }, p2/Z, [x13]\n"
+    ".inst 0x44844794  // smlalt z20.s, p4/M, z28.h, z4.h\n"
+    "ld1w { z16.s }, p1/Z, [x13, #1, MUL VL]\n"
+    "addvl x13, x13, #2\n"
+    ".inst 0x44824398  // smlalb z24.s, p4/M, z28.h, z2.h\n"
+    ".inst 0x44824793  // smlalt z19.s, p4/M, z28.h, z2.h\n"
+    ".inst 0x4481439a  // smlalb z26.s, p4/M, z28.h, z1.h\n"
+    "uzp1 z10.s, z25.s, z16.s\n"
+    "uzp2 z22.s, z25.s, z16.s\n"
+    "ld1w { z25.s }, p2/Z, [x11]\n"
+    ".inst 0x44814797  // smlalt z23.s, p4/M, z28.h, z1.h\n"
+    "ld1b { z28.h }, p3/Z, [x26, x15]\n"
+    ".inst 0x454c1b9c  // usublb z28.h, z28.b, z12.b\n"
+    ".inst 0x448643f8  // smlalb z24.s, p4/M, z31.h, z6.h\n"
+    "ld1w { z16.s }, p1/Z, [x11, #1, MUL VL]\n"
+    ".inst 0x448647f3  // smlalt z19.s, p4/M, z31.h, z6.h\n"
+    "ld1b { z31.h }, p3/Z, [x25, x15]\n"
+    "addvl x11, x11, #2\n"
+    ".inst 0x4487436b  // smlalb z11.s, p4/M, z27.h, z7.h\n"
+    ".inst 0x454c1bff  // usublb z31.h, z31.b, z12.b\n"
+    "uzp1 z21.s, z25.s, z16.s\n"
+    "uzp2 z25.s, z25.s, z16.s\n"
+    ".inst 0x44874771  // smlalt z17.s, p4/M, z27.h, z7.h\n"
+    ".inst 0x44864369  // smlalb z9.s, p4/M, z27.h, z6.h\n"
+    ".inst 0x44864774  // smlalt z20.s, p4/M, z27.h, z6.h\n"
+    ".inst 0x44844378  // smlalb z24.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x44844773  // smlalt z19.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x4483437a  // smlalb z26.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x44834777  // smlalt z23.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x4481438b  // smlalb z11.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x44814791  // smlalt z17.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x448843ba  // smlalb z26.s, p4/M, z29.h, z8.h\n"
+    ".inst 0x448847b7  // smlalt z23.s, p4/M, z29.h, z8.h\n"
+    "ld1b { z29.h }, p3/Z, [x24, x15]\n"
+    ".inst 0x454c1bbd  // usublb z29.h, z29.b, z12.b\n"
+    ".inst 0x44804389  // smlalb z9.s, p4/M, z28.h, z0.h\n"
+    ".inst 0x44804794  // smlalt z20.s, p4/M, z28.h, z0.h\n"
+    "ld1b { z28.h }, p3/Z, [x23, x15]\n"
+    ".inst 0x454c1b9c  // usublb z28.h, z28.b, z12.b\n"
+    ".inst 0x448243eb  // smlalb z11.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448247f1  // smlalt z17.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448143e9  // smlalb z9.s, p4/M, z31.h, z1.h\n"
+    ".inst 0x448147f4  // smlalt z20.s, p4/M, z31.h, z1.h\n"
+    "ld1b { z31.h }, p3/Z, [x22, x15]\n"
+    ".inst 0x454c1bff  // usublb z31.h, z31.b, z12.b\n"
+    ".inst 0x448843cb  // smlalb z11.s, p4/M, z30.h, z8.h\n"
+    ".inst 0x448847d1  // smlalt z17.s, p4/M, z30.h, z8.h\n"
+    ".inst 0x448743c9  // smlalb z9.s, p4/M, z30.h, z7.h\n"
+    ".inst 0x448747d4  // smlalt z20.s, p4/M, z30.h, z7.h\n"
+    ".inst 0x448543d8  // smlalb z24.s, p4/M, z30.h, z5.h\n"
+    ".inst 0x448547d3  // smlalt z19.s, p4/M, z30.h, z5.h\n"
+    ".inst 0x448443da  // smlalb z26.s, p4/M, z30.h, z4.h\n"
+    ".inst 0x448447d7  // smlalt z23.s, p4/M, z30.h, z4.h\n"
+    "ld1b { z30.h }, p3/Z, [x21, x15]\n"
+    ".inst 0x454c1bde  // usublb z30.h, z30.b, z12.b\n"
+    ".inst 0x448343ab  // smlalb z11.s, p4/M, z29.h, z3.h\n"
+    ".inst 0x448347b1  // smlalt z17.s, p4/M, z29.h, z3.h\n"
+    ".inst 0x448043b8  // smlalb z24.s, p4/M, z29.h, z0.h\n"
+    ".inst 0x448047b3  // smlalt z19.s, p4/M, z29.h, z0.h\n"
+    "ld1b { z29.h }, p3/Z, [x20, x15]\n"
+    ".inst 0x454c1bbd  // usublb z29.h, z29.b, z12.b\n"
+    ".inst 0x44854389  // smlalb z9.s, p4/M, z28.h, z5.h\n"
+    ".inst 0x44854794  // smlalt z20.s, p4/M, z28.h, z5.h\n"
+    ".inst 0x4482439a  // smlalb z26.s, p4/M, z28.h, z2.h\n"
+    ".inst 0x44824797  // smlalt z23.s, p4/M, z28.h, z2.h\n"
+    "ld1b { z28.h }, p3/Z, [x19, x15]\n"
+    "inch x15\n"
+    ".inst 0x448643eb  // smlalb z11.s, p4/M, z31.h, z6.h\n"
+    "whilelt p2.s, x15, x17\n"
+    ".inst 0x448647f1  // smlalt z17.s, p4/M, z31.h, z6.h\n"
+    "mov x19, x15\n"
+    ".inst 0x448343f8  // smlalb z24.s, p4/M, z31.h, z3.h\n"
+    ".inst 0x454c1b9c  // usublb z28.h, z28.b, z12.b\n"
+    ".inst 0x448347f3  // smlalt z19.s, p4/M, z31.h, z3.h\n"
+    "incw x19\n"
+    ".inst 0x448843c9  // smlalb z9.s, p4/M, z30.h, z8.h\n"
+    "whilelt p1.s, x19, x17\n"
+    ".inst 0x04aa756b  // sqrdmulh z11.s, z11.s, z10.s\n"
+    "whilelt p3.h, x15, x17\n"
+    ".inst 0x04b67631  // sqrdmulh z17.s, z17.s, z22.s\n"
+    ".inst 0x448847d4  // smlalt z20.s, p4/M, z30.h, z8.h\n"
+    ".inst 0x04aa7529  // sqrdmulh z9.s, z9.s, z10.s\n"
+    "and z16.d, z11.d, z21.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "and z1.d, z17.d, z25.d\n"
+    "and z27.d, z9.d, z21.d\n"
+    "asr z1.s, z1.s, #0x1f\n"
+    ".inst 0x04b67694  // sqrdmulh z20.s, z20.s, z22.s\n"
+    ".inst 0x448543da  // smlalb z26.s, p4/M, z30.h, z5.h\n"
+    "asr z27.s, z27.s, #0x1f\n"
+    ".inst 0x448547d7  // smlalt z23.s, p4/M, z30.h, z5.h\n"
+    "sqadd z11.s, z11.s, z16.s\n"
+    ".inst 0x448743b8  // smlalb z24.s, p4/M, z29.h, z7.h\n"
+    "and z16.d, z20.d, z25.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z17.s, z17.s, z1.s\n"
+    "sqadd z9.s, z9.s, z27.s\n"
+    ".inst 0x448747b3  // smlalt z19.s, p4/M, z29.h, z7.h\n"
+    ".inst 0x448643ba  // smlalb z26.s, p4/M, z29.h, z6.h\n"
+    ".inst 0x448647b7  // smlalt z23.s, p4/M, z29.h, z6.h\n"
+    ".inst 0x44884398  // smlalb z24.s, p4/M, z28.h, z8.h\n"
+    "sqadd z20.s, z20.s, z16.s\n"
+    ".inst 0x44884793  // smlalt z19.s, p4/M, z28.h, z8.h\n"
+    ".inst 0x4487439a  // smlalb z26.s, p4/M, z28.h, z7.h\n"
+    ".inst 0x04aa7718  // sqrdmulh z24.s, z24.s, z10.s\n"
+    ".inst 0x44874797  // smlalt z23.s, p4/M, z28.h, z7.h\n"
+    ".inst 0x04b67673  // sqrdmulh z19.s, z19.s, z22.s\n"
+    ".inst 0x04aa775a  // sqrdmulh z26.s, z26.s, z10.s\n"
+    "and z16.d, z24.d, z21.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "and z7.d, z19.d, z25.d\n"
+    "and z3.d, z26.d, z21.d\n"
+    "asr z7.s, z7.s, #0x1f\n"
+    ".inst 0x04b676f7  // sqrdmulh z23.s, z23.s, z22.s\n"
+    ".inst 0x448292ab  // srshl z11.s, p4/M, z11.s, z21.s\n"
+    "asr z3.s, z3.s, #0x1f\n"
+    ".inst 0x44829331  // srshl z17.s, p4/M, z17.s, z25.s\n"
+    "sqadd z24.s, z24.s, z16.s\n"
+    ".inst 0x448292a9  // srshl z9.s, p4/M, z9.s, z21.s\n"
+    "add z11.s, z11.s, z15.s\n"
+    "add z17.s, z17.s, z15.s\n"
+    "sqadd z19.s, z19.s, z7.s\n"
+    "add z9.s, z9.s, z15.s\n"
+    "sqadd z26.s, z26.s, z3.s\n"
+    "and z16.d, z23.d, z25.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "smin z11.s, p4/M, z11.s, z14.s\n"
+    "smin z17.s, p4/M, z17.s, z14.s\n"
+    "smin z9.s, p4/M, z9.s, z14.s\n"
+    ".inst 0x44829334  // srshl z20.s, p4/M, z20.s, z25.s\n"
+    ".inst 0x448292b8  // srshl z24.s, p4/M, z24.s, z21.s\n"
+    "smax z11.s, p4/M, z11.s, z13.s\n"
+    "sqadd z23.s, z23.s, z16.s\n"
+    "add z20.s, z20.s, z15.s\n"
+    "add z24.s, z24.s, z15.s\n"
+    "smax z17.s, p4/M, z17.s, z13.s\n"
+    "smax z9.s, p4/M, z9.s, z13.s\n"
+    "smin z20.s, p4/M, z20.s, z14.s\n"
+    "smin z24.s, p4/M, z24.s, z14.s\n"
+    "trn1 z11.h, z11.h, z17.h\n"
+    "st1b { z11.h }, p0, [x10, x14]\n"
+    "smax z20.s, p4/M, z20.s, z13.s\n"
+    ".inst 0x44829333  // srshl z19.s, p4/M, z19.s, z25.s\n"
+    "smax z24.s, p4/M, z24.s, z13.s\n"
+    ".inst 0x448292ba  // srshl z26.s, p4/M, z26.s, z21.s\n"
+    ".inst 0x44829337  // srshl z23.s, p4/M, z23.s, z25.s\n"
+    "trn1 z9.h, z9.h, z20.h\n"
+    "st1b { z9.h }, p0, [x9, x14]\n"
+    "add z19.s, z19.s, z15.s\n"
+    "add z26.s, z26.s, z15.s\n"
+    "add z23.s, z23.s, z15.s\n"
+    "smin z19.s, p4/M, z19.s, z14.s\n"
+    "smin z26.s, p4/M, z26.s, z14.s\n"
+    "smin z23.s, p4/M, z23.s, z14.s\n"
+    "smax z19.s, p4/M, z19.s, z13.s\n"
+    "smax z26.s, p4/M, z26.s, z13.s\n"
+    "smax z23.s, p4/M, z23.s, z13.s\n"
+    "trn1 z24.h, z24.h, z19.h\n"
+    "st1b { z24.h }, p0, [x28, x14]\n"
+    "trn1 z26.h, z26.h, z23.h\n"
+    "st1b { z26.h }, p0, [x27, x14]\n"
+    "inch x14\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z17.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z11.s, z17.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z17.s, z17.s, z16.s\n"
+    "mov z9.d, z11.d\n"
+    "ld1b { z0.h }, p4/Z, [x16]\n"
+    ".inst 0x45521800  // usublb z0.h, z0.b, z18.b\n"
+    "mov z20.d, z17.d\n"
+    "ld1b { z1.h }, p4/Z, [x16, #1, MUL VL]\n"
+    "mov z24.d, z11.d\n"
+    "ld1b { z2.h }, p4/Z, [x16, #2, MUL VL]\n"
+    ".inst 0x45521821  // usublb z1.h, z1.b, z18.b\n"
+    "mov z19.d, z17.d\n"
+    "ld1b { z3.h }, p4/Z, [x16, #3, MUL VL]\n"
+    "mov z26.d, z11.d\n"
+    "ld1b { z4.h }, p4/Z, [x16, #4, MUL VL]\n"
+    ".inst 0x45521842  // usublb z2.h, z2.b, z18.b\n"
+    "mov z23.d, z17.d\n"
+    "ld1b { z5.h }, p4/Z, [x16, #5, MUL VL]\n"
+    ".inst 0x45521863  // usublb z3.h, z3.b, z18.b\n"
+    "ld1b { z6.h }, p4/Z, [x16, #6, MUL VL]\n"
+    "ld1b { z7.h }, p4/Z, [x16, #7, MUL VL]\n"
+    ".inst 0x45521884  // usublb z4.h, z4.b, z18.b\n"
+    "inch x16, ALL, MUL #8\n"
+    "ld1b { z8.h }, p4/Z, [x16]\n"
+    "ldp x23, x22, [x12, #0x0]\n"
+    ".inst 0x455218a5  // usublb z5.h, z5.b, z18.b\n"
+    ".inst 0x455218c6  // usublb z6.h, z6.b, z18.b\n"
+    "ldp x21, x20, [x12, #0x10]\n"
+    ".inst 0x455218e7  // usublb z7.h, z7.b, z18.b\n"
+    ".inst 0x45521908  // usublb z8.h, z8.b, z18.b\n"
+    "ldr x19, [x12, #0x20]\n"
+    "ld1b { z31.h }, p3/Z, [x23, x15]\n"
+    ".inst 0x454c1bff  // usublb z31.h, z31.b, z12.b\n"
+    "ld1b { z30.h }, p3/Z, [x22, x15]\n"
+    "ld1b { z29.h }, p3/Z, [x21, x15]\n"
+    ".inst 0x454c1bde  // usublb z30.h, z30.b, z12.b\n"
+    "ld1b { z28.h }, p3/Z, [x20, x15]\n"
+    "ld1b { z27.h }, p3/Z, [x19, x15]\n"
+    ".inst 0x454c1bbd  // usublb z29.h, z29.b, z12.b\n"
+    ".inst 0x454c1b9c  // usublb z28.h, z28.b, z12.b\n"
+    ".inst 0x454c1b7b  // usublb z27.h, z27.b, z12.b\n"
+    "b.any 1b\n"
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
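
[Editor's annotation, not part of the patch] The tail of the loop above is the standard gemmlowp-style fixed-point requantization: sqrdmulh against the per-channel multiplier, an and/asr/sqadd fix-up so the subsequent shift rounds to nearest, srshl by the (non-positive) per-channel shift, then the c_offset is added and the result clamped to [minval, maxval] (z15/z14/z13 here) before being narrowed with trn1 and stored with st1b. A minimal scalar sketch of the same arithmetic, assuming the usual SQRDMULH/SRSHL semantics and an arithmetic right shift for negative values:

#include <algorithm>
#include <cstdint>

// Round-to-nearest doubling high multiply: scalar model of SQRDMULH.
static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
  const bool overflow = (a == b) && (a == INT32_MIN);
  const int64_t ab = (int64_t) a * (int64_t) b;
  const int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
  const int32_t high = (int32_t) ((ab + nudge) / (1ll << 31));
  return overflow ? INT32_MAX : high;
}

// Scalar model of the and/asr/sqadd correction plus SRSHL: a rounding
// right shift by -shift (the per-channel shifts used here are <= 0).
static int32_t rounding_right_shift(int32_t x, int32_t shift)
{
  const int exponent = -shift;
  const int32_t mask = (int32_t) ((1ll << exponent) - 1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// Requantize one 32-bit accumulator to a uint8 lane, as the vector code
// does with the c_offset add and the smin/smax clamp before storing.
static uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
{
  int32_t v = rounding_right_shift(rounding_doubling_high_mul(acc, mul), shift);
  v = std::min(std::max(v + c_offset, minval), maxval);
  return (uint8_t) v;
}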
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..1f470f7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+  sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
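
[Editor's annotation, not part of the patch] The geometry constants above satisfy the usual depthfirst relation: an output tile of output_rows x output_cols needs an input tile of (output_rows - 1) * stride_rows + kernel_rows rows, i.e. (2 - 1) * 2 + 3 = 5, and likewise for columns. A compile-time check of that relation, as a sketch (the helper name is illustrative):

// Hypothetical consistency check over the constants declared above.
template <typename Strategy>
constexpr bool tile_geometry_is_consistent()
{
  return Strategy::input_rows == (Strategy::output_rows - 1) * Strategy::stride_rows + Strategy::kernel_rows &&
         Strategy::input_cols == (Strategy::output_cols - 1) * Strategy::stride_cols + Strategy::kernel_cols;
}

static_assert(
  tile_geometry_is_consistent<arm_conv::depthwise::sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(),
  "the input tile must cover the receptive field of the output tile");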
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..bc8f0ac
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const uint8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    unsigned long n_channels;
+    const uint8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[25];
+
+    Params(
+      unsigned long n_channels,
+      const uint8_t *const *inptrs_raw,
+      const uint8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
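+      // Annotation (not in the original patch): inptrs_raw holds the 25
+      // pointers of the 5x5 input tile in row-major order; this permutation
+      // reorders them into the sequence in which the assembly loop reads them.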
+      inptrs[0] = inptrs_raw[12];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[1];
+      inptrs[3] = inptrs_raw[3];
+      inptrs[4] = inptrs_raw[4];
+      inptrs[5] = inptrs_raw[5];
+      inptrs[6] = inptrs_raw[6];
+      inptrs[7] = inptrs_raw[2];
+      inptrs[8] = inptrs_raw[8];
+      inptrs[9] = inptrs_raw[9];
+      inptrs[10] = inptrs_raw[7];
+      inptrs[11] = inptrs_raw[15];
+      inptrs[12] = inptrs_raw[10];
+      inptrs[13] = inptrs_raw[16];
+      inptrs[14] = inptrs_raw[11];
+      inptrs[15] = inptrs_raw[18];
+      inptrs[16] = inptrs_raw[13];
+      inptrs[17] = inptrs_raw[19];
+      inptrs[18] = inptrs_raw[20];
+      inptrs[19] = inptrs_raw[14];
+      inptrs[20] = inptrs_raw[21];
+      inptrs[21] = inptrs_raw[17];
+      inptrs[22] = inptrs_raw[23];
+      inptrs[23] = inptrs_raw[22];
+      inptrs[24] = inptrs_raw[24];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
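+  // Annotation (not in the original patch): the loop below is predicated on
+  // the channel count (whilelt against n_channels, advanced with inch), so
+  // each iteration handles one vector's worth of channels across the whole
+  // 2x2 output tile.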
+  __asm__ __volatile__(
+    "ldr x5, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "ptrue p4.b\n"
+    "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x7, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "mov x8, #0x0\n"
+    "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "add x16, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1rb { z19.b }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1rb { z12.b }, p4/Z, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1rw { z14.s }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1rw { z20.s }, p4/Z, [x20]\n"
+    "whilelt p3.h, x7, x5\n"
+    "ld1rw { z15.s }, p4/Z, [x19]\n"
+    "whilelt p2.s, x7, x5\n"
+    "ldp x14, x13, [x21, #0x0]\n"
+    "mov x19, x7\n"
+    "incw x19\n"
+    "ldp x12, x11, [x21, #0x10]\n"
+    "whilelt p1.s, x19, x5\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z18.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z13.s, z18.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z16.s, z18.s, z16.s\n"
+    "mov z11.d, z13.d\n"
+    "ld1b { z0.h }, p4/Z, [x6]\n"
+    ".inst 0x454c1800  // usublb z0.h, z0.b, z12.b\n"
+    "mov z9.d, z16.d\n"
+    "ld1b { z1.h }, p4/Z, [x6, #1, MUL VL]\n"
+    "mov z18.d, z13.d\n"
+    "ld1b { z2.h }, p4/Z, [x6, #2, MUL VL]\n"
+    ".inst 0x454c1821  // usublb z1.h, z1.b, z12.b\n"
+    "mov z10.d, z16.d\n"
+    "ld1b { z3.h }, p4/Z, [x6, #3, MUL VL]\n"
+    "mov z22.d, z13.d\n"
+    "ld1b { z4.h }, p4/Z, [x6, #4, MUL VL]\n"
+    ".inst 0x454c1842  // usublb z2.h, z2.b, z12.b\n"
+    "mov z23.d, z16.d\n"
+    "ld1b { z5.h }, p4/Z, [x6, #5, MUL VL]\n"
+    ".inst 0x454c1863  // usublb z3.h, z3.b, z12.b\n"
+    "ld1b { z6.h }, p4/Z, [x6, #6, MUL VL]\n"
+    "ld1b { z7.h }, p4/Z, [x6, #7, MUL VL]\n"
+    ".inst 0x454c1884  // usublb z4.h, z4.b, z12.b\n"
+    "inch x6, ALL, MUL #8\n"
+    "ld1b { z8.h }, p4/Z, [x6]\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    ".inst 0x454c18a5  // usublb z5.h, z5.b, z12.b\n"
+    ".inst 0x454c18c6  // usublb z6.h, z6.b, z12.b\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    ".inst 0x454c18e7  // usublb z7.h, z7.b, z12.b\n"
+    ".inst 0x454c1908  // usublb z8.h, z8.b, z12.b\n"
+    "ldp x22, x21, [x16, #0x20]\n"
+    "ldp x20, x19, [x16, #0x30]\n"
+    "ld1b { z31.h }, p3/Z, [x26, x7]\n"
+    ".inst 0x45531bff  // usublb z31.h, z31.b, z19.b\n"
+    "ld1b { z30.h }, p3/Z, [x25, x7]\n"
+    "ld1b { z29.h }, p3/Z, [x24, x7]\n"
+    ".inst 0x45531bde  // usublb z30.h, z30.b, z19.b\n"
+    "ld1b { z28.h }, p3/Z, [x23, x7]\n"
+    "ld1b { z27.h }, p3/Z, [x22, x7]\n"
+    ".inst 0x45531bbd  // usublb z29.h, z29.b, z19.b\n"
+    "ld1b { z26.h }, p3/Z, [x21, x7]\n"
+    ".inst 0x45531b9c  // usublb z28.h, z28.b, z19.b\n"
+    "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+    "ld1b { z24.h }, p3/Z, [x19, x7]\n"
+    ".inst 0x45531b7b  // usublb z27.h, z27.b, z19.b\n"
+    ".inst 0x45531b5a  // usublb z26.h, z26.b, z19.b\n"
+    ".inst 0x45531b39  // usublb z25.h, z25.b, z19.b\n"
+    ".inst 0x45531b18  // usublb z24.h, z24.b, z19.b\n"
+    "1:"  // Loop
+    ".inst 0x448843ed  // smlalb z13.s, p4/M, z31.h, z8.h\n"
+    "ldr x23, [x16, #0x40]\n"
+    "whilelt p0.h, x8, x5\n"
+    ".inst 0x448847f0  // smlalt z16.s, p4/M, z31.h, z8.h\n"
+    "ldr x22, [x16, #0x48]\n"
+    "inch x6\n"
+    ".inst 0x448643eb  // smlalb z11.s, p4/M, z31.h, z6.h\n"
+    "ldr x21, [x16, #0x50]\n"
+    ".inst 0x448647e9  // smlalt z9.s, p4/M, z31.h, z6.h\n"
+    "ldr x20, [x16, #0x58]\n"
+    ".inst 0x448243f2  // smlalb z18.s, p4/M, z31.h, z2.h\n"
+    "ldr x19, [x16, #0x60]\n"
+    ".inst 0x448247ea  // smlalt z10.s, p4/M, z31.h, z2.h\n"
+    "ldr x10, [x16, #0x68]\n"
+    ".inst 0x448043f6  // smlalb z22.s, p4/M, z31.h, z0.h\n"
+    "ldr x9, [x16, #0x70]\n"
+    ".inst 0x448047f7  // smlalt z23.s, p4/M, z31.h, z0.h\n"
+    "ldr x28, [x16, #0x78]\n"
+    ".inst 0x448043cd  // smlalb z13.s, p4/M, z30.h, z0.h\n"
+    "ldr x27, [x16, #0x80]\n"
+    ".inst 0x448047d0  // smlalt z16.s, p4/M, z30.h, z0.h\n"
+    "ldr x26, [x16, #0x88]\n"
+    ".inst 0x4481438b  // smlalb z11.s, p4/M, z28.h, z1.h\n"
+    "ldr x25, [x16, #0x90]\n"
+    ".inst 0x44814789  // smlalt z9.s, p4/M, z28.h, z1.h\n"
+    "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+    ".inst 0x45531b9c  // usublb z28.h, z28.b, z19.b\n"
+    ".inst 0x448143ad  // smlalb z13.s, p4/M, z29.h, z1.h\n"
+    "ldr x24, [x16, #0x98]\n"
+    ".inst 0x448147b0  // smlalt z16.s, p4/M, z29.h, z1.h\n"
+    "ld1b { z29.h }, p3/Z, [x23, x7]\n"
+    ".inst 0x45531bbd  // usublb z29.h, z29.b, z19.b\n"
+    ".inst 0x4482436b  // smlalb z11.s, p4/M, z27.h, z2.h\n"
+    "ldr x23, [x16, #0xa0]\n"
+    ".inst 0x44824769  // smlalt z9.s, p4/M, z27.h, z2.h\n"
+    "ld1b { z27.h }, p3/Z, [x21, x7]\n"
+    ".inst 0x45531b7b  // usublb z27.h, z27.b, z19.b\n"
+    ".inst 0x4483434d  // smlalb z13.s, p4/M, z26.h, z3.h\n"
+    "ldr x22, [x16, #0xa8]\n"
+    ".inst 0x44834750  // smlalt z16.s, p4/M, z26.h, z3.h\n"
+    "ld1b { z26.h }, p3/Z, [x20, x7]\n"
+    ".inst 0x45531b5a  // usublb z26.h, z26.b, z19.b\n"
+    ".inst 0x4484432d  // smlalb z13.s, p4/M, z25.h, z4.h\n"
+    "ldr x21, [x16, #0xb0]\n"
+    ".inst 0x44844730  // smlalt z16.s, p4/M, z25.h, z4.h\n"
+    "ld1b { z25.h }, p3/Z, [x19, x7]\n"
+    ".inst 0x45531b39  // usublb z25.h, z25.b, z19.b\n"
+    ".inst 0x4482430d  // smlalb z13.s, p4/M, z24.h, z2.h\n"
+    "ldr x20, [x16, #0xb8]\n"
+    ".inst 0x44824710  // smlalt z16.s, p4/M, z24.h, z2.h\n"
+    "ldr x19, [x16, #0xc0]\n"
+    ".inst 0x4480430b  // smlalb z11.s, p4/M, z24.h, z0.h\n"
+    "ld1w { z21.s }, p2/Z, [x17]\n"
+    ".inst 0x44804709  // smlalt z9.s, p4/M, z24.h, z0.h\n"
+    "ld1b { z24.h }, p3/Z, [x9, x7]\n"
+    ".inst 0x45531b18  // usublb z24.h, z24.b, z19.b\n"
+    ".inst 0x448443ab  // smlalb z11.s, p4/M, z29.h, z4.h\n"
+    "ld1w { z17.s }, p1/Z, [x17, #1, MUL VL]\n"
+    ".inst 0x448447a9  // smlalt z9.s, p4/M, z29.h, z4.h\n"
+    "ld1b { z29.h }, p3/Z, [x10, x7]\n"
+    "addvl x17, x17, #2\n"
+    ".inst 0x4485436d  // smlalb z13.s, p4/M, z27.h, z5.h\n"
+    ".inst 0x45531bbd  // usublb z29.h, z29.b, z19.b\n"
+    "uzp1 z30.s, z21.s, z17.s\n"
+    "uzp2 z31.s, z21.s, z17.s\n"
+    "ld1w { z21.s }, p2/Z, [x15]\n"
+    ".inst 0x4485438b  // smlalb z11.s, p4/M, z28.h, z5.h\n"
+    "ld1w { z17.s }, p1/Z, [x15, #1, MUL VL]\n"
+    "addvl x15, x15, #2\n"
+    ".inst 0x44854789  // smlalt z9.s, p4/M, z28.h, z5.h\n"
+    "ld1b { z28.h }, p3/Z, [x27, x7]\n"
+    ".inst 0x45531b9c  // usublb z28.h, z28.b, z19.b\n"
+    ".inst 0x44854770  // smlalt z16.s, p4/M, z27.h, z5.h\n"
+    ".inst 0x4483436b  // smlalb z11.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x44834769  // smlalt z9.s, p4/M, z27.h, z3.h\n"
+    "ld1b { z27.h }, p3/Z, [x28, x7]\n"
+    ".inst 0x45531b7b  // usublb z27.h, z27.b, z19.b\n"
+    ".inst 0x44834352  // smlalb z18.s, p4/M, z26.h, z3.h\n"
+    ".inst 0x4483474a  // smlalt z10.s, p4/M, z26.h, z3.h\n"
+    "ld1b { z26.h }, p3/Z, [x26, x7]\n"
+    ".inst 0x45531b5a  // usublb z26.h, z26.b, z19.b\n"
+    ".inst 0x4486432d  // smlalb z13.s, p4/M, z25.h, z6.h\n"
+    ".inst 0x44864730  // smlalt z16.s, p4/M, z25.h, z6.h\n"
+    ".inst 0x44804332  // smlalb z18.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x4480472a  // smlalt z10.s, p4/M, z25.h, z0.h\n"
+    "ld1b { z25.h }, p3/Z, [x25, x7]\n"
+    ".inst 0x45531b39  // usublb z25.h, z25.b, z19.b\n"
+    "uzp1 z0.s, z21.s, z17.s\n"
+    "uzp2 z21.s, z21.s, z17.s\n"
+    ".inst 0x448443b2  // smlalb z18.s, p4/M, z29.h, z4.h\n"
+    ".inst 0x448447aa  // smlalt z10.s, p4/M, z29.h, z4.h\n"
+    "ld1b { z29.h }, p3/Z, [x24, x7]\n"
+    ".inst 0x45531bbd  // usublb z29.h, z29.b, z19.b\n"
+    ".inst 0x4487430d  // smlalb z13.s, p4/M, z24.h, z7.h\n"
+    ".inst 0x44874710  // smlalt z16.s, p4/M, z24.h, z7.h\n"
+    ".inst 0x44814312  // smlalb z18.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x4481470a  // smlalt z10.s, p4/M, z24.h, z1.h\n"
+    "ld1b { z24.h }, p3/Z, [x22, x7]\n"
+    ".inst 0x45531b18  // usublb z24.h, z24.b, z19.b\n"
+    ".inst 0x04be75ad  // sqrdmulh z13.s, z13.s, z30.s\n"
+    ".inst 0x04bf7610  // sqrdmulh z16.s, z16.s, z31.s\n"
+    ".inst 0x44844376  // smlalb z22.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x44844777  // smlalt z23.s, p4/M, z27.h, z4.h\n"
+    "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+    ".inst 0x45531b7b  // usublb z27.h, z27.b, z19.b\n"
+    "and z4.d, z13.d, z0.d\n"
+    "and z17.d, z16.d, z21.d\n"
+    "asr z4.s, z4.s, #0x1f\n"
+    ".inst 0x4487438b  // smlalb z11.s, p4/M, z28.h, z7.h\n"
+    ".inst 0x44874789  // smlalt z9.s, p4/M, z28.h, z7.h\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    ".inst 0x44814396  // smlalb z22.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x44814797  // smlalt z23.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x44864332  // smlalb z18.s, p4/M, z25.h, z6.h\n"
+    ".inst 0x4486472a  // smlalt z10.s, p4/M, z25.h, z6.h\n"
+    "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+    ".inst 0x45531b39  // usublb z25.h, z25.b, z19.b\n"
+    "sqadd z13.s, z13.s, z4.s\n"
+    "sqadd z16.s, z16.s, z17.s\n"
+    ".inst 0x44854356  // smlalb z22.s, p4/M, z26.h, z5.h\n"
+    ".inst 0x44854757  // smlalt z23.s, p4/M, z26.h, z5.h\n"
+    "ld1b { z26.h }, p3/Z, [x21, x7]\n"
+    ".inst 0x45531b5a  // usublb z26.h, z26.b, z19.b\n"
+    ".inst 0x448843ab  // smlalb z11.s, p4/M, z29.h, z8.h\n"
+    ".inst 0x448847a9  // smlalt z9.s, p4/M, z29.h, z8.h\n"
+    ".inst 0x448243b6  // smlalb z22.s, p4/M, z29.h, z2.h\n"
+    ".inst 0x448247b7  // smlalt z23.s, p4/M, z29.h, z2.h\n"
+    "ld1b { z29.h }, p3/Z, [x19, x7]\n"
+    "inch x7\n"
+    ".inst 0x04be756b  // sqrdmulh z11.s, z11.s, z30.s\n"
+    "whilelt p2.s, x7, x5\n"
+    ".inst 0x04bf7529  // sqrdmulh z9.s, z9.s, z31.s\n"
+    "mov x19, x7\n"
+    ".inst 0x44874372  // smlalb z18.s, p4/M, z27.h, z7.h\n"
+    ".inst 0x45531bbd  // usublb z29.h, z29.b, z19.b\n"
+    ".inst 0x4487476a  // smlalt z10.s, p4/M, z27.h, z7.h\n"
+    "incw x19\n"
+    ".inst 0x44834316  // smlalb z22.s, p4/M, z24.h, z3.h\n"
+    "whilelt p1.s, x19, x5\n"
+    "and z1.d, z11.d, z0.d\n"
+    "whilelt p3.h, x7, x5\n"
+    "and z17.d, z9.d, z21.d\n"
+    "asr z1.s, z1.s, #0x1f\n"
+    ".inst 0x44854312  // smlalb z18.s, p4/M, z24.h, z5.h\n"
+    ".inst 0x4485470a  // smlalt z10.s, p4/M, z24.h, z5.h\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    ".inst 0x44834717  // smlalt z23.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x44874356  // smlalb z22.s, p4/M, z26.h, z7.h\n"
+    ".inst 0x4482900d  // srshl z13.s, p4/M, z13.s, z0.s\n"
+    ".inst 0x44884332  // smlalb z18.s, p4/M, z25.h, z8.h\n"
+    "sqadd z11.s, z11.s, z1.s\n"
+    "sqadd z9.s, z9.s, z17.s\n"
+    "add z13.s, z13.s, z14.s\n"
+    ".inst 0x04be7652  // sqrdmulh z18.s, z18.s, z30.s\n"
+    ".inst 0x44874757  // smlalt z23.s, p4/M, z26.h, z7.h\n"
+    ".inst 0x4488472a  // smlalt z10.s, p4/M, z25.h, z8.h\n"
+    ".inst 0x44864336  // smlalb z22.s, p4/M, z25.h, z6.h\n"
+    "and z17.d, z18.d, z0.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    ".inst 0x04bf754a  // sqrdmulh z10.s, z10.s, z31.s\n"
+    ".inst 0x44864737  // smlalt z23.s, p4/M, z25.h, z6.h\n"
+    ".inst 0x448843b6  // smlalb z22.s, p4/M, z29.h, z8.h\n"
+    "smin z13.s, p4/M, z13.s, z15.s\n"
+    ".inst 0x448292b0  // srshl z16.s, p4/M, z16.s, z21.s\n"
+    "and z1.d, z10.d, z21.d\n"
+    "asr z1.s, z1.s, #0x1f\n"
+    "add z16.s, z16.s, z14.s\n"
+    "sqadd z18.s, z18.s, z17.s\n"
+    ".inst 0x04be76d6  // sqrdmulh z22.s, z22.s, z30.s\n"
+    ".inst 0x448847b7  // smlalt z23.s, p4/M, z29.h, z8.h\n"
+    "smax z13.s, p4/M, z13.s, z20.s\n"
+    "smin z16.s, p4/M, z16.s, z15.s\n"
+    "sqadd z10.s, z10.s, z1.s\n"
+    "and z2.d, z22.d, z0.d\n"
+    "asr z2.s, z2.s, #0x1f\n"
+    ".inst 0x04bf76f7  // sqrdmulh z23.s, z23.s, z31.s\n"
+    "smax z16.s, p4/M, z16.s, z20.s\n"
+    ".inst 0x4482900b  // srshl z11.s, p4/M, z11.s, z0.s\n"
+    ".inst 0x448292a9  // srshl z9.s, p4/M, z9.s, z21.s\n"
+    ".inst 0x44829012  // srshl z18.s, p4/M, z18.s, z0.s\n"
+    "trn1 z13.h, z13.h, z16.h\n"
+    "st1b { z13.h }, p0, [x14, x8]\n"
+    "add z11.s, z11.s, z14.s\n"
+    "add z9.s, z9.s, z14.s\n"
+    "add z18.s, z18.s, z14.s\n"
+    "sqadd z22.s, z22.s, z2.s\n"
+    "and z16.d, z23.d, z21.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "smin z11.s, p4/M, z11.s, z15.s\n"
+    "smin z9.s, p4/M, z9.s, z15.s\n"
+    "smin z18.s, p4/M, z18.s, z15.s\n"
+    ".inst 0x448292aa  // srshl z10.s, p4/M, z10.s, z21.s\n"
+    ".inst 0x44829016  // srshl z22.s, p4/M, z22.s, z0.s\n"
+    "smax z11.s, p4/M, z11.s, z20.s\n"
+    "sqadd z23.s, z23.s, z16.s\n"
+    "add z10.s, z10.s, z14.s\n"
+    "add z22.s, z22.s, z14.s\n"
+    "smax z9.s, p4/M, z9.s, z20.s\n"
+    "smax z18.s, p4/M, z18.s, z20.s\n"
+    "smin z10.s, p4/M, z10.s, z15.s\n"
+    "smin z22.s, p4/M, z22.s, z15.s\n"
+    "trn1 z11.h, z11.h, z9.h\n"
+    "st1b { z11.h }, p0, [x13, x8]\n"
+    "smax z10.s, p4/M, z10.s, z20.s\n"
+    ".inst 0x448292b7  // srshl z23.s, p4/M, z23.s, z21.s\n"
+    "smax z22.s, p4/M, z22.s, z20.s\n"
+    "trn1 z18.h, z18.h, z10.h\n"
+    "st1b { z18.h }, p0, [x12, x8]\n"
+    "add z23.s, z23.s, z14.s\n"
+    "smin z23.s, p4/M, z23.s, z15.s\n"
+    "smax z23.s, p4/M, z23.s, z20.s\n"
+    "trn1 z22.h, z22.h, z23.h\n"
+    "st1b { z22.h }, p0, [x11, x8]\n"
+    "inch x8\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z18.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z13.s, z18.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z16.s, z18.s, z16.s\n"
+    "mov z11.d, z13.d\n"
+    "ld1b { z0.h }, p4/Z, [x6]\n"
+    ".inst 0x454c1800  // usublb z0.h, z0.b, z12.b\n"
+    "mov z9.d, z16.d\n"
+    "ld1b { z1.h }, p4/Z, [x6, #1, MUL VL]\n"
+    "mov z18.d, z13.d\n"
+    "ld1b { z2.h }, p4/Z, [x6, #2, MUL VL]\n"
+    ".inst 0x454c1821  // usublb z1.h, z1.b, z12.b\n"
+    "mov z10.d, z16.d\n"
+    "ld1b { z3.h }, p4/Z, [x6, #3, MUL VL]\n"
+    "mov z22.d, z13.d\n"
+    "ld1b { z4.h }, p4/Z, [x6, #4, MUL VL]\n"
+    ".inst 0x454c1842  // usublb z2.h, z2.b, z12.b\n"
+    "mov z23.d, z16.d\n"
+    "ld1b { z5.h }, p4/Z, [x6, #5, MUL VL]\n"
+    ".inst 0x454c1863  // usublb z3.h, z3.b, z12.b\n"
+    "ld1b { z6.h }, p4/Z, [x6, #6, MUL VL]\n"
+    "ld1b { z7.h }, p4/Z, [x6, #7, MUL VL]\n"
+    ".inst 0x454c1884  // usublb z4.h, z4.b, z12.b\n"
+    "inch x6, ALL, MUL #8\n"
+    "ld1b { z8.h }, p4/Z, [x6]\n"
+    "ldp x26, x25, [x16, #0x0]\n"
+    ".inst 0x454c18a5  // usublb z5.h, z5.b, z12.b\n"
+    ".inst 0x454c18c6  // usublb z6.h, z6.b, z12.b\n"
+    "ldp x24, x23, [x16, #0x10]\n"
+    ".inst 0x454c18e7  // usublb z7.h, z7.b, z12.b\n"
+    ".inst 0x454c1908  // usublb z8.h, z8.b, z12.b\n"
+    "ldp x22, x21, [x16, #0x20]\n"
+    "ldp x20, x19, [x16, #0x30]\n"
+    "ld1b { z31.h }, p3/Z, [x26, x7]\n"
+    ".inst 0x45531bff  // usublb z31.h, z31.b, z19.b\n"
+    "ld1b { z30.h }, p3/Z, [x25, x7]\n"
+    "ld1b { z29.h }, p3/Z, [x24, x7]\n"
+    ".inst 0x45531bde  // usublb z30.h, z30.b, z19.b\n"
+    "ld1b { z28.h }, p3/Z, [x23, x7]\n"
+    "ld1b { z27.h }, p3/Z, [x22, x7]\n"
+    ".inst 0x45531bbd  // usublb z29.h, z29.b, z19.b\n"
+    "ld1b { z26.h }, p3/Z, [x21, x7]\n"
+    ".inst 0x45531b9c  // usublb z28.h, z28.b, z19.b\n"
+    "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+    "ld1b { z24.h }, p3/Z, [x19, x7]\n"
+    ".inst 0x45531b7b  // usublb z27.h, z27.b, z19.b\n"
+    ".inst 0x45531b5a  // usublb z26.h, z26.b, z19.b\n"
+    ".inst 0x45531b39  // usublb z25.h, z25.b, z19.b\n"
+    ".inst 0x45531b18  // usublb z24.h, z24.b, z19.b\n"
+    "b.any 1b\n"
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
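
[Editor's annotation, not part of the patch] Ahead of the requantization tail, the loop body widens every uint8 input and weight with usublb, which subtracts the a_offset and b_offset zero points while widening to 16 bits, and accumulates with smlalb/smlalt into 32-bit lanes seeded from the bias. A scalar model of one output point of this 3x3, stride-2 kernel; all names are illustrative:

#include <cstdint>

// One output element: x is the 5x5 input tile, w the 3x3 weights for a
// single channel; a_offset/b_offset come from arm_gemm::Requantize32.
int32_t accumulate_one_point(const uint8_t x[5][5], const uint8_t w[3][3],
                             int32_t bias, int32_t a_offset, int32_t b_offset,
                             unsigned out_i, unsigned out_j)
{
  int32_t acc = bias;
  for (unsigned ki = 0; ki < 3; ki++)
  {
    for (unsigned kj = 0; kj < 3; kj++)
    {
      // usublb model: zero-extend to 16 bits, then subtract the zero point.
      const int32_t xv = (int32_t) x[out_i * 2 + ki][out_j * 2 + kj] - a_offset;
      const int32_t wv = (int32_t) w[ki][kj] - b_offset;
      acc += xv * wv;  // smlalb/smlalt model: widening multiply-accumulate
    }
  }
  return acc;  // requantized per channel as in the scalar sketch earlier
}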
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..f025b08
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_5x5_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_5x5_mla::get_packed_size;
+
+  kern_type kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+  sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
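
[Editor's annotation, not part of the patch] The struct is consumed through its kernel member, whose signature is the kern_type typedef above. A hedged sketch of a direct call; every argument is a placeholder assembled by the surrounding driver, and the relevant headers are assumed included:

// 36 input pointers cover the 6x6 input tile (input_rows x input_cols);
// 4 output pointers cover the 2x2 output tile.
void run_tile(unsigned int n_channels,
              const uint8_t *const inptrs[36],
              const uint8_t *packed_weights,
              const int32_t *bias,
              const arm_gemm::Requantize32 &qp,
              const int32_t *requant_muls,
              const int32_t *requant_shifts,
              uint8_t *const outptrs[4])
{
  arm_conv::depthwise::sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst strat(nullptr);
  strat.kernel(n_channels, inptrs, packed_weights, bias, qp,
               requant_muls, requant_shifts, outptrs);
}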
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..9542318
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const uint8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    unsigned long n_channels;
+    const uint8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[36];
+
+    Params(
+      unsigned long n_channels,
+      const uint8_t *const *inptrs_raw,
+      const uint8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
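+      // Annotation (not in the original patch): the 36 pointers cover the
+      // 6x6 input tile; the first entries are permuted into the kernel's
+      // access order, while the tail keeps its row-major order.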
+      inptrs[0] = inptrs_raw[0];
+      inptrs[1] = inptrs_raw[1];
+      inptrs[2] = inptrs_raw[6];
+      inptrs[3] = inptrs_raw[7];
+      inptrs[4] = inptrs_raw[2];
+      inptrs[5] = inptrs_raw[8];
+      inptrs[6] = inptrs_raw[3];
+      inptrs[7] = inptrs_raw[4];
+      inptrs[8] = inptrs_raw[11];
+      inptrs[9] = inptrs_raw[12];
+      inptrs[10] = inptrs_raw[9];
+      inptrs[11] = inptrs_raw[10];
+      inptrs[12] = inptrs_raw[5];
+      inptrs[13] = inptrs_raw[13];
+      inptrs[14] = inptrs_raw[14];
+      inptrs[15] = inptrs_raw[15];
+      inptrs[16] = inptrs_raw[16];
+      inptrs[17] = inptrs_raw[17];
+      inptrs[18] = inptrs_raw[18];
+      inptrs[19] = inptrs_raw[19];
+      inptrs[20] = inptrs_raw[20];
+      inptrs[21] = inptrs_raw[21];
+      inptrs[22] = inptrs_raw[22];
+      inptrs[23] = inptrs_raw[23];
+      inptrs[24] = inptrs_raw[24];
+      inptrs[25] = inptrs_raw[25];
+      inptrs[26] = inptrs_raw[26];
+      inptrs[27] = inptrs_raw[27];
+      inptrs[28] = inptrs_raw[28];
+      inptrs[29] = inptrs_raw[29];
+      inptrs[30] = inptrs_raw[30];
+      inptrs[31] = inptrs_raw[31];
+      inptrs[32] = inptrs_raw[32];
+      inptrs[33] = inptrs_raw[33];
+      inptrs[34] = inptrs_raw[34];
+      inptrs[35] = inptrs_raw[35];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "ptrue p4.b\n"
+    "ldr x1, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x2, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "mov x3, #0x0\n"
+    "ldr x4, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1rb { z17.b }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1rb { z13.b }, p4/Z, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1rw { z14.s }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1rw { z5.s }, p4/Z, [x20]\n"
+    "whilelt p3.h, x2, x0\n"
+    "ld1rw { z15.s }, p4/Z, [x19]\n"
+    "whilelt p2.s, x2, x0\n"
+    "ldp x7, x8, [x21, #0x0]\n"
+    "mov x19, x2\n"
+    "incw x19\n"
+    "ldp x17, x16, [x21, #0x10]\n"
+    "whilelt p1.s, x19, x0\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z19.s }, p2/Z, [x19]\n"
+    "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z11.s, z19.s, z6.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z16.s, z19.s, z6.s\n"
+    "mov z19.d, z11.d\n"
+    "ld1b { z0.h }, p4/Z, [x1]\n"
+    ".inst 0x454d1800  // usublb z0.h, z0.b, z13.b\n"
+    "mov z9.d, z16.d\n"
+    "ld1b { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
+    "mov z7.d, z11.d\n"
+    "ld1b { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
+    ".inst 0x454d1821  // usublb z1.h, z1.b, z13.b\n"
+    "mov z6.d, z16.d\n"
+    "ld1b { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
+    "mov z12.d, z11.d\n"
+    "ld1b { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
+    ".inst 0x454d1842  // usublb z2.h, z2.b, z13.b\n"
+    "mov z8.d, z16.d\n"
+    "ldp x28, x27, [x5, #0x0]\n"
+    ".inst 0x454d1863  // usublb z3.h, z3.b, z13.b\n"
+    "ldp x26, x25, [x5, #0x10]\n"
+    ".inst 0x454d1884  // usublb z4.h, z4.b, z13.b\n"
+    "ldp x24, x23, [x5, #0x20]\n"
+    "ldp x22, x21, [x5, #0x30]\n"
+    "ldp x20, x19, [x5, #0x40]\n"
+    "ld1b { z31.h }, p3/Z, [x28, x2]\n"
+    ".inst 0x45511bff  // usublb z31.h, z31.b, z17.b\n"
+    "ld1b { z30.h }, p3/Z, [x27, x2]\n"
+    "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+    ".inst 0x45511bde  // usublb z30.h, z30.b, z17.b\n"
+    "ld1b { z28.h }, p3/Z, [x25, x2]\n"
+    "ld1b { z27.h }, p3/Z, [x24, x2]\n"
+    ".inst 0x45511bbd  // usublb z29.h, z29.b, z17.b\n"
+    "ld1b { z23.h }, p3/Z, [x23, x2]\n"
+    ".inst 0x45511b9c  // usublb z28.h, z28.b, z17.b\n"
+    "ld1b { z25.h }, p3/Z, [x22, x2]\n"
+    "ld1b { z24.h }, p3/Z, [x21, x2]\n"
+    ".inst 0x45511b7b  // usublb z27.h, z27.b, z17.b\n"
+    "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x45511af7  // usublb z23.h, z23.b, z17.b\n"
+    "ld1b { z22.h }, p3/Z, [x19, x2]\n"
+    ".inst 0x45511b39  // usublb z25.h, z25.b, z17.b\n"
+    ".inst 0x45511b18  // usublb z24.h, z24.b, z17.b\n"
+    ".inst 0x45511b5a  // usublb z26.h, z26.b, z17.b\n"
+    ".inst 0x45511ad6  // usublb z22.h, z22.b, z17.b\n"
+    "1:"  // Loop
+    ".inst 0x448043eb  // smlalb z11.s, p4/M, z31.h, z0.h\n"
+    "ldr x20, [x5, #0x50]\n"
+    "whilelt p0.h, x3, x0\n"
+    ".inst 0x448047f0  // smlalt z16.s, p4/M, z31.h, z0.h\n"
+    "ldr x19, [x5, #0x58]\n"
+    ".inst 0x448043d3  // smlalb z19.s, p4/M, z30.h, z0.h\n"
+    "ldr x25, [x5, #0x60]\n"
+    ".inst 0x448047c9  // smlalt z9.s, p4/M, z30.h, z0.h\n"
+    "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x45511bff  // usublb z31.h, z31.b, z17.b\n"
+    ".inst 0x448043a7  // smlalb z7.s, p4/M, z29.h, z0.h\n"
+    "ldr x24, [x5, #0x68]\n"
+    ".inst 0x448047a6  // smlalt z6.s, p4/M, z29.h, z0.h\n"
+    "ldr x23, [x5, #0x70]\n"
+    ".inst 0x4480438c  // smlalb z12.s, p4/M, z28.h, z0.h\n"
+    "ldr x22, [x5, #0x78]\n"
+    ".inst 0x44804788  // smlalt z8.s, p4/M, z28.h, z0.h\n"
+    "ld1b { z0.h }, p4/Z, [x1, #5, MUL VL]\n"
+    ".inst 0x454d1800  // usublb z0.h, z0.b, z13.b\n"
+    ".inst 0x448143cb  // smlalb z11.s, p4/M, z30.h, z1.h\n"
+    "ldr x15, [x5, #0x80]\n"
+    ".inst 0x448147d0  // smlalt z16.s, p4/M, z30.h, z1.h\n"
+    "ld1b { z30.h }, p3/Z, [x19, x2]\n"
+    ".inst 0x45511bde  // usublb z30.h, z30.b, z17.b\n"
+    ".inst 0x44814373  // smlalb z19.s, p4/M, z27.h, z1.h\n"
+    "ldr x21, [x5, #0x88]\n"
+    ".inst 0x44814769  // smlalt z9.s, p4/M, z27.h, z1.h\n"
+    "ldr x20, [x5, #0x90]\n"
+    ".inst 0x44814387  // smlalb z7.s, p4/M, z28.h, z1.h\n"
+    "ldr x19, [x5, #0x98]\n"
+    ".inst 0x44814786  // smlalt z6.s, p4/M, z28.h, z1.h\n"
+    "ldr x14, [x5, #0xa0]\n"
+    ".inst 0x448142ec  // smlalb z12.s, p4/M, z23.h, z1.h\n"
+    "ldr x13, [x5, #0xa8]\n"
+    ".inst 0x448146e8  // smlalt z8.s, p4/M, z23.h, z1.h\n"
+    "ld1b { z1.h }, p4/Z, [x1, #6, MUL VL]\n"
+    ".inst 0x454d1821  // usublb z1.h, z1.b, z13.b\n"
+    ".inst 0x4482436b  // smlalb z11.s, p4/M, z27.h, z2.h\n"
+    "ldr x12, [x5, #0xb0]\n"
+    ".inst 0x44824770  // smlalt z16.s, p4/M, z27.h, z2.h\n"
+    "ld1b { z27.h }, p3/Z, [x25, x2]\n"
+    ".inst 0x45511b7b  // usublb z27.h, z27.b, z17.b\n"
+    ".inst 0x44824333  // smlalb z19.s, p4/M, z25.h, z2.h\n"
+    "ldr x11, [x5, #0xb8]\n"
+    ".inst 0x44824729  // smlalt z9.s, p4/M, z25.h, z2.h\n"
+    "ldr x10, [x5, #0xc0]\n"
+    ".inst 0x448242e7  // smlalb z7.s, p4/M, z23.h, z2.h\n"
+    "ldr x9, [x5, #0xc8]\n"
+    ".inst 0x448246e6  // smlalt z6.s, p4/M, z23.h, z2.h\n"
+    "ldr x28, [x5, #0xd0]\n"
+    ".inst 0x448243ec  // smlalb z12.s, p4/M, z31.h, z2.h\n"
+    "ldr x27, [x5, #0xd8]\n"
+    ".inst 0x448247e8  // smlalt z8.s, p4/M, z31.h, z2.h\n"
+    "ld1b { z2.h }, p4/Z, [x1, #7, MUL VL]\n"
+    "inch x1, ALL, MUL #8\n"
+    ".inst 0x4483432b  // smlalb z11.s, p4/M, z25.h, z3.h\n"
+    "ldr x26, [x5, #0xe0]\n"
+    ".inst 0x454d1842  // usublb z2.h, z2.b, z13.b\n"
+    ".inst 0x44834730  // smlalt z16.s, p4/M, z25.h, z3.h\n"
+    "ld1b { z25.h }, p3/Z, [x24, x2]\n"
+    ".inst 0x44834313  // smlalb z19.s, p4/M, z24.h, z3.h\n"
+    "ldr x25, [x5, #0xe8]\n"
+    ".inst 0x45511b39  // usublb z25.h, z25.b, z17.b\n"
+    ".inst 0x44834709  // smlalt z9.s, p4/M, z24.h, z3.h\n"
+    "ld1w { z18.s }, p2/Z, [x4]\n"
+    ".inst 0x448343e7  // smlalb z7.s, p4/M, z31.h, z3.h\n"
+    "ld1w { z20.s }, p1/Z, [x4, #1, MUL VL]\n"
+    "addvl x4, x4, #2\n"
+    ".inst 0x448347e6  // smlalt z6.s, p4/M, z31.h, z3.h\n"
+    ".inst 0x448343cc  // smlalb z12.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x448347c8  // smlalt z8.s, p4/M, z30.h, z3.h\n"
+    "ld1b { z3.h }, p4/Z, [x1]\n"
+    ".inst 0x454d1863  // usublb z3.h, z3.b, z13.b\n"
+    "uzp1 z21.s, z18.s, z20.s\n"
+    "uzp2 z10.s, z18.s, z20.s\n"
+    "ld1w { z18.s }, p2/Z, [x6]\n"
+    ".inst 0x4484430b  // smlalb z11.s, p4/M, z24.h, z4.h\n"
+    "ld1w { z20.s }, p1/Z, [x6, #1, MUL VL]\n"
+    "addvl x6, x6, #2\n"
+    ".inst 0x44844710  // smlalt z16.s, p4/M, z24.h, z4.h\n"
+    "ld1b { z24.h }, p3/Z, [x23, x2]\n"
+    ".inst 0x45511b18  // usublb z24.h, z24.b, z17.b\n"
+    ".inst 0x44844373  // smlalb z19.s, p4/M, z27.h, z4.h\n"
+    "ldr x24, [x5, #0xf0]\n"
+    ".inst 0x44844769  // smlalt z9.s, p4/M, z27.h, z4.h\n"
+    "ld1b { z27.h }, p3/Z, [x22, x2]\n"
+    ".inst 0x45511b7b  // usublb z27.h, z27.b, z17.b\n"
+    ".inst 0x448443c7  // smlalb z7.s, p4/M, z30.h, z4.h\n"
+    "ldr x23, [x5, #0xf8]\n"
+    ".inst 0x448447c6  // smlalt z6.s, p4/M, z30.h, z4.h\n"
+    ".inst 0x4484434c  // smlalb z12.s, p4/M, z26.h, z4.h\n"
+    ".inst 0x44844748  // smlalt z8.s, p4/M, z26.h, z4.h\n"
+    "ld1b { z4.h }, p4/Z, [x1, #1, MUL VL]\n"
+    ".inst 0x454d1884  // usublb z4.h, z4.b, z13.b\n"
+    ".inst 0x448043ab  // smlalb z11.s, p4/M, z29.h, z0.h\n"
+    ".inst 0x448047b0  // smlalt z16.s, p4/M, z29.h, z0.h\n"
+    "uzp1 z29.s, z18.s, z20.s\n"
+    "uzp2 z20.s, z18.s, z20.s\n"
+    ".inst 0x44804393  // smlalb z19.s, p4/M, z28.h, z0.h\n"
+    ".inst 0x44804789  // smlalt z9.s, p4/M, z28.h, z0.h\n"
+    ".inst 0x448042c7  // smlalb z7.s, p4/M, z22.h, z0.h\n"
+    ".inst 0x448046c6  // smlalt z6.s, p4/M, z22.h, z0.h\n"
+    ".inst 0x4480432c  // smlalb z12.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x44804728  // smlalt z8.s, p4/M, z25.h, z0.h\n"
+    "ld1b { z0.h }, p4/Z, [x1, #2, MUL VL]\n"
+    ".inst 0x454d1800  // usublb z0.h, z0.b, z13.b\n"
+    ".inst 0x4481438b  // smlalb z11.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x44814790  // smlalt z16.s, p4/M, z28.h, z1.h\n"
+    "ld1b { z28.h }, p3/Z, [x21, x2]\n"
+    ".inst 0x45511b9c  // usublb z28.h, z28.b, z17.b\n"
+    ".inst 0x448142f3  // smlalb z19.s, p4/M, z23.h, z1.h\n"
+    "ldr x22, [x5, #0x100]\n"
+    ".inst 0x448146e9  // smlalt z9.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x44814327  // smlalb z7.s, p4/M, z25.h, z1.h\n"
+    ".inst 0x44814726  // smlalt z6.s, p4/M, z25.h, z1.h\n"
+    ".inst 0x4481430c  // smlalb z12.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x44814708  // smlalt z8.s, p4/M, z24.h, z1.h\n"
+    "ld1b { z1.h }, p4/Z, [x1, #3, MUL VL]\n"
+    ".inst 0x454d1821  // usublb z1.h, z1.b, z13.b\n"
+    ".inst 0x448242eb  // smlalb z11.s, p4/M, z23.h, z2.h\n"
+    ".inst 0x448246f0  // smlalt z16.s, p4/M, z23.h, z2.h\n"
+    "ld1b { z23.h }, p3/Z, [x15, x2]\n"
+    ".inst 0x45511af7  // usublb z23.h, z23.b, z17.b\n"
+    ".inst 0x448243f3  // smlalb z19.s, p4/M, z31.h, z2.h\n"
+    "ldr x21, [x5, #0x108]\n"
+    ".inst 0x448247e9  // smlalt z9.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x44824307  // smlalb z7.s, p4/M, z24.h, z2.h\n"
+    ".inst 0x44824706  // smlalt z6.s, p4/M, z24.h, z2.h\n"
+    ".inst 0x4482436c  // smlalb z12.s, p4/M, z27.h, z2.h\n"
+    ".inst 0x44824768  // smlalt z8.s, p4/M, z27.h, z2.h\n"
+    "ld1b { z2.h }, p4/Z, [x1, #4, MUL VL]\n"
+    ".inst 0x454d1842  // usublb z2.h, z2.b, z13.b\n"
+    ".inst 0x448343eb  // smlalb z11.s, p4/M, z31.h, z3.h\n"
+    ".inst 0x448347f0  // smlalt z16.s, p4/M, z31.h, z3.h\n"
+    "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x45511bff  // usublb z31.h, z31.b, z17.b\n"
+    ".inst 0x448343d3  // smlalb z19.s, p4/M, z30.h, z3.h\n"
+    "ldr x20, [x5, #0x110]\n"
+    ".inst 0x448347c9  // smlalt z9.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x44834367  // smlalb z7.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x44834766  // smlalt z6.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x448342ec  // smlalb z12.s, p4/M, z23.h, z3.h\n"
+    ".inst 0x448346e8  // smlalt z8.s, p4/M, z23.h, z3.h\n"
+    "ld1b { z3.h }, p4/Z, [x1, #5, MUL VL]\n"
+    ".inst 0x454d1863  // usublb z3.h, z3.b, z13.b\n"
+    ".inst 0x448443cb  // smlalb z11.s, p4/M, z30.h, z4.h\n"
+    ".inst 0x448447d0  // smlalt z16.s, p4/M, z30.h, z4.h\n"
+    "ld1b { z30.h }, p3/Z, [x19, x2]\n"
+    ".inst 0x45511bde  // usublb z30.h, z30.b, z17.b\n"
+    ".inst 0x44844353  // smlalb z19.s, p4/M, z26.h, z4.h\n"
+    "ldr x19, [x5, #0x118]\n"
+    ".inst 0x44844749  // smlalt z9.s, p4/M, z26.h, z4.h\n"
+    "ld1b { z26.h }, p3/Z, [x14, x2]\n"
+    ".inst 0x45511b5a  // usublb z26.h, z26.b, z17.b\n"
+    ".inst 0x448442e7  // smlalb z7.s, p4/M, z23.h, z4.h\n"
+    ".inst 0x448446e6  // smlalt z6.s, p4/M, z23.h, z4.h\n"
+    ".inst 0x4484438c  // smlalb z12.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844788  // smlalt z8.s, p4/M, z28.h, z4.h\n"
+    "ld1b { z4.h }, p4/Z, [x1, #6, MUL VL]\n"
+    ".inst 0x454d1884  // usublb z4.h, z4.b, z13.b\n"
+    ".inst 0x448042cb  // smlalb z11.s, p4/M, z22.h, z0.h\n"
+    ".inst 0x448046d0  // smlalt z16.s, p4/M, z22.h, z0.h\n"
+    "ld1b { z22.h }, p3/Z, [x11, x2]\n"
+    ".inst 0x45511ad6  // usublb z22.h, z22.b, z17.b\n"
+    ".inst 0x44804333  // smlalb z19.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x44804729  // smlalt z9.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x448043e7  // smlalb z7.s, p4/M, z31.h, z0.h\n"
+    ".inst 0x448047e6  // smlalt z6.s, p4/M, z31.h, z0.h\n"
+    ".inst 0x448043cc  // smlalb z12.s, p4/M, z30.h, z0.h\n"
+    ".inst 0x448047c8  // smlalt z8.s, p4/M, z30.h, z0.h\n"
+    "ld1b { z0.h }, p4/Z, [x1, #7, MUL VL]\n"
+    "inch x1, ALL, MUL #8\n"
+    ".inst 0x4481432b  // smlalb z11.s, p4/M, z25.h, z1.h\n"
+    ".inst 0x454d1800  // usublb z0.h, z0.b, z13.b\n"
+    ".inst 0x44814730  // smlalt z16.s, p4/M, z25.h, z1.h\n"
+    "ld1b { z25.h }, p3/Z, [x13, x2]\n"
+    ".inst 0x44814313  // smlalb z19.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x45511b39  // usublb z25.h, z25.b, z17.b\n"
+    ".inst 0x44814709  // smlalt z9.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x448143c7  // smlalb z7.s, p4/M, z30.h, z1.h\n"
+    ".inst 0x448147c6  // smlalt z6.s, p4/M, z30.h, z1.h\n"
+    ".inst 0x4481434c  // smlalb z12.s, p4/M, z26.h, z1.h\n"
+    ".inst 0x44814748  // smlalt z8.s, p4/M, z26.h, z1.h\n"
+    "ld1b { z1.h }, p4/Z, [x1]\n"
+    ".inst 0x454d1821  // usublb z1.h, z1.b, z13.b\n"
+    ".inst 0x4482430b  // smlalb z11.s, p4/M, z24.h, z2.h\n"
+    ".inst 0x44824710  // smlalt z16.s, p4/M, z24.h, z2.h\n"
+    "ld1b { z24.h }, p3/Z, [x12, x2]\n"
+    ".inst 0x45511b18  // usublb z24.h, z24.b, z17.b\n"
+    ".inst 0x44824373  // smlalb z19.s, p4/M, z27.h, z2.h\n"
+    ".inst 0x44824769  // smlalt z9.s, p4/M, z27.h, z2.h\n"
+    ".inst 0x44824347  // smlalb z7.s, p4/M, z26.h, z2.h\n"
+    ".inst 0x44824746  // smlalt z6.s, p4/M, z26.h, z2.h\n"
+    ".inst 0x4482432c  // smlalb z12.s, p4/M, z25.h, z2.h\n"
+    ".inst 0x44824728  // smlalt z8.s, p4/M, z25.h, z2.h\n"
+    "ld1b { z2.h }, p4/Z, [x1, #1, MUL VL]\n"
+    ".inst 0x454d1842  // usublb z2.h, z2.b, z13.b\n"
+    ".inst 0x4483436b  // smlalb z11.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x44834770  // smlalt z16.s, p4/M, z27.h, z3.h\n"
+    "ld1b { z27.h }, p3/Z, [x10, x2]\n"
+    ".inst 0x45511b7b  // usublb z27.h, z27.b, z17.b\n"
+    ".inst 0x448342f3  // smlalb z19.s, p4/M, z23.h, z3.h\n"
+    ".inst 0x448346e9  // smlalt z9.s, p4/M, z23.h, z3.h\n"
+    ".inst 0x44834327  // smlalb z7.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x44834726  // smlalt z6.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x4483430c  // smlalb z12.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x44834708  // smlalt z8.s, p4/M, z24.h, z3.h\n"
+    "ld1b { z3.h }, p4/Z, [x1, #2, MUL VL]\n"
+    ".inst 0x454d1863  // usublb z3.h, z3.b, z13.b\n"
+    ".inst 0x448442eb  // smlalb z11.s, p4/M, z23.h, z4.h\n"
+    ".inst 0x448446f0  // smlalt z16.s, p4/M, z23.h, z4.h\n"
+    "ld1b { z23.h }, p3/Z, [x9, x2]\n"
+    ".inst 0x45511af7  // usublb z23.h, z23.b, z17.b\n"
+    ".inst 0x44844393  // smlalb z19.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844789  // smlalt z9.s, p4/M, z28.h, z4.h\n"
+    "ld1b { z28.h }, p3/Z, [x26, x2]\n"
+    ".inst 0x45511b9c  // usublb z28.h, z28.b, z17.b\n"
+    ".inst 0x44844307  // smlalb z7.s, p4/M, z24.h, z4.h\n"
+    ".inst 0x44844706  // smlalt z6.s, p4/M, z24.h, z4.h\n"
+    ".inst 0x448442cc  // smlalb z12.s, p4/M, z22.h, z4.h\n"
+    ".inst 0x448446c8  // smlalt z8.s, p4/M, z22.h, z4.h\n"
+    "ld1b { z4.h }, p4/Z, [x1, #3, MUL VL]\n"
+    ".inst 0x454d1884  // usublb z4.h, z4.b, z13.b\n"
+    ".inst 0x448043eb  // smlalb z11.s, p4/M, z31.h, z0.h\n"
+    ".inst 0x448047f0  // smlalt z16.s, p4/M, z31.h, z0.h\n"
+    "ld1b { z31.h }, p3/Z, [x28, x2]\n"
+    ".inst 0x45511bff  // usublb z31.h, z31.b, z17.b\n"
+    ".inst 0x448043d3  // smlalb z19.s, p4/M, z30.h, z0.h\n"
+    ".inst 0x448047c9  // smlalt z9.s, p4/M, z30.h, z0.h\n"
+    ".inst 0x44804367  // smlalb z7.s, p4/M, z27.h, z0.h\n"
+    ".inst 0x44804766  // smlalt z6.s, p4/M, z27.h, z0.h\n"
+    ".inst 0x448042ec  // smlalb z12.s, p4/M, z23.h, z0.h\n"
+    ".inst 0x448046e8  // smlalt z8.s, p4/M, z23.h, z0.h\n"
+    "ld1b { z0.h }, p4/Z, [x1, #4, MUL VL]\n"
+    ".inst 0x454d1800  // usublb z0.h, z0.b, z13.b\n"
+    ".inst 0x448143cb  // smlalb z11.s, p4/M, z30.h, z1.h\n"
+    ".inst 0x448147d0  // smlalt z16.s, p4/M, z30.h, z1.h\n"
+    "ld1b { z30.h }, p3/Z, [x27, x2]\n"
+    ".inst 0x45511bde  // usublb z30.h, z30.b, z17.b\n"
+    ".inst 0x44814353  // smlalb z19.s, p4/M, z26.h, z1.h\n"
+    ".inst 0x44814749  // smlalt z9.s, p4/M, z26.h, z1.h\n"
+    ".inst 0x448142e7  // smlalb z7.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x448146e6  // smlalt z6.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x448143ec  // smlalb z12.s, p4/M, z31.h, z1.h\n"
+    ".inst 0x448147e8  // smlalt z8.s, p4/M, z31.h, z1.h\n"
+    "ld1b { z1.h }, p4/Z, [x1, #5, MUL VL]\n"
+    ".inst 0x454d1821  // usublb z1.h, z1.b, z13.b\n"
+    ".inst 0x4482434b  // smlalb z11.s, p4/M, z26.h, z2.h\n"
+    ".inst 0x44824750  // smlalt z16.s, p4/M, z26.h, z2.h\n"
+    "ld1b { z26.h }, p3/Z, [x25, x2]\n"
+    ".inst 0x45511b5a  // usublb z26.h, z26.b, z17.b\n"
+    ".inst 0x44824333  // smlalb z19.s, p4/M, z25.h, z2.h\n"
+    ".inst 0x44824729  // smlalt z9.s, p4/M, z25.h, z2.h\n"
+    ".inst 0x448243e7  // smlalb z7.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448247e6  // smlalt z6.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448243cc  // smlalb z12.s, p4/M, z30.h, z2.h\n"
+    ".inst 0x448247c8  // smlalt z8.s, p4/M, z30.h, z2.h\n"
+    "ld1b { z2.h }, p4/Z, [x1, #6, MUL VL]\n"
+    ".inst 0x454d1842  // usublb z2.h, z2.b, z13.b\n"
+    ".inst 0x4483432b  // smlalb z11.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x44834730  // smlalt z16.s, p4/M, z25.h, z3.h\n"
+    "ld1b { z25.h }, p3/Z, [x24, x2]\n"
+    ".inst 0x45511b39  // usublb z25.h, z25.b, z17.b\n"
+    ".inst 0x44834313  // smlalb z19.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x44834709  // smlalt z9.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x448343c7  // smlalb z7.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x448347c6  // smlalt z6.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x4483438c  // smlalb z12.s, p4/M, z28.h, z3.h\n"
+    ".inst 0x44834788  // smlalt z8.s, p4/M, z28.h, z3.h\n"
+    "ld1b { z3.h }, p4/Z, [x1, #7, MUL VL]\n"
+    "inch x1, ALL, MUL #8\n"
+    ".inst 0x4484430b  // smlalb z11.s, p4/M, z24.h, z4.h\n"
+    ".inst 0x454d1863  // usublb z3.h, z3.b, z13.b\n"
+    ".inst 0x44844710  // smlalt z16.s, p4/M, z24.h, z4.h\n"
+    "ld1b { z24.h }, p3/Z, [x23, x2]\n"
+    ".inst 0x448442d3  // smlalb z19.s, p4/M, z22.h, z4.h\n"
+    ".inst 0x45511b18  // usublb z24.h, z24.b, z17.b\n"
+    ".inst 0x448446c9  // smlalt z9.s, p4/M, z22.h, z4.h\n"
+    ".inst 0x44844387  // smlalb z7.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844786  // smlalt z6.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x4484434c  // smlalb z12.s, p4/M, z26.h, z4.h\n"
+    ".inst 0x44844748  // smlalt z8.s, p4/M, z26.h, z4.h\n"
+    "ld1b { z4.h }, p4/Z, [x1]\n"
+    "inch x1\n"
+    ".inst 0x4480436b  // smlalb z11.s, p4/M, z27.h, z0.h\n"
+    ".inst 0x454d1884  // usublb z4.h, z4.b, z13.b\n"
+    ".inst 0x44804770  // smlalt z16.s, p4/M, z27.h, z0.h\n"
+    "ld1b { z27.h }, p3/Z, [x22, x2]\n"
+    ".inst 0x448042f3  // smlalb z19.s, p4/M, z23.h, z0.h\n"
+    ".inst 0x45511b7b  // usublb z27.h, z27.b, z17.b\n"
+    ".inst 0x448046e9  // smlalt z9.s, p4/M, z23.h, z0.h\n"
+    ".inst 0x44804327  // smlalb z7.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x44804726  // smlalt z6.s, p4/M, z25.h, z0.h\n"
+    "ld1b { z25.h }, p3/Z, [x21, x2]\n"
+    ".inst 0x45511b39  // usublb z25.h, z25.b, z17.b\n"
+    ".inst 0x4480430c  // smlalb z12.s, p4/M, z24.h, z0.h\n"
+    ".inst 0x44804708  // smlalt z8.s, p4/M, z24.h, z0.h\n"
+    ".inst 0x448142eb  // smlalb z11.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x448146f0  // smlalt z16.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x448143f3  // smlalb z19.s, p4/M, z31.h, z1.h\n"
+    ".inst 0x448147e9  // smlalt z9.s, p4/M, z31.h, z1.h\n"
+    ".inst 0x44814307  // smlalb z7.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x44814706  // smlalt z6.s, p4/M, z24.h, z1.h\n"
+    "ld1b { z24.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x45511b18  // usublb z24.h, z24.b, z17.b\n"
+    ".inst 0x4481436c  // smlalb z12.s, p4/M, z27.h, z1.h\n"
+    ".inst 0x44814768  // smlalt z8.s, p4/M, z27.h, z1.h\n"
+    ".inst 0x448243eb  // smlalb z11.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448247f0  // smlalt z16.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448243d3  // smlalb z19.s, p4/M, z30.h, z2.h\n"
+    ".inst 0x448247c9  // smlalt z9.s, p4/M, z30.h, z2.h\n"
+    ".inst 0x44824367  // smlalb z7.s, p4/M, z27.h, z2.h\n"
+    ".inst 0x44824766  // smlalt z6.s, p4/M, z27.h, z2.h\n"
+    "ld1b { z27.h }, p3/Z, [x19, x2]\n"
+    "inch x2\n"
+    ".inst 0x4482432c  // smlalb z12.s, p4/M, z25.h, z2.h\n"
+    "whilelt p2.s, x2, x0\n"
+    ".inst 0x44824728  // smlalt z8.s, p4/M, z25.h, z2.h\n"
+    "mov x19, x2\n"
+    ".inst 0x448343cb  // smlalb z11.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x45511b7b  // usublb z27.h, z27.b, z17.b\n"
+    ".inst 0x448347d0  // smlalt z16.s, p4/M, z30.h, z3.h\n"
+    "incw x19\n"
+    ".inst 0x44834393  // smlalb z19.s, p4/M, z28.h, z3.h\n"
+    "whilelt p1.s, x19, x0\n"
+    ".inst 0x44834789  // smlalt z9.s, p4/M, z28.h, z3.h\n"
+    "whilelt p3.h, x2, x0\n"
+    ".inst 0x44834327  // smlalb z7.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x44834726  // smlalt z6.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x4483430c  // smlalb z12.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x44834708  // smlalt z8.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x4484438b  // smlalb z11.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844790  // smlalt z16.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844353  // smlalb z19.s, p4/M, z26.h, z4.h\n"
+    ".inst 0x44844749  // smlalt z9.s, p4/M, z26.h, z4.h\n"
+    ".inst 0x04b5756b  // sqrdmulh z11.s, z11.s, z21.s\n"
+    ".inst 0x04aa7610  // sqrdmulh z16.s, z16.s, z10.s\n"
+    ".inst 0x04b57673  // sqrdmulh z19.s, z19.s, z21.s\n"
+    ".inst 0x04aa7529  // sqrdmulh z9.s, z9.s, z10.s\n"
+    "and z31.d, z11.d, z29.d\n"
+    "asr z31.s, z31.s, #0x1f\n"
+    "and z23.d, z16.d, z20.d\n"
+    "and z25.d, z19.d, z29.d\n"
+    "asr z23.s, z23.s, #0x1f\n"
+    "and z18.d, z9.d, z20.d\n"
+    ".inst 0x44844307  // smlalb z7.s, p4/M, z24.h, z4.h\n"
+    "asr z25.s, z25.s, #0x1f\n"
+    ".inst 0x44844706  // smlalt z6.s, p4/M, z24.h, z4.h\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sqadd z11.s, z11.s, z31.s\n"
+    ".inst 0x4484436c  // smlalb z12.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x04b574e7  // sqrdmulh z7.s, z7.s, z21.s\n"
+    "sqadd z16.s, z16.s, z23.s\n"
+    "sqadd z19.s, z19.s, z25.s\n"
+    ".inst 0x04aa74c6  // sqrdmulh z6.s, z6.s, z10.s\n"
+    "sqadd z9.s, z9.s, z18.s\n"
+    "and z1.d, z7.d, z29.d\n"
+    "asr z1.s, z1.s, #0x1f\n"
+    "and z18.d, z6.d, z20.d\n"
+    ".inst 0x04b5758c  // sqrdmulh z12.s, z12.s, z21.s\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    ".inst 0x44844768  // smlalt z8.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x448293ab  // srshl z11.s, p4/M, z11.s, z29.s\n"
+    "and z30.d, z12.d, z29.d\n"
+    "asr z30.s, z30.s, #0x1f\n"
+    "add z11.s, z11.s, z14.s\n"
+    "sqadd z7.s, z7.s, z1.s\n"
+    "sqadd z6.s, z6.s, z18.s\n"
+    ".inst 0x04aa7508  // sqrdmulh z8.s, z8.s, z10.s\n"
+    "smin z11.s, p4/M, z11.s, z15.s\n"
+    ".inst 0x44829290  // srshl z16.s, p4/M, z16.s, z20.s\n"
+    "sqadd z12.s, z12.s, z30.s\n"
+    "and z3.d, z8.d, z20.d\n"
+    "asr z3.s, z3.s, #0x1f\n"
+    "add z16.s, z16.s, z14.s\n"
+    "smax z11.s, p4/M, z11.s, z5.s\n"
+    ".inst 0x448293b3  // srshl z19.s, p4/M, z19.s, z29.s\n"
+    ".inst 0x44829289  // srshl z9.s, p4/M, z9.s, z20.s\n"
+    "smin z16.s, p4/M, z16.s, z15.s\n"
+    ".inst 0x448293a7  // srshl z7.s, p4/M, z7.s, z29.s\n"
+    "add z19.s, z19.s, z14.s\n"
+    "add z9.s, z9.s, z14.s\n"
+    "sqadd z8.s, z8.s, z3.s\n"
+    "add z7.s, z7.s, z14.s\n"
+    "smax z16.s, p4/M, z16.s, z5.s\n"
+    "smin z19.s, p4/M, z19.s, z15.s\n"
+    "smin z9.s, p4/M, z9.s, z15.s\n"
+    "smin z7.s, p4/M, z7.s, z15.s\n"
+    "trn1 z11.h, z11.h, z16.h\n"
+    "st1b { z11.h }, p0, [x7, x3]\n"
+    "smax z19.s, p4/M, z19.s, z5.s\n"
+    "smax z9.s, p4/M, z9.s, z5.s\n"
+    "smax z7.s, p4/M, z7.s, z5.s\n"
+    ".inst 0x44829286  // srshl z6.s, p4/M, z6.s, z20.s\n"
+    ".inst 0x448293ac  // srshl z12.s, p4/M, z12.s, z29.s\n"
+    "trn1 z19.h, z19.h, z9.h\n"
+    "st1b { z19.h }, p0, [x8, x3]\n"
+    "add z6.s, z6.s, z14.s\n"
+    ".inst 0x44829288  // srshl z8.s, p4/M, z8.s, z20.s\n"
+    "add z12.s, z12.s, z14.s\n"
+    "smin z6.s, p4/M, z6.s, z15.s\n"
+    "add z8.s, z8.s, z14.s\n"
+    "smin z12.s, p4/M, z12.s, z15.s\n"
+    "smax z6.s, p4/M, z6.s, z5.s\n"
+    "smin z8.s, p4/M, z8.s, z15.s\n"
+    "smax z12.s, p4/M, z12.s, z5.s\n"
+    "trn1 z7.h, z7.h, z6.h\n"
+    "st1b { z7.h }, p0, [x17, x3]\n"
+    "smax z8.s, p4/M, z8.s, z5.s\n"
+    "trn1 z12.h, z12.h, z8.h\n"
+    "st1b { z12.h }, p0, [x16, x3]\n"
+    "inch x3\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z19.s }, p2/Z, [x19]\n"
+    "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z11.s, z19.s, z6.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z16.s, z19.s, z6.s\n"
+    "mov z19.d, z11.d\n"
+    "ld1b { z0.h }, p4/Z, [x1]\n"
+    ".inst 0x454d1800  // usublb z0.h, z0.b, z13.b\n"
+    "mov z9.d, z16.d\n"
+    "ld1b { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
+    "mov z7.d, z11.d\n"
+    "ld1b { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
+    ".inst 0x454d1821  // usublb z1.h, z1.b, z13.b\n"
+    "mov z6.d, z16.d\n"
+    "ld1b { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
+    "mov z12.d, z11.d\n"
+    "ld1b { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
+    ".inst 0x454d1842  // usublb z2.h, z2.b, z13.b\n"
+    "mov z8.d, z16.d\n"
+    "ldp x28, x27, [x5, #0x0]\n"
+    ".inst 0x454d1863  // usublb z3.h, z3.b, z13.b\n"
+    "ldp x26, x25, [x5, #0x10]\n"
+    ".inst 0x454d1884  // usublb z4.h, z4.b, z13.b\n"
+    "ldp x24, x23, [x5, #0x20]\n"
+    "ldp x22, x21, [x5, #0x30]\n"
+    "ldp x20, x19, [x5, #0x40]\n"
+    "ld1b { z31.h }, p3/Z, [x28, x2]\n"
+    ".inst 0x45511bff  // usublb z31.h, z31.b, z17.b\n"
+    "ld1b { z30.h }, p3/Z, [x27, x2]\n"
+    "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+    ".inst 0x45511bde  // usublb z30.h, z30.b, z17.b\n"
+    "ld1b { z28.h }, p3/Z, [x25, x2]\n"
+    "ld1b { z27.h }, p3/Z, [x24, x2]\n"
+    ".inst 0x45511bbd  // usublb z29.h, z29.b, z17.b\n"
+    "ld1b { z23.h }, p3/Z, [x23, x2]\n"
+    ".inst 0x45511b9c  // usublb z28.h, z28.b, z17.b\n"
+    "ld1b { z25.h }, p3/Z, [x22, x2]\n"
+    "ld1b { z24.h }, p3/Z, [x21, x2]\n"
+    ".inst 0x45511b7b  // usublb z27.h, z27.b, z17.b\n"
+    "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x45511af7  // usublb z23.h, z23.b, z17.b\n"
+    "ld1b { z22.h }, p3/Z, [x19, x2]\n"
+    ".inst 0x45511b39  // usublb z25.h, z25.b, z17.b\n"
+    ".inst 0x45511b18  // usublb z24.h, z24.b, z17.b\n"
+    ".inst 0x45511b5a  // usublb z26.h, z26.b, z17.b\n"
+    ".inst 0x45511ad6  // usublb z22.h, z22.b, z17.b\n"
+    "b.any 1b\n"
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
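The epilogue of the kernel above follows the standard fixed-point requantisation pattern: SQRDMULH against the per-channel multiplier, an AND/ASR/SQADD sign fixup, a rounding shift (SRSHL with a negative shift), addition of the c_offset zero point, SMIN/SMAX clamping, and a TRN1 + ST1B narrowing store. A minimal scalar sketch of one lane, assuming the usual gemmlowp-style convention (names here are illustrative, not ACL API; 'right_shift' is the magnitude of the negative value held in the SRSHL operand register):

    #include <algorithm>
    #include <cstdint>

    static inline uint8_t requantize_lane(int32_t acc, int32_t multiplier,
                                          int32_t right_shift, int32_t c_offset,
                                          int32_t minval, int32_t maxval)
    {
      // SQRDMULH: saturating rounding doubling multiply, returning the high half.
      const int64_t prod = (int64_t)acc * (int64_t)multiplier;
      int32_t v = (acc == INT32_MIN && multiplier == INT32_MIN)
                ? INT32_MAX
                : (int32_t)((prod + (INT64_C(1) << 30)) >> 31);
      // Rounding shift right, ties away from zero: the AND/ASR/SQADD trio in
      // the assembly nudges negative values so SRSHL rounds the same way.
      if (right_shift > 0)
      {
        const int64_t half = INT64_C(1) << (right_shift - 1);
        v = (int32_t)(((int64_t)v + (v >= 0 ? half : half - 1)) >> right_shift);
      }
      v += c_offset;                               // add the output zero point
      v = std::max(minval, std::min(maxval, v));   // SMIN/SMAX clamp
      return (uint8_t)v;                           // TRN1 + ST1B narrowing store
    }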
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000..9226a96
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+{
+  typedef uint32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 4;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 9;
+  constexpr static unsigned int input_col_quads = 1;
+
+  kern_type kernel = sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+
+  sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
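The compile-time geometry in these strategy structs is internally consistent: the input tile must cover the receptive field of the output tile, i.e. input = kernel + (output - 1) * stride in each dimension. For the struct above that gives 3 + (2 - 1) * 2 = 5 rows and 3 + (4 - 1) * 2 = 9 columns; the 5x5/s1 variant later in this patch works out the same way (5 + 3 * 1 = 8 rows, 5 + 1 * 1 = 6 columns). A compile-time check one could write against any of these strategies (a sketch, not part of the library):

    // Hypothetical consistency check; 'Strategy' is any of the structs above.
    template <typename Strategy>
    constexpr bool tile_geometry_is_consistent()
    {
      return Strategy::input_rows ==
               Strategy::kernel_rows + (Strategy::output_rows - 1) * Strategy::stride_rows
          && Strategy::input_cols ==
               Strategy::kernel_cols + (Strategy::output_cols - 1) * Strategy::stride_cols;
    }

    static_assert(
        tile_geometry_is_consistent<
            sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(),
        "input tile must cover the output tile's receptive field");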
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..bb9931c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+  const uint8_t *const *const inptrs,
+  uint8_t *const *const outptrs,
+  const void *params,
+  unsigned int n_output_channels,
+  const arm_gemm::Requantize32& qp
+)
+{
+  __asm__ __volatile__(
+    "mov z31.s, #0x0\n"
+    "ldr x24, [%x[inptrs], #0x0]\n"
+    "ptrue p2.b\n"
+    "mov z18.s, #0x0\n"
+    "ldr x23, [%x[inptrs], #0x8]\n"
+    "lsl x9, %x[n_channels], #0x2\n"
+    "mov z29.s, #0x0\n"
+    "ldr x22, [%x[inptrs], #0x10]\n"
+    "addvl SP, SP, #-8\n"
+    "mov z28.s, #0x0\n"
+    "ldr x21, [%x[inptrs], #0x18]\n"
+    "mov x19, #0x9\n"
+    "mov z13.s, #0x0\n"
+    "ldr x20, [%x[inptrs], #0x20]\n"
+    "whilelt p1.b, XZR, x19\n"
+    "mov z14.s, #0x0\n"
+    "ld1b { z7.b }, p1/Z, [x24]\n"
+    "mov x19, #0x3\n"
+    "mov z15.s, #0x0\n"
+    "ld1b { z3.b }, p1/Z, [x23]\n"
+    "whilelt p0.b, XZR, x19\n"
+    "mov z11.b, p0/z, #0x1\n"
+    "ld1b { z4.b }, p1/Z, [x22]\n"
+    "mov x28, #0x0\n"
+    "mov z10.d, z7.d\n"
+    "ld1b { z6.b }, p1/Z, [x21]\n"
+    "mov x27, #0x0\n"
+    "ext z10.b, z10.b, z10.b, #0x2\n"
+    "ld1b { z5.b }, p1/Z, [x20]\n"
+    "whilelt p1.b, x28, x9\n"
+    "mov z17.d, z7.d\n"
+    "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+    "mov z26.d, z7.d\n"
+    "ldp x26, x25, [%x[outptrs], #0x0]\n"
+    "ext z17.b, z17.b, z17.b, #0x4\n"
+    "ldp x24, x23, [%x[outptrs], #0x10]\n"
+    "ext z26.b, z26.b, z26.b, #0x6\n"
+    "ldp x22, x21, [%x[outptrs], #0x20]\n"
+    "mov z19.d, z3.d\n"
+    "ldp x20, x19, [%x[outptrs], #0x30]\n"
+    "ext z19.b, z19.b, z19.b, #0x2\n"
+    "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+    "zip1 z7.s, z7.s, z17.s\n"
+    "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+    "zip1 z10.s, z10.s, z26.s\n"
+    "ld1rw { z0.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+    "zip1 z7.s, z7.s, z10.s\n"
+    "ld1w { z1.s }, p1/Z, [%x[params]]\n"
+    "mov z7.q, z7.q[0]\n"
+    "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+    "mov z17.d, z3.d\n"
+    "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+    "ext z17.b, z17.b, z17.b, #0x4\n"
+    "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+    "addvl %x[params], %x[params], #4\n"
+    "mov z2.d, z3.d\n"
+    "mov z20.d, z4.d\n"
+    "ext z2.b, z2.b, z2.b, #0x6\n"
+    "zip1 z3.s, z3.s, z17.s\n"
+    "ext z20.b, z20.b, z20.b, #0x2\n"
+    "mov z17.d, z4.d\n"
+    "zip1 z19.s, z19.s, z2.s\n"
+    "zip1 z3.s, z3.s, z19.s\n"
+    "mov z3.q, z3.q[0]\n"
+    "ext z17.b, z17.b, z17.b, #0x4\n"
+    "mov z26.d, z4.d\n"
+    "ext z26.b, z26.b, z26.b, #0x6\n"
+    "mov z21.d, z6.d\n"
+    "zip1 z4.s, z4.s, z17.s\n"
+    "ext z21.b, z21.b, z21.b, #0x2\n"
+    "zip1 z20.s, z20.s, z26.s\n"
+    "zip1 z4.s, z4.s, z20.s\n"
+    "mov z4.q, z4.q[0]\n"
+    "mov z17.d, z6.d\n"
+    "ext z17.b, z17.b, z17.b, #0x4\n"
+    "mov z20.d, z6.d\n"
+    "ext z20.b, z20.b, z20.b, #0x6\n"
+    "mov z19.d, z5.d\n"
+    "zip1 z6.s, z6.s, z17.s\n"
+    "ext z19.b, z19.b, z19.b, #0x2\n"
+    "zip1 z21.s, z21.s, z20.s\n"
+    "zip1 z6.s, z6.s, z21.s\n"
+    "mov z6.q, z6.q[0]\n"
+    "mov z17.d, z5.d\n"
+    "ext z17.b, z17.b, z17.b, #0x4\n"
+    "mov z20.d, z5.d\n"
+    "ext z20.b, z20.b, z20.b, #0x6\n"
+    "mov z11.s, z11.s[0]\n"
+    "zip1 z5.s, z5.s, z17.s\n"
+    "mov z25.s, #0x0\n"
+    "zip1 z19.s, z19.s, z20.s\n"
+    "zip1 z5.s, z5.s, z19.s\n"
+    "mov z5.q, z5.q[0]\n"
+    "mov z26.s, #0x0\n"
+    "mov z27.s, #0x0\n"
+    "mov z24.s, #0x0\n"
+    "mov z23.s, #0x0\n"
+    "mov z22.s, #0x0\n"
+    "mov z21.s, #0x0\n"
+    "mov z17.s, #0x0\n"
+    "mov z20.s, #0x0\n"
+    "mov z2.s, #0x0\n"
+    "mov z19.s, #0x0\n"
+    "udot z31.s, z11.b, z7.b[0]\n"
+    "udot z18.s, z11.b, z7.b[1]\n"
+    "udot z29.s, z11.b, z7.b[2]\n"
+    "udot z28.s, z11.b, z7.b[3]\n"
+    "udot z13.s, z11.b, z3.b[0]\n"
+    "udot z14.s, z11.b, z3.b[1]\n"
+    "udot z15.s, z11.b, z3.b[2]\n"
+    "udot z25.s, z11.b, z3.b[3]\n"
+    "udot z26.s, z11.b, z4.b[0]\n"
+    "udot z27.s, z11.b, z4.b[1]\n"
+    "udot z24.s, z11.b, z4.b[2]\n"
+    "udot z23.s, z11.b, z4.b[3]\n"
+    "udot z22.s, z11.b, z6.b[0]\n"
+    "udot z21.s, z11.b, z6.b[1]\n"
+    "udot z17.s, z11.b, z6.b[2]\n"
+    "udot z20.s, z11.b, z6.b[3]\n"
+    "udot z2.s, z11.b, z5.b[0]\n"
+    "udot z19.s, z11.b, z5.b[1]\n"
+    "mov z31.d, z31.d\n"
+    "mov z18.d, z18.d\n"
+    "mov z29.d, z29.d\n"
+    "mov z28.d, z28.d\n"
+    "add z31.s, z31.s, z13.s\n"
+    "mov z13.s, #0x0\n"
+    "udot z13.s, z11.b, z5.b[2]\n"
+    "add z18.s, z18.s, z14.s\n"
+    "mov z14.s, #0x0\n"
+    "udot z14.s, z11.b, z5.b[3]\n"
+    "add z29.s, z29.s, z15.s\n"
+    "add z28.s, z28.s, z25.s\n"
+    "add z31.s, z31.s, z26.s\n"
+    "add z18.s, z18.s, z27.s\n"
+    "add z29.s, z29.s, z24.s\n"
+    "add z28.s, z28.s, z23.s\n"
+    "mov z26.d, z26.d\n"
+    "mov z25.d, z27.d\n"
+    "mov z24.d, z24.d\n"
+    "mov z23.d, z23.d\n"
+    "add z26.s, z26.s, z22.s\n"
+    "add z25.s, z25.s, z21.s\n"
+    "add z24.s, z24.s, z17.s\n"
+    "add z23.s, z23.s, z20.s\n"
+    "add z26.s, z26.s, z2.s\n"
+    "add z25.s, z25.s, z19.s\n"
+    "add z24.s, z24.s, z13.s\n"
+    "add z23.s, z23.s, z14.s\n"
+    "neg z30.s, p2/M, z30.s\n"
+    "mul z31.s, p2/M, z31.s, z30.s\n"
+    "st1w { z31.s }, p2, [SP]\n"
+    "add z31.s, z31.s, z1.s\n"
+    "mul z18.s, p2/M, z18.s, z30.s\n"
+    "st1w { z18.s }, p2, [SP, #1, MUL VL]\n"
+    "add z18.s, z18.s, z1.s\n"
+    "mul z29.s, p2/M, z29.s, z30.s\n"
+    "st1w { z29.s }, p2, [SP, #2, MUL VL]\n"
+    "add z29.s, z29.s, z1.s\n"
+    "mul z28.s, p2/M, z28.s, z30.s\n"
+    "st1w { z28.s }, p2, [SP, #3, MUL VL]\n"
+    "add z28.s, z28.s, z1.s\n"
+    "mul z26.s, p2/M, z26.s, z30.s\n"
+    "st1w { z26.s }, p2, [SP, #4, MUL VL]\n"
+    "add z26.s, z26.s, z1.s\n"
+    "mul z25.s, p2/M, z25.s, z30.s\n"
+    "st1w { z25.s }, p2, [SP, #5, MUL VL]\n"
+    "add z25.s, z25.s, z1.s\n"
+    "mul z24.s, p2/M, z24.s, z30.s\n"
+    "st1w { z24.s }, p2, [SP, #6, MUL VL]\n"
+    "add z24.s, z24.s, z1.s\n"
+    "mul z23.s, p2/M, z23.s, z30.s\n"
+    "st1w { z23.s }, p2, [SP, #7, MUL VL]\n"
+    "add z23.s, z23.s, z1.s\n"
+    "1:"  // Loop
+    "udot z31.s, z8.b, z7.b[0]\n"
+    "ld1w { z22.s }, p2/Z, [%x[params]]\n"
+    "incb x28\n"
+    "udot z18.s, z8.b, z7.b[1]\n"
+    "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "whilelt p0.s, x27, %x[n_channels]\n"
+    "udot z29.s, z8.b, z7.b[2]\n"
+    "whilelt p1.b, x28, x9\n"
+    "ld1w { z1.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+    "udot z28.s, z8.b, z7.b[3]\n"
+    "udot z26.s, z8.b, z4.b[0]\n"
+    "udot z25.s, z8.b, z4.b[1]\n"
+    "udot z24.s, z8.b, z4.b[2]\n"
+    "udot z23.s, z8.b, z4.b[3]\n"
+    "ld1b { z8.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+    "udot z31.s, z9.b, z3.b[0]\n"
+    "udot z18.s, z9.b, z3.b[1]\n"
+    "udot z29.s, z9.b, z3.b[2]\n"
+    "udot z28.s, z9.b, z3.b[3]\n"
+    "udot z26.s, z9.b, z6.b[0]\n"
+    "udot z25.s, z9.b, z6.b[1]\n"
+    "udot z24.s, z9.b, z6.b[2]\n"
+    "udot z23.s, z9.b, z6.b[3]\n"
+    "ld1b { z9.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+    "udot z31.s, z10.b, z4.b[0]\n"
+    "udot z18.s, z10.b, z4.b[1]\n"
+    "udot z29.s, z10.b, z4.b[2]\n"
+    "udot z28.s, z10.b, z4.b[3]\n"
+    "udot z26.s, z10.b, z5.b[0]\n"
+    "udot z25.s, z10.b, z5.b[1]\n"
+    "udot z24.s, z10.b, z5.b[2]\n"
+    "udot z23.s, z10.b, z5.b[3]\n"
+    "ld1b { z10.b }, p1/Z, [%x[params], #5, MUL VL]\n"
+    "addvl %x[params], %x[params], #6\n"
+    ".inst 0x04b677ff  // sqrdmulh z31.s, z31.s, z22.s\n"
+    ".inst 0x04b67652  // sqrdmulh z18.s, z18.s, z22.s\n"
+    ".inst 0x04b677bd  // sqrdmulh z29.s, z29.s, z22.s\n"
+    ".inst 0x04b6779c  // sqrdmulh z28.s, z28.s, z22.s\n"
+    ".inst 0x04b6775a  // sqrdmulh z26.s, z26.s, z22.s\n"
+    "and z20.d, z31.d, z21.d\n"
+    "asr z20.s, z20.s, #0x1f\n"
+    "and z19.d, z18.d, z21.d\n"
+    "and z14.d, z29.d, z21.d\n"
+    "asr z19.s, z19.s, #0x1f\n"
+    "and z17.d, z28.d, z21.d\n"
+    "and z2.d, z26.d, z21.d\n"
+    "asr z14.s, z14.s, #0x1f\n"
+    ".inst 0x04b67739  // sqrdmulh z25.s, z25.s, z22.s\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "sqadd z31.s, z31.s, z20.s\n"
+    ".inst 0x04b67718  // sqrdmulh z24.s, z24.s, z22.s\n"
+    "asr z2.s, z2.s, #0x1f\n"
+    ".inst 0x04b676f7  // sqrdmulh z23.s, z23.s, z22.s\n"
+    "sqadd z18.s, z18.s, z19.s\n"
+    "sqadd z29.s, z29.s, z14.s\n"
+    "and z27.d, z25.d, z21.d\n"
+    "asr z27.s, z27.s, #0x1f\n"
+    "sqadd z28.s, z28.s, z17.s\n"
+    "sqadd z26.s, z26.s, z2.s\n"
+    "and z17.d, z24.d, z21.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "and z15.d, z23.d, z21.d\n"
+    ".inst 0x44828abf  // srshl z31.s, p2/M, z31.s, z21.s\n"
+    "asr z15.s, z15.s, #0x1f\n"
+    "sqadd z25.s, z25.s, z27.s\n"
+    ".inst 0x44828ab2  // srshl z18.s, p2/M, z18.s, z21.s\n"
+    "add z31.s, z31.s, z12.s\n"
+    "sqadd z24.s, z24.s, z17.s\n"
+    ".inst 0x44828abd  // srshl z29.s, p2/M, z29.s, z21.s\n"
+    "add z18.s, z18.s, z12.s\n"
+    "sqadd z23.s, z23.s, z15.s\n"
+    "smin z31.s, p2/M, z31.s, z0.s\n"
+    "add z29.s, z29.s, z12.s\n"
+    "smin z18.s, p2/M, z18.s, z0.s\n"
+    ".inst 0x44828abc  // srshl z28.s, p2/M, z28.s, z21.s\n"
+    "smax z31.s, p2/M, z31.s, z16.s\n"
+    "st1b { z31.s }, p0, [x26, x27]\n"
+    "add z28.s, z28.s, z12.s\n"
+    "smax z18.s, p2/M, z18.s, z16.s\n"
+    "ld1w { z31.s }, p2/Z, [SP]\n"
+    "smin z29.s, p2/M, z29.s, z0.s\n"
+    "st1b { z18.s }, p0, [x25, x27]\n"
+    "add z31.s, z31.s, z1.s\n"
+    "smin z28.s, p2/M, z28.s, z0.s\n"
+    "ld1w { z18.s }, p2/Z, [SP, #1, MUL VL]\n"
+    "smax z29.s, p2/M, z29.s, z16.s\n"
+    "st1b { z29.s }, p0, [x24, x27]\n"
+    "add z18.s, z18.s, z1.s\n"
+    "smax z28.s, p2/M, z28.s, z16.s\n"
+    "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n"
+    ".inst 0x44828aba  // srshl z26.s, p2/M, z26.s, z21.s\n"
+    "st1b { z28.s }, p0, [x23, x27]\n"
+    "add z29.s, z29.s, z1.s\n"
+    ".inst 0x44828ab9  // srshl z25.s, p2/M, z25.s, z21.s\n"
+    "ld1w { z28.s }, p2/Z, [SP, #3, MUL VL]\n"
+    "add z26.s, z26.s, z12.s\n"
+    ".inst 0x44828ab8  // srshl z24.s, p2/M, z24.s, z21.s\n"
+    ".inst 0x44828ab7  // srshl z23.s, p2/M, z23.s, z21.s\n"
+    "add z25.s, z25.s, z12.s\n"
+    "add z28.s, z28.s, z1.s\n"
+    "add z24.s, z24.s, z12.s\n"
+    "add z23.s, z23.s, z12.s\n"
+    "smin z26.s, p2/M, z26.s, z0.s\n"
+    "smin z25.s, p2/M, z25.s, z0.s\n"
+    "smin z24.s, p2/M, z24.s, z0.s\n"
+    "smin z23.s, p2/M, z23.s, z0.s\n"
+    "smax z26.s, p2/M, z26.s, z16.s\n"
+    "st1b { z26.s }, p0, [x22, x27]\n"
+    "smax z25.s, p2/M, z25.s, z16.s\n"
+    "smax z24.s, p2/M, z24.s, z16.s\n"
+    "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n"
+    "smax z23.s, p2/M, z23.s, z16.s\n"
+    "st1b { z25.s }, p0, [x21, x27]\n"
+    "add z26.s, z26.s, z1.s\n"
+    "st1b { z24.s }, p0, [x20, x27]\n"
+    "st1b { z23.s }, p0, [x19, x27]\n"
+    "incw x27\n"
+    "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n"
+    "add z25.s, z25.s, z1.s\n"
+    "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n"
+    "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n"
+    "add z24.s, z24.s, z1.s\n"
+    "add z23.s, z23.s, z1.s\n"
+    "b.any 1b\n"
+    "addvl SP, SP, #8\n"
+    : [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
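Before entering the main loop, the kernel above fills a vector with ones ('mov z11.b, p0/z, #0x1') and UDOTs it against each packed input quad, producing the sum of the raw input bytes per output pixel; that sum is multiplied by the negated weight zero point ('neg z30 ...; mul ...') and spilled to the stack as a loop-invariant bias correction. A hedged scalar model of the identity being exploited (names are illustrative; the remaining zero-point terms are assumed to be folded into the packed bias at weight-packing time):

    #include <cstdint>

    // One output pixel, one channel: what the UDOT main loop plus the
    // UDOT-with-ones prologue compute together.
    static inline int32_t dot_pixel(const uint8_t *x, const uint8_t *w,
                                    unsigned int n_taps, int32_t packed_bias,
                                    int32_t b_offset)
    {
      // packed_bias is assumed to already fold in -a_offset*sum(w) and
      // n_taps*a_offset*b_offset from the weight-packing stage.
      int32_t acc   = packed_bias;
      int32_t sum_x = 0;
      for (unsigned int i = 0; i < n_taps; i++)
      {
        acc   += (int32_t)x[i] * (int32_t)w[i];  // raw u8 x u8 products (UDOT)
        sum_x += (int32_t)x[i];                  // UDOT against the vector of ones
      }
      // The prologue stores -b_offset*sum_x to the stack; adding it here yields
      // sum((x[i]-a_offset)*(w[i]-b_offset)) + bias, ready for requantisation.
      return acc - b_offset * sum_x;
    }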
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000..3023ed1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+{
+  typedef uint32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 8;
+  constexpr static unsigned int input_cols = 6;
+  constexpr static unsigned int input_col_quads = 1;
+
+  kern_type kernel = sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+
+  sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000..fc1e23e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+  const uint8_t *const *const inptrs,
+  uint8_t *const *const outptrs,
+  const void *params,
+  unsigned int n_output_channels,
+  const arm_gemm::Requantize32& qp
+)
+{
+  __asm__ __volatile__(
+    "mov z20.b, #0x1\n"
+    "ldr x24, [%x[inptrs], #0x0]\n"
+    "ptrue p2.b\n"
+    "mov z22.s, #0x1\n"
+    "ldr x23, [%x[inptrs], #0x8]\n"
+    "lsl x9, %x[n_channels], #0x2\n"
+    "mov z30.s, #0x0\n"
+    "ldr x22, [%x[inptrs], #0x10]\n"
+    "addvl SP, SP, #-8\n"
+    "mov z28.s, #0x0\n"
+    "ldr x21, [%x[inptrs], #0x18]\n"
+    "mov x20, #0x6\n"
+    "mov z29.s, #0x0\n"
+    "ldr x19, [%x[inptrs], #0x20]\n"
+    "whilelt p0.b, XZR, x20\n"
+    "mov z27.s, #0x0\n"
+    "ld1b { z0.b }, p0/Z, [x24]\n"
+    "mov x28, #0x0\n"
+    "mov z26.s, #0x0\n"
+    "ld1b { z3.b }, p0/Z, [x23]\n"
+    "mov x27, #0x0\n"
+    "mov z25.s, #0x0\n"
+    "ld1b { z5.b }, p0/Z, [x22]\n"
+    "whilelt p1.b, x28, x9\n"
+    "mov z15.d, z0.d\n"
+    "ld1b { z4.b }, p0/Z, [x21]\n"
+    "mov z24.s, #0x0\n"
+    "ld1b { z6.b }, p0/Z, [x19]\n"
+    "ext z15.b, z15.b, z15.b, #0x1\n"
+    "ldr x21, [%x[inptrs], #0x28]\n"
+    "mov z16.d, z3.d\n"
+    "ldr x20, [%x[inptrs], #0x30]\n"
+    "ext z16.b, z16.b, z16.b, #0x1\n"
+    "ldr x19, [%x[inptrs], #0x38]\n"
+    "mov z18.d, z5.d\n"
+    "ld1b { z7.b }, p0/Z, [x21]\n"
+    "zip1 z0.d, z0.d, z15.d\n"
+    "ld1b { z1.b }, p0/Z, [x20]\n"
+    "mov z0.q, z0.q[0]\n"
+    "ld1b { z2.b }, p0/Z, [x19]\n"
+    "zip1 z3.d, z3.d, z16.d\n"
+    "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+    "mov z3.q, z3.q[0]\n"
+    "ldp x26, x25, [%x[outptrs], #0x0]\n"
+    "ext z18.b, z18.b, z18.b, #0x1\n"
+    "ldp x24, x23, [%x[outptrs], #0x10]\n"
+    "mov z16.d, z4.d\n"
+    "ldp x22, x21, [%x[outptrs], #0x20]\n"
+    "ext z16.b, z16.b, z16.b, #0x1\n"
+    "ldp x20, x19, [%x[outptrs], #0x30]\n"
+    "mov z17.d, z6.d\n"
+    "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+    "zip1 z5.d, z5.d, z18.d\n"
+    "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+    "mov z5.q, z5.q[0]\n"
+    "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+    "zip1 z4.d, z4.d, z16.d\n"
+    "ld1w { z13.s }, p1/Z, [%x[params]]\n"
+    "mov z4.q, z4.q[0]\n"
+    "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n"
+    "ext z17.b, z17.b, z17.b, #0x1\n"
+    "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n"
+    "mov z16.d, z7.d\n"
+    "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n"
+    "ext z16.b, z16.b, z16.b, #0x1\n"
+    "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n"
+    "addvl %x[params], %x[params], #5\n"
+    "zip1 z6.d, z6.d, z17.d\n"
+    "mov z17.d, z1.d\n"
+    "mov z6.q, z6.q[0]\n"
+    "zip1 z7.d, z7.d, z16.d\n"
+    "mov z7.q, z7.q[0]\n"
+    "ext z17.b, z17.b, z17.b, #0x1\n"
+    "mov z16.d, z2.d\n"
+    "ext z16.b, z16.b, z16.b, #0x1\n"
+    "mov z23.s, #0x0\n"
+    "zip1 z1.d, z1.d, z17.d\n"
+    "mov z1.q, z1.q[0]\n"
+    "zip1 z2.d, z2.d, z16.d\n"
+    "mov z2.q, z2.q[0]\n"
+    "mov z18.s, #0x0\n"
+    "mov z17.s, #0x0\n"
+    "mov z16.s, #0x0\n"
+    "mov z21.s, #0x0\n"
+    "mov z19.s, #0x0\n"
+    "udot z30.s, z20.b, z0.b[0]\n"
+    "udot z28.s, z20.b, z0.b[2]\n"
+    "udot z29.s, z20.b, z3.b[0]\n"
+    "udot z27.s, z20.b, z3.b[2]\n"
+    "udot z30.s, z22.b, z0.b[1]\n"
+    "udot z28.s, z22.b, z0.b[3]\n"
+    "udot z29.s, z22.b, z3.b[1]\n"
+    "udot z27.s, z22.b, z3.b[3]\n"
+    "udot z26.s, z20.b, z5.b[0]\n"
+    "udot z25.s, z20.b, z5.b[2]\n"
+    "udot z24.s, z20.b, z4.b[0]\n"
+    "udot z23.s, z20.b, z4.b[2]\n"
+    "udot z26.s, z22.b, z5.b[1]\n"
+    "udot z25.s, z22.b, z5.b[3]\n"
+    "udot z24.s, z22.b, z4.b[1]\n"
+    "udot z23.s, z22.b, z4.b[3]\n"
+    "udot z18.s, z20.b, z6.b[0]\n"
+    "udot z17.s, z20.b, z6.b[2]\n"
+    "udot z16.s, z20.b, z7.b[0]\n"
+    "udot z21.s, z20.b, z7.b[2]\n"
+    "udot z18.s, z22.b, z6.b[1]\n"
+    "udot z17.s, z22.b, z6.b[3]\n"
+    "udot z16.s, z22.b, z7.b[1]\n"
+    "udot z21.s, z22.b, z7.b[3]\n"
+    "udot z19.s, z20.b, z1.b[0]\n"
+    "mov z30.d, z30.d\n"
+    "mov z28.d, z28.d\n"
+    "add z30.s, z30.s, z29.s\n"
+    "udot z19.s, z22.b, z1.b[1]\n"
+    "add z28.s, z28.s, z27.s\n"
+    "add z30.s, z30.s, z26.s\n"
+    "mov z29.d, z29.d\n"
+    "add z28.s, z28.s, z25.s\n"
+    "add z30.s, z30.s, z24.s\n"
+    "mov z27.d, z27.d\n"
+    "add z28.s, z28.s, z23.s\n"
+    "add z30.s, z30.s, z18.s\n"
+    "add z29.s, z29.s, z26.s\n"
+    "add z28.s, z28.s, z17.s\n"
+    "add z27.s, z27.s, z25.s\n"
+    "add z29.s, z29.s, z24.s\n"
+    "mov z26.d, z26.d\n"
+    "add z27.s, z27.s, z23.s\n"
+    "add z29.s, z29.s, z18.s\n"
+    "mov z25.d, z25.d\n"
+    "add z27.s, z27.s, z17.s\n"
+    "add z29.s, z29.s, z16.s\n"
+    "add z26.s, z26.s, z24.s\n"
+    "add z27.s, z27.s, z21.s\n"
+    "add z25.s, z25.s, z23.s\n"
+    "add z26.s, z26.s, z18.s\n"
+    "mov z24.d, z24.d\n"
+    "add z25.s, z25.s, z17.s\n"
+    "add z26.s, z26.s, z16.s\n"
+    "mov z23.d, z23.d\n"
+    "add z25.s, z25.s, z21.s\n"
+    "add z26.s, z26.s, z19.s\n"
+    "add z24.s, z24.s, z18.s\n"
+    "mov z18.s, #0x0\n"
+    "udot z18.s, z20.b, z1.b[2]\n"
+    "add z23.s, z23.s, z17.s\n"
+    "mov z17.s, #0x0\n"
+    "udot z17.s, z20.b, z2.b[0]\n"
+    "udot z18.s, z22.b, z1.b[3]\n"
+    "add z24.s, z24.s, z16.s\n"
+    "mov z16.s, #0x0\n"
+    "udot z17.s, z22.b, z2.b[1]\n"
+    "udot z16.s, z20.b, z2.b[2]\n"
+    "add z25.s, z25.s, z18.s\n"
+    "add z23.s, z23.s, z21.s\n"
+    "add z24.s, z24.s, z19.s\n"
+    "udot z16.s, z22.b, z2.b[3]\n"
+    "add z23.s, z23.s, z18.s\n"
+    "add z24.s, z24.s, z17.s\n"
+    "neg z15.s, p2/M, z15.s\n"
+    "add z23.s, z23.s, z16.s\n"
+    "mul z30.s, p2/M, z30.s, z15.s\n"
+    "st1w { z30.s }, p2, [SP]\n"
+    "add z30.s, z30.s, z13.s\n"
+    "mul z28.s, p2/M, z28.s, z15.s\n"
+    "st1w { z28.s }, p2, [SP, #1, MUL VL]\n"
+    "add z28.s, z28.s, z13.s\n"
+    "mul z29.s, p2/M, z29.s, z15.s\n"
+    "st1w { z29.s }, p2, [SP, #2, MUL VL]\n"
+    "add z29.s, z29.s, z13.s\n"
+    "mul z27.s, p2/M, z27.s, z15.s\n"
+    "st1w { z27.s }, p2, [SP, #3, MUL VL]\n"
+    "add z27.s, z27.s, z13.s\n"
+    "mul z26.s, p2/M, z26.s, z15.s\n"
+    "st1w { z26.s }, p2, [SP, #4, MUL VL]\n"
+    "add z26.s, z26.s, z13.s\n"
+    "mul z25.s, p2/M, z25.s, z15.s\n"
+    "st1w { z25.s }, p2, [SP, #5, MUL VL]\n"
+    "add z25.s, z25.s, z13.s\n"
+    "mul z24.s, p2/M, z24.s, z15.s\n"
+    "st1w { z24.s }, p2, [SP, #6, MUL VL]\n"
+    "add z24.s, z24.s, z13.s\n"
+    "mul z23.s, p2/M, z23.s, z15.s\n"
+    "st1w { z23.s }, p2, [SP, #7, MUL VL]\n"
+    "add z23.s, z23.s, z13.s\n"
+    "1:"  // Loop
+    "udot z30.s, z8.b, z0.b[0]\n"
+    "ld1w { z22.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "incb x28\n"
+    "udot z28.s, z8.b, z0.b[2]\n"
+    "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "whilelt p0.s, x27, %x[n_channels]\n"
+    "udot z29.s, z8.b, z3.b[0]\n"
+    "whilelt p1.b, x28, x9\n"
+    "udot z27.s, z8.b, z3.b[2]\n"
+    "udot z26.s, z8.b, z5.b[0]\n"
+    "udot z25.s, z8.b, z5.b[2]\n"
+    "udot z24.s, z8.b, z4.b[0]\n"
+    "udot z23.s, z8.b, z4.b[2]\n"
+    "ld1b { z8.b }, p2/Z, [%x[params]]\n"
+    "udot z30.s, z9.b, z0.b[1]\n"
+    "udot z28.s, z9.b, z0.b[3]\n"
+    "udot z29.s, z9.b, z3.b[1]\n"
+    "udot z27.s, z9.b, z3.b[3]\n"
+    "udot z26.s, z9.b, z5.b[1]\n"
+    "udot z25.s, z9.b, z5.b[3]\n"
+    "udot z24.s, z9.b, z4.b[1]\n"
+    "udot z23.s, z9.b, z4.b[3]\n"
+    "ld1b { z9.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "udot z30.s, z10.b, z3.b[0]\n"
+    "udot z28.s, z10.b, z3.b[2]\n"
+    "udot z29.s, z10.b, z5.b[0]\n"
+    "udot z27.s, z10.b, z5.b[2]\n"
+    "udot z26.s, z10.b, z4.b[0]\n"
+    "udot z25.s, z10.b, z4.b[2]\n"
+    "udot z24.s, z10.b, z6.b[0]\n"
+    "udot z23.s, z10.b, z6.b[2]\n"
+    "ld1b { z10.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "udot z30.s, z11.b, z3.b[1]\n"
+    "udot z28.s, z11.b, z3.b[3]\n"
+    "udot z29.s, z11.b, z5.b[1]\n"
+    "udot z27.s, z11.b, z5.b[3]\n"
+    "udot z26.s, z11.b, z4.b[1]\n"
+    "udot z25.s, z11.b, z4.b[3]\n"
+    "udot z24.s, z11.b, z6.b[1]\n"
+    "udot z23.s, z11.b, z6.b[3]\n"
+    "ld1b { z11.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "udot z30.s, z8.b, z5.b[0]\n"
+    "udot z28.s, z8.b, z5.b[2]\n"
+    "udot z29.s, z8.b, z4.b[0]\n"
+    "udot z27.s, z8.b, z4.b[2]\n"
+    "udot z26.s, z8.b, z6.b[0]\n"
+    "udot z25.s, z8.b, z6.b[2]\n"
+    "udot z24.s, z8.b, z7.b[0]\n"
+    "udot z23.s, z8.b, z7.b[2]\n"
+    "ld1b { z8.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "udot z30.s, z9.b, z5.b[1]\n"
+    "udot z28.s, z9.b, z5.b[3]\n"
+    "udot z29.s, z9.b, z4.b[1]\n"
+    "udot z27.s, z9.b, z4.b[3]\n"
+    "udot z26.s, z9.b, z6.b[1]\n"
+    "udot z25.s, z9.b, z6.b[3]\n"
+    "udot z24.s, z9.b, z7.b[1]\n"
+    "udot z23.s, z9.b, z7.b[3]\n"
+    "ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "udot z30.s, z10.b, z4.b[0]\n"
+    "ld1w { z13.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+    "udot z28.s, z10.b, z4.b[2]\n"
+    "udot z29.s, z10.b, z6.b[0]\n"
+    "udot z27.s, z10.b, z6.b[2]\n"
+    "udot z26.s, z10.b, z7.b[0]\n"
+    "udot z25.s, z10.b, z7.b[2]\n"
+    "udot z24.s, z10.b, z1.b[0]\n"
+    "udot z23.s, z10.b, z1.b[2]\n"
+    "ld1b { z10.b }, p1/Z, [%x[params], #-5, MUL VL]\n"
+    "udot z30.s, z11.b, z4.b[1]\n"
+    "udot z28.s, z11.b, z4.b[3]\n"
+    "udot z29.s, z11.b, z6.b[1]\n"
+    "udot z27.s, z11.b, z6.b[3]\n"
+    "udot z26.s, z11.b, z7.b[1]\n"
+    "udot z25.s, z11.b, z7.b[3]\n"
+    "udot z24.s, z11.b, z1.b[1]\n"
+    "udot z23.s, z11.b, z1.b[3]\n"
+    "ld1b { z11.b }, p1/Z, [%x[params], #-4, MUL VL]\n"
+    "udot z30.s, z8.b, z6.b[0]\n"
+    "udot z28.s, z8.b, z6.b[2]\n"
+    "udot z29.s, z8.b, z7.b[0]\n"
+    "udot z27.s, z8.b, z7.b[2]\n"
+    "udot z26.s, z8.b, z1.b[0]\n"
+    "udot z25.s, z8.b, z1.b[2]\n"
+    "udot z24.s, z8.b, z2.b[0]\n"
+    "udot z23.s, z8.b, z2.b[2]\n"
+    "ld1b { z8.b }, p1/Z, [%x[params], #-7, MUL VL]\n"
+    "udot z30.s, z9.b, z6.b[1]\n"
+    "udot z28.s, z9.b, z6.b[3]\n"
+    "udot z29.s, z9.b, z7.b[1]\n"
+    "udot z27.s, z9.b, z7.b[3]\n"
+    "udot z26.s, z9.b, z1.b[1]\n"
+    "udot z25.s, z9.b, z1.b[3]\n"
+    "udot z24.s, z9.b, z2.b[1]\n"
+    "udot z23.s, z9.b, z2.b[3]\n"
+    "ld1b { z9.b }, p1/Z, [%x[params], #-6, MUL VL]\n"
+    "addvl %x[params], %x[params], #-3\n"
+    ".inst 0x04b677de  // sqrdmulh z30.s, z30.s, z22.s\n"
+    ".inst 0x04b6779c  // sqrdmulh z28.s, z28.s, z22.s\n"
+    ".inst 0x04b677bd  // sqrdmulh z29.s, z29.s, z22.s\n"
+    ".inst 0x04b6777b  // sqrdmulh z27.s, z27.s, z22.s\n"
+    ".inst 0x04b6775a  // sqrdmulh z26.s, z26.s, z22.s\n"
+    "and z20.d, z30.d, z21.d\n"
+    "asr z20.s, z20.s, #0x1f\n"
+    "and z19.d, z28.d, z21.d\n"
+    "and z18.d, z29.d, z21.d\n"
+    "asr z19.s, z19.s, #0x1f\n"
+    "and z17.d, z27.d, z21.d\n"
+    "and z16.d, z26.d, z21.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    ".inst 0x04b67739  // sqrdmulh z25.s, z25.s, z22.s\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "sqadd z30.s, z30.s, z20.s\n"
+    ".inst 0x04b67718  // sqrdmulh z24.s, z24.s, z22.s\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x04b676f7  // sqrdmulh z23.s, z23.s, z22.s\n"
+    "sqadd z28.s, z28.s, z19.s\n"
+    "sqadd z29.s, z29.s, z18.s\n"
+    "and z18.d, z25.d, z21.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sqadd z27.s, z27.s, z17.s\n"
+    "sqadd z26.s, z26.s, z16.s\n"
+    "and z17.d, z24.d, z21.d\n"
+    "asr z17.s, z17.s, #0x1f\n"
+    "and z16.d, z23.d, z21.d\n"
+    ".inst 0x44828abe  // srshl z30.s, p2/M, z30.s, z21.s\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z25.s, z25.s, z18.s\n"
+    ".inst 0x44828abc  // srshl z28.s, p2/M, z28.s, z21.s\n"
+    "add z30.s, z30.s, z14.s\n"
+    "sqadd z24.s, z24.s, z17.s\n"
+    ".inst 0x44828abd  // srshl z29.s, p2/M, z29.s, z21.s\n"
+    "add z28.s, z28.s, z14.s\n"
+    "sqadd z23.s, z23.s, z16.s\n"
+    "smin z30.s, p2/M, z30.s, z12.s\n"
+    "add z29.s, z29.s, z14.s\n"
+    "smin z28.s, p2/M, z28.s, z12.s\n"
+    ".inst 0x44828abb  // srshl z27.s, p2/M, z27.s, z21.s\n"
+    "smax z30.s, p2/M, z30.s, z31.s\n"
+    "st1b { z30.s }, p0, [x26, x27]\n"
+    "add z27.s, z27.s, z14.s\n"
+    "smax z28.s, p2/M, z28.s, z31.s\n"
+    "ld1w { z30.s }, p2/Z, [SP]\n"
+    "smin z29.s, p2/M, z29.s, z12.s\n"
+    "st1b { z28.s }, p0, [x25, x27]\n"
+    "add z30.s, z30.s, z13.s\n"
+    "smin z27.s, p2/M, z27.s, z12.s\n"
+    "ld1w { z28.s }, p2/Z, [SP, #1, MUL VL]\n"
+    "smax z29.s, p2/M, z29.s, z31.s\n"
+    "st1b { z29.s }, p0, [x24, x27]\n"
+    "add z28.s, z28.s, z13.s\n"
+    "smax z27.s, p2/M, z27.s, z31.s\n"
+    "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n"
+    ".inst 0x44828aba  // srshl z26.s, p2/M, z26.s, z21.s\n"
+    "st1b { z27.s }, p0, [x23, x27]\n"
+    "add z29.s, z29.s, z13.s\n"
+    ".inst 0x44828ab9  // srshl z25.s, p2/M, z25.s, z21.s\n"
+    "ld1w { z27.s }, p2/Z, [SP, #3, MUL VL]\n"
+    "add z26.s, z26.s, z14.s\n"
+    ".inst 0x44828ab8  // srshl z24.s, p2/M, z24.s, z21.s\n"
+    ".inst 0x44828ab7  // srshl z23.s, p2/M, z23.s, z21.s\n"
+    "add z25.s, z25.s, z14.s\n"
+    "add z27.s, z27.s, z13.s\n"
+    "add z24.s, z24.s, z14.s\n"
+    "add z23.s, z23.s, z14.s\n"
+    "smin z26.s, p2/M, z26.s, z12.s\n"
+    "smin z25.s, p2/M, z25.s, z12.s\n"
+    "smin z24.s, p2/M, z24.s, z12.s\n"
+    "smin z23.s, p2/M, z23.s, z12.s\n"
+    "smax z26.s, p2/M, z26.s, z31.s\n"
+    "st1b { z26.s }, p0, [x22, x27]\n"
+    "smax z25.s, p2/M, z25.s, z31.s\n"
+    "smax z24.s, p2/M, z24.s, z31.s\n"
+    "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n"
+    "smax z23.s, p2/M, z23.s, z31.s\n"
+    "st1b { z25.s }, p0, [x21, x27]\n"
+    "add z26.s, z26.s, z13.s\n"
+    "st1b { z24.s }, p0, [x20, x27]\n"
+    "st1b { z23.s }, p0, [x19, x27]\n"
+    "incw x27\n"
+    "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n"
+    "add z25.s, z25.s, z13.s\n"
+    "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n"
+    "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n"
+    "add z24.s, z24.s, z13.s\n"
+    "add z23.s, z23.s, z13.s\n"
+    "b.any 1b\n"
+    "addvl SP, SP, #8\n"
+    : [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__ARM_FEATURE_SVE)
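Both packed dot kernels handle the channel tail purely with predication: 'whilelt p0.s' is recomputed from the channel counter on every iteration and gates the narrowing stores, so no scalar remainder loop is needed. The loop skeleton, sketched with SVE ACLE intrinsics (illustrative only; the real kernels are hand-written assembly and compute the dot products between these two points):

    #include <arm_sve.h>
    #include <cstdint>

    void channel_loop(uint8_t *out, uint64_t n_channels)
    {
      for (uint64_t c = 0; c < n_channels; c += svcntw())
      {
        // 'whilelt p0.s, x27, %[n_channels]': lanes still in range are active.
        svbool_t pg = svwhilelt_b32(c, n_channels);
        svuint32_t result = svdup_u32(0);  // stand-in for the requantised values
        // Predicated narrowing store ('st1b { z.s }, p0, [x26, x27]'): lanes
        // past the channel tail are masked off, so no scalar epilogue needed.
        svst1b_u32(pg, out + c, result);
      }
    }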
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..361f48b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef int8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+  sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
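This u8s8u8q variant mixes signedness: uint8 activations, int8 weights, uint8 output. In the kernel body that follows, the zero points are subtracted while widening — SSUBLB against the weight offset broadcast, USUBLB against the activation offset broadcast — after which the inner loop is a plain signed 16x16 -> 32-bit multiply-accumulate (SMLALB/SMLALT). A scalar model of one filter tap (illustrative; the int32 offsets from Requantize32 are truncated to a byte by the ld1rb broadcast):

    #include <cstdint>

    static inline int32_t mla_tap(int32_t acc, uint8_t x_raw, int8_t w_raw,
                                  uint8_t a_offset, int8_t b_offset)
    {
      // Zero points are subtracted while widening; both differences fit int16
      // exactly, so the signed 16x16 -> 32-bit MLA below is exact.
      const int16_t wi = (int16_t)w_raw - (int16_t)b_offset; // 'ssublb z.h, z.b, z26.b'
      const int16_t xi = (int16_t)x_raw - (int16_t)a_offset; // 'usublb z.h, z.b, z11.b'
      return acc + (int32_t)wi * (int32_t)xi;                // 'smlalb / smlalt'
    }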
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..4fc8999
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    unsigned long n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[16];
+
+    Params(
+      unsigned long n_channels,
+      const uint8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      inptrs[0] = inptrs_raw[5];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[3];
+      inptrs[3] = inptrs_raw[6];
+      inptrs[4] = inptrs_raw[9];
+      inptrs[5] = inptrs_raw[12];
+      inptrs[6] = inptrs_raw[15];
+      inptrs[7] = inptrs_raw[1];
+      inptrs[8] = inptrs_raw[2];
+      inptrs[9] = inptrs_raw[10];
+      inptrs[10] = inptrs_raw[4];
+      inptrs[11] = inptrs_raw[7];
+      inptrs[12] = inptrs_raw[8];
+      inptrs[13] = inptrs_raw[11];
+      inptrs[14] = inptrs_raw[13];
+      inptrs[15] = inptrs_raw[14];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "ptrue p4.b\n"
+    "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x16, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "mov x15, #0x0\n"
+    "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "add x13, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1rb { z11.b }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1rb { z26.b }, p4/Z, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1rw { z12.s }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1rw { z14.s }, p4/Z, [x20]\n"
+    "whilelt p3.h, x16, x8\n"
+    "ld1rw { z17.s }, p4/Z, [x19]\n"
+    "whilelt p2.s, x16, x8\n"
+    "ldp x11, x10, [x21, #0x0]\n"
+    "mov x19, x16\n"
+    "incw x19\n"
+    "ldp x9, x28, [x21, #0x10]\n"
+    "whilelt p1.s, x19, x8\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z10.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z13.s, z10.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z15.s, z10.s, z16.s\n"
+    "mov z25.d, z13.d\n"
+    "ld1sb { z0.h }, p4/Z, [x17]\n"
+    "mov z23.d, z13.d\n"
+    "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+    "mov z9.d, z15.d\n"
+    "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
+    "mov z22.d, z15.d\n"
+    "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
+    "mov z10.d, z13.d\n"
+    "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
+    "mov z24.d, z15.d\n"
+    "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
+    ".inst 0x455a1000  // ssublb z0.h, z0.b, z26.b\n"
+    "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+    ".inst 0x455a1021  // ssublb z1.h, z1.b, z26.b\n"
+    "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+    "inch x17, ALL, MUL #8\n"
+    ".inst 0x455a1042  // ssublb z2.h, z2.b, z26.b\n"
+    "ld1sb { z8.h }, p4/Z, [x17]\n"
+    ".inst 0x455a1063  // ssublb z3.h, z3.b, z26.b\n"
+    "ldp x23, x22, [x13, #0x0]\n"
+    ".inst 0x455a1084  // ssublb z4.h, z4.b, z26.b\n"
+    "ldp x21, x20, [x13, #0x10]\n"
+    ".inst 0x455a10a5  // ssublb z5.h, z5.b, z26.b\n"
+    ".inst 0x455a10c6  // ssublb z6.h, z6.b, z26.b\n"
+    "ldr x19, [x13, #0x20]\n"
+    ".inst 0x455a10e7  // ssublb z7.h, z7.b, z26.b\n"
+    ".inst 0x455a1108  // ssublb z8.h, z8.b, z26.b\n"
+    "ld1b { z31.h }, p3/Z, [x23, x16]\n"
+    "ld1b { z30.h }, p3/Z, [x22, x16]\n"
+    ".inst 0x454b1bff  // usublb z31.h, z31.b, z11.b\n"
+    "ld1b { z29.h }, p3/Z, [x21, x16]\n"
+    ".inst 0x454b1bde  // usublb z30.h, z30.b, z11.b\n"
+    "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+    "ld1b { z27.h }, p3/Z, [x19, x16]\n"
+    ".inst 0x454b1bbd  // usublb z29.h, z29.b, z11.b\n"
+    ".inst 0x454b1b9c  // usublb z28.h, z28.b, z11.b\n"
+    ".inst 0x454b1b7b  // usublb z27.h, z27.b, z11.b\n"
+    "1:"  // Loop
+    ".inst 0x448443ed  // smlalb z13.s, p4/M, z31.h, z4.h\n"
+    "ldr x20, [x13, #0x28]\n"
+    "whilelt p0.h, x15, x8\n"
+    ".inst 0x448447ef  // smlalt z15.s, p4/M, z31.h, z4.h\n"
+    "ldr x27, [x13, #0x30]\n"
+    "inch x17\n"
+    ".inst 0x448343f9  // smlalb z25.s, p4/M, z31.h, z3.h\n"
+    "ldr x26, [x13, #0x38]\n"
+    ".inst 0x448347e9  // smlalt z9.s, p4/M, z31.h, z3.h\n"
+    "ldr x25, [x13, #0x40]\n"
+    ".inst 0x448143f7  // smlalb z23.s, p4/M, z31.h, z1.h\n"
+    "ldr x19, [x13, #0x48]\n"
+    ".inst 0x448147f6  // smlalt z22.s, p4/M, z31.h, z1.h\n"
+    "ldr x24, [x13, #0x50]\n"
+    ".inst 0x448043ea  // smlalb z10.s, p4/M, z31.h, z0.h\n"
+    "ldr x23, [x13, #0x58]\n"
+    ".inst 0x448047f8  // smlalt z24.s, p4/M, z31.h, z0.h\n"
+    "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+    ".inst 0x448043cd  // smlalb z13.s, p4/M, z30.h, z0.h\n"
+    "ldr x22, [x13, #0x60]\n"
+    ".inst 0x448047cf  // smlalt z15.s, p4/M, z30.h, z0.h\n"
+    "ld1b { z30.h }, p3/Z, [x19, x16]\n"
+    ".inst 0x448243b9  // smlalb z25.s, p4/M, z29.h, z2.h\n"
+    "ldr x21, [x13, #0x68]\n"
+    ".inst 0x454b1bff  // usublb z31.h, z31.b, z11.b\n"
+    "ldr x20, [x13, #0x70]\n"
+    ".inst 0x448247a9  // smlalt z9.s, p4/M, z29.h, z2.h\n"
+    "ld1b { z29.h }, p3/Z, [x27, x16]\n"
+    ".inst 0x454b1bde  // usublb z30.h, z30.b, z11.b\n"
+    "ldr x19, [x13, #0x78]\n"
+    ".inst 0x4485438d  // smlalb z13.s, p4/M, z28.h, z5.h\n"
+    "ld1w { z19.s }, p2/Z, [x14]\n"
+    ".inst 0x4485478f  // smlalt z15.s, p4/M, z28.h, z5.h\n"
+    "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n"
+    "addvl x14, x14, #2\n"
+    ".inst 0x454b1bbd  // usublb z29.h, z29.b, z11.b\n"
+    ".inst 0x44844399  // smlalb z25.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844789  // smlalt z9.s, p4/M, z28.h, z4.h\n"
+    "uzp1 z21.s, z19.s, z16.s\n"
+    "uzp2 z18.s, z19.s, z16.s\n"
+    "ld1w { z19.s }, p2/Z, [x12]\n"
+    ".inst 0x44824397  // smlalb z23.s, p4/M, z28.h, z2.h\n"
+    "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+    "addvl x12, x12, #2\n"
+    ".inst 0x44824796  // smlalt z22.s, p4/M, z28.h, z2.h\n"
+    ".inst 0x4481438a  // smlalb z10.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x44814798  // smlalt z24.s, p4/M, z28.h, z1.h\n"
+    "ld1b { z28.h }, p3/Z, [x26, x16]\n"
+    "uzp1 z20.s, z19.s, z16.s\n"
+    "uzp2 z19.s, z19.s, z16.s\n"
+    ".inst 0x448643f7  // smlalb z23.s, p4/M, z31.h, z6.h\n"
+    ".inst 0x454b1b9c  // usublb z28.h, z28.b, z11.b\n"
+    ".inst 0x448647f6  // smlalt z22.s, p4/M, z31.h, z6.h\n"
+    "ld1b { z31.h }, p3/Z, [x25, x16]\n"
+    ".inst 0x4487436d  // smlalb z13.s, p4/M, z27.h, z7.h\n"
+    ".inst 0x4487476f  // smlalt z15.s, p4/M, z27.h, z7.h\n"
+    ".inst 0x44864379  // smlalb z25.s, p4/M, z27.h, z6.h\n"
+    ".inst 0x454b1bff  // usublb z31.h, z31.b, z11.b\n"
+    ".inst 0x44864769  // smlalt z9.s, p4/M, z27.h, z6.h\n"
+    ".inst 0x44844377  // smlalb z23.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x44844776  // smlalt z22.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x4483436a  // smlalb z10.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x44834778  // smlalt z24.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x4481438d  // smlalb z13.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x4481478f  // smlalt z15.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x448843aa  // smlalb z10.s, p4/M, z29.h, z8.h\n"
+    ".inst 0x448847b8  // smlalt z24.s, p4/M, z29.h, z8.h\n"
+    "ld1b { z29.h }, p3/Z, [x24, x16]\n"
+    ".inst 0x44804399  // smlalb z25.s, p4/M, z28.h, z0.h\n"
+    ".inst 0x44804789  // smlalt z9.s, p4/M, z28.h, z0.h\n"
+    "ld1b { z28.h }, p3/Z, [x23, x16]\n"
+    ".inst 0x448243ed  // smlalb z13.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x454b1bbd  // usublb z29.h, z29.b, z11.b\n"
+    ".inst 0x448247ef  // smlalt z15.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x454b1b9c  // usublb z28.h, z28.b, z11.b\n"
+    ".inst 0x448143f9  // smlalb z25.s, p4/M, z31.h, z1.h\n"
+    ".inst 0x448147e9  // smlalt z9.s, p4/M, z31.h, z1.h\n"
+    "ld1b { z31.h }, p3/Z, [x22, x16]\n"
+    ".inst 0x448843cd  // smlalb z13.s, p4/M, z30.h, z8.h\n"
+    ".inst 0x448847cf  // smlalt z15.s, p4/M, z30.h, z8.h\n"
+    ".inst 0x448743d9  // smlalb z25.s, p4/M, z30.h, z7.h\n"
+    ".inst 0x454b1bff  // usublb z31.h, z31.b, z11.b\n"
+    ".inst 0x448747c9  // smlalt z9.s, p4/M, z30.h, z7.h\n"
+    ".inst 0x448543d7  // smlalb z23.s, p4/M, z30.h, z5.h\n"
+    ".inst 0x448547d6  // smlalt z22.s, p4/M, z30.h, z5.h\n"
+    ".inst 0x448443ca  // smlalb z10.s, p4/M, z30.h, z4.h\n"
+    ".inst 0x448447d8  // smlalt z24.s, p4/M, z30.h, z4.h\n"
+    "ld1b { z30.h }, p3/Z, [x21, x16]\n"
+    ".inst 0x448343ad  // smlalb z13.s, p4/M, z29.h, z3.h\n"
+    ".inst 0x448347af  // smlalt z15.s, p4/M, z29.h, z3.h\n"
+    ".inst 0x448043b7  // smlalb z23.s, p4/M, z29.h, z0.h\n"
+    ".inst 0x454b1bde  // usublb z30.h, z30.b, z11.b\n"
+    ".inst 0x448047b6  // smlalt z22.s, p4/M, z29.h, z0.h\n"
+    "ld1b { z29.h }, p3/Z, [x20, x16]\n"
+    ".inst 0x44854399  // smlalb z25.s, p4/M, z28.h, z5.h\n"
+    ".inst 0x44854789  // smlalt z9.s, p4/M, z28.h, z5.h\n"
+    ".inst 0x4482438a  // smlalb z10.s, p4/M, z28.h, z2.h\n"
+    ".inst 0x454b1bbd  // usublb z29.h, z29.b, z11.b\n"
+    ".inst 0x44824798  // smlalt z24.s, p4/M, z28.h, z2.h\n"
+    "ld1b { z28.h }, p3/Z, [x19, x16]\n"
+    "inch x16\n"
+    ".inst 0x448643ed  // smlalb z13.s, p4/M, z31.h, z6.h\n"
+    "whilelt p2.s, x16, x8\n"
+    ".inst 0x448647ef  // smlalt z15.s, p4/M, z31.h, z6.h\n"
+    "mov x19, x16\n"
+    ".inst 0x448343f7  // smlalb z23.s, p4/M, z31.h, z3.h\n"
+    "incw x19\n"
+    ".inst 0x454b1b9c  // usublb z28.h, z28.b, z11.b\n"
+    "whilelt p1.s, x19, x8\n"
+    ".inst 0x448347f6  // smlalt z22.s, p4/M, z31.h, z3.h\n"
+    "whilelt p3.h, x16, x8\n"
+    ".inst 0x04b575ad  // sqrdmulh z13.s, z13.s, z21.s\n"
+    ".inst 0x04b275ef  // sqrdmulh z15.s, z15.s, z18.s\n"
+    ".inst 0x448843d9  // smlalb z25.s, p4/M, z30.h, z8.h\n"
+    ".inst 0x448847c9  // smlalt z9.s, p4/M, z30.h, z8.h\n"
+    "and z4.d, z13.d, z20.d\n"
+    "and z16.d, z15.d, z19.d\n"
+    ".inst 0x04b57739  // sqrdmulh z25.s, z25.s, z21.s\n"
+    "asr z4.s, z4.s, #0x1f\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x04b27529  // sqrdmulh z9.s, z9.s, z18.s\n"
+    "sqadd z13.s, z13.s, z4.s\n"
+    "sqadd z15.s, z15.s, z16.s\n"
+    "and z2.d, z25.d, z20.d\n"
+    "and z16.d, z9.d, z19.d\n"
+    ".inst 0x448543ca  // smlalb z10.s, p4/M, z30.h, z5.h\n"
+    "asr z2.s, z2.s, #0x1f\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x448547d8  // smlalt z24.s, p4/M, z30.h, z5.h\n"
+    "sqadd z25.s, z25.s, z2.s\n"
+    "sqadd z9.s, z9.s, z16.s\n"
+    ".inst 0x448743b7  // smlalb z23.s, p4/M, z29.h, z7.h\n"
+    ".inst 0x448747b6  // smlalt z22.s, p4/M, z29.h, z7.h\n"
+    ".inst 0x448643aa  // smlalb z10.s, p4/M, z29.h, z6.h\n"
+    ".inst 0x448647b8  // smlalt z24.s, p4/M, z29.h, z6.h\n"
+    ".inst 0x44884397  // smlalb z23.s, p4/M, z28.h, z8.h\n"
+    ".inst 0x44884796  // smlalt z22.s, p4/M, z28.h, z8.h\n"
+    ".inst 0x4487438a  // smlalb z10.s, p4/M, z28.h, z7.h\n"
+    ".inst 0x44874798  // smlalt z24.s, p4/M, z28.h, z7.h\n"
+    ".inst 0x04b576f7  // sqrdmulh z23.s, z23.s, z21.s\n"
+    ".inst 0x04b276d6  // sqrdmulh z22.s, z22.s, z18.s\n"
+    ".inst 0x04b5754a  // sqrdmulh z10.s, z10.s, z21.s\n"
+    ".inst 0x04b27718  // sqrdmulh z24.s, z24.s, z18.s\n"
+    "and z18.d, z23.d, z20.d\n"
+    "and z0.d, z22.d, z19.d\n"
+    "and z16.d, z10.d, z20.d\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "asr z0.s, z0.s, #0x1f\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z23.s, z23.s, z18.s\n"
+    "sqadd z22.s, z22.s, z0.s\n"
+    "sqadd z10.s, z10.s, z16.s\n"
+    "and z16.d, z24.d, z19.d\n"
+    ".inst 0x4482928d  // srshl z13.s, p4/M, z13.s, z20.s\n"
+    ".inst 0x4482926f  // srshl z15.s, p4/M, z15.s, z19.s\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x44829299  // srshl z25.s, p4/M, z25.s, z20.s\n"
+    "add z13.s, z13.s, z12.s\n"
+    "add z15.s, z15.s, z12.s\n"
+    "sqadd z24.s, z24.s, z16.s\n"
+    "add z25.s, z25.s, z12.s\n"
+    "smin z13.s, p4/M, z13.s, z17.s\n"
+    "smin z15.s, p4/M, z15.s, z17.s\n"
+    "smin z25.s, p4/M, z25.s, z17.s\n"
+    ".inst 0x44829269  // srshl z9.s, p4/M, z9.s, z19.s\n"
+    "smax z13.s, p4/M, z13.s, z14.s\n"
+    "smax z15.s, p4/M, z15.s, z14.s\n"
+    "smax z25.s, p4/M, z25.s, z14.s\n"
+    "add z9.s, z9.s, z12.s\n"
+    ".inst 0x44829297  // srshl z23.s, p4/M, z23.s, z20.s\n"
+    "trn1 z13.h, z13.h, z15.h\n"
+    "st1b { z13.h }, p0, [x11, x15]\n"
+    "smin z9.s, p4/M, z9.s, z17.s\n"
+    ".inst 0x44829276  // srshl z22.s, p4/M, z22.s, z19.s\n"
+    "add z23.s, z23.s, z12.s\n"
+    ".inst 0x4482928a  // srshl z10.s, p4/M, z10.s, z20.s\n"
+    ".inst 0x44829278  // srshl z24.s, p4/M, z24.s, z19.s\n"
+    "add z22.s, z22.s, z12.s\n"
+    "smax z9.s, p4/M, z9.s, z14.s\n"
+    "add z10.s, z10.s, z12.s\n"
+    "add z24.s, z24.s, z12.s\n"
+    "smin z23.s, p4/M, z23.s, z17.s\n"
+    "trn1 z25.h, z25.h, z9.h\n"
+    "st1b { z25.h }, p0, [x10, x15]\n"
+    "smin z22.s, p4/M, z22.s, z17.s\n"
+    "smin z10.s, p4/M, z10.s, z17.s\n"
+    "smax z23.s, p4/M, z23.s, z14.s\n"
+    "smin z24.s, p4/M, z24.s, z17.s\n"
+    "smax z22.s, p4/M, z22.s, z14.s\n"
+    "smax z10.s, p4/M, z10.s, z14.s\n"
+    "smax z24.s, p4/M, z24.s, z14.s\n"
+    "trn1 z23.h, z23.h, z22.h\n"
+    "st1b { z23.h }, p0, [x9, x15]\n"
+    "trn1 z10.h, z10.h, z24.h\n"
+    "st1b { z10.h }, p0, [x28, x15]\n"
+    "inch x15\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z10.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z13.s, z10.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z15.s, z10.s, z16.s\n"
+    "mov z25.d, z13.d\n"
+    "ld1sb { z0.h }, p4/Z, [x17]\n"
+    "mov z23.d, z13.d\n"
+    "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n"
+    "mov z9.d, z15.d\n"
+    "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n"
+    "mov z22.d, z15.d\n"
+    "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n"
+    "mov z10.d, z13.d\n"
+    "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n"
+    "mov z24.d, z15.d\n"
+    "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n"
+    ".inst 0x455a1000  // ssublb z0.h, z0.b, z26.b\n"
+    "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n"
+    ".inst 0x455a1021  // ssublb z1.h, z1.b, z26.b\n"
+    "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+    "inch x17, ALL, MUL #8\n"
+    ".inst 0x455a1042  // ssublb z2.h, z2.b, z26.b\n"
+    "ld1sb { z8.h }, p4/Z, [x17]\n"
+    ".inst 0x455a1063  // ssublb z3.h, z3.b, z26.b\n"
+    "ldp x23, x22, [x13, #0x0]\n"
+    ".inst 0x455a1084  // ssublb z4.h, z4.b, z26.b\n"
+    "ldp x21, x20, [x13, #0x10]\n"
+    ".inst 0x455a10a5  // ssublb z5.h, z5.b, z26.b\n"
+    ".inst 0x455a10c6  // ssublb z6.h, z6.b, z26.b\n"
+    "ldr x19, [x13, #0x20]\n"
+    ".inst 0x455a10e7  // ssublb z7.h, z7.b, z26.b\n"
+    ".inst 0x455a1108  // ssublb z8.h, z8.b, z26.b\n"
+    "ld1b { z31.h }, p3/Z, [x23, x16]\n"
+    "ld1b { z30.h }, p3/Z, [x22, x16]\n"
+    ".inst 0x454b1bff  // usublb z31.h, z31.b, z11.b\n"
+    "ld1b { z29.h }, p3/Z, [x21, x16]\n"
+    ".inst 0x454b1bde  // usublb z30.h, z30.b, z11.b\n"
+    "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+    "ld1b { z27.h }, p3/Z, [x19, x16]\n"
+    ".inst 0x454b1bbd  // usublb z29.h, z29.b, z11.b\n"
+    ".inst 0x454b1b9c  // usublb z28.h, z28.b, z11.b\n"
+    ".inst 0x454b1b7b  // usublb z27.h, z27.b, z11.b\n"
+    "b.any 1b\n"
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp

new file mode 100644
index 0000000..dc33a3f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef int8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
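+
+  // The input tile size follows from the geometry above:
+  // input_rows = (output_rows - 1) * stride_rows + kernel_rows = (2 - 1) * 2 + 3 = 5.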
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+  sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..63960f0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[25];
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
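+      // Remap the caller's row-major input pointers into the order expected
+      // by the assembly below, which loads them in its own access pattern.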
+      inptrs[0] = inptrs_raw[12];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[1];
+      inptrs[3] = inptrs_raw[3];
+      inptrs[4] = inptrs_raw[4];
+      inptrs[5] = inptrs_raw[5];
+      inptrs[6] = inptrs_raw[6];
+      inptrs[7] = inptrs_raw[2];
+      inptrs[8] = inptrs_raw[8];
+      inptrs[9] = inptrs_raw[9];
+      inptrs[10] = inptrs_raw[7];
+      inptrs[11] = inptrs_raw[15];
+      inptrs[12] = inptrs_raw[10];
+      inptrs[13] = inptrs_raw[16];
+      inptrs[14] = inptrs_raw[11];
+      inptrs[15] = inptrs_raw[18];
+      inptrs[16] = inptrs_raw[13];
+      inptrs[17] = inptrs_raw[19];
+      inptrs[18] = inptrs_raw[20];
+      inptrs[19] = inptrs_raw[14];
+      inptrs[20] = inptrs_raw[21];
+      inptrs[21] = inptrs_raw[17];
+      inptrs[22] = inptrs_raw[23];
+      inptrs[23] = inptrs_raw[22];
+      inptrs[24] = inptrs_raw[24];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
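+  // Orientation for the generated assembly below: the prologue widens the
+  // int8 weights (ssublb, subtracting the weight offset) and the uint8 inputs
+  // (usublb, subtracting the activation offset) to 16 bits, so smlalb/smlalt
+  // can accumulate 16x16->32-bit products on top of the bias. The loop then
+  // requantises and stores one 2x2 output tile per vector of channels,
+  // repeating while any channel remains (b.any).
+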
+  __asm__ __volatile__(
+    "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "ptrue p4.b\n"
+    "ldr x5, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x6, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "mov x7, #0x0\n"
+    "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1rb { z16.b }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1rb { z12.b }, p4/Z, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1rw { z14.s }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1rw { z17.s }, p4/Z, [x20]\n"
+    "whilelt p3.h, x6, x4\n"
+    "ld1rw { z15.s }, p4/Z, [x19]\n"
+    "whilelt p2.s, x6, x4\n"
+    "ldp x15, x14, [x21, #0x0]\n"
+    "mov x19, x6\n"
+    "incw x19\n"
+    "ldp x13, x12, [x21, #0x10]\n"
+    "whilelt p1.s, x19, x4\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z20.s }, p2/Z, [x19]\n"
+    "ld1w { z10.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z13.s, z20.s, z10.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z20.s, z20.s, z10.s\n"
+    "mov z11.d, z13.d\n"
+    "ld1sb { z0.h }, p4/Z, [x5]\n"
+    "mov z9.d, z13.d\n"
+    "ld1sb { z1.h }, p4/Z, [x5, #1, MUL VL]\n"
+    "mov z18.d, z20.d\n"
+    "ld1sb { z2.h }, p4/Z, [x5, #2, MUL VL]\n"
+    "mov z19.d, z20.d\n"
+    "ld1sb { z3.h }, p4/Z, [x5, #3, MUL VL]\n"
+    "mov z23.d, z13.d\n"
+    "ld1sb { z4.h }, p4/Z, [x5, #4, MUL VL]\n"
+    "mov z21.d, z20.d\n"
+    "ld1sb { z5.h }, p4/Z, [x5, #5, MUL VL]\n"
+    ".inst 0x454c1000  // ssublb z0.h, z0.b, z12.b\n"
+    "ld1sb { z6.h }, p4/Z, [x5, #6, MUL VL]\n"
+    ".inst 0x454c1021  // ssublb z1.h, z1.b, z12.b\n"
+    "ld1sb { z7.h }, p4/Z, [x5, #7, MUL VL]\n"
+    "inch x5, ALL, MUL #8\n"
+    ".inst 0x454c1042  // ssublb z2.h, z2.b, z12.b\n"
+    "ld1sb { z8.h }, p4/Z, [x5]\n"
+    ".inst 0x454c1063  // ssublb z3.h, z3.b, z12.b\n"
+    "ldp x26, x25, [x17, #0x0]\n"
+    ".inst 0x454c1084  // ssublb z4.h, z4.b, z12.b\n"
+    "ldp x24, x23, [x17, #0x10]\n"
+    ".inst 0x454c10a5  // ssublb z5.h, z5.b, z12.b\n"
+    ".inst 0x454c10c6  // ssublb z6.h, z6.b, z12.b\n"
+    "ldp x22, x21, [x17, #0x20]\n"
+    ".inst 0x454c10e7  // ssublb z7.h, z7.b, z12.b\n"
+    ".inst 0x454c1108  // ssublb z8.h, z8.b, z12.b\n"
+    "ldp x20, x19, [x17, #0x30]\n"
+    "ld1b { z31.h }, p3/Z, [x26, x6]\n"
+    ".inst 0x45501bff  // usublb z31.h, z31.b, z16.b\n"
+    "ld1b { z30.h }, p3/Z, [x25, x6]\n"
+    "ld1b { z29.h }, p3/Z, [x24, x6]\n"
+    ".inst 0x45501bde  // usublb z30.h, z30.b, z16.b\n"
+    "ld1b { z28.h }, p3/Z, [x23, x6]\n"
+    ".inst 0x45501bbd  // usublb z29.h, z29.b, z16.b\n"
+    "ld1b { z27.h }, p3/Z, [x22, x6]\n"
+    "ld1b { z26.h }, p3/Z, [x21, x6]\n"
+    ".inst 0x45501b9c  // usublb z28.h, z28.b, z16.b\n"
+    "ld1b { z25.h }, p3/Z, [x20, x6]\n"
+    "ld1b { z24.h }, p3/Z, [x19, x6]\n"
+    ".inst 0x45501b7b  // usublb z27.h, z27.b, z16.b\n"
+    ".inst 0x45501b5a  // usublb z26.h, z26.b, z16.b\n"
+    ".inst 0x45501b39  // usublb z25.h, z25.b, z16.b\n"
+    ".inst 0x45501b18  // usublb z24.h, z24.b, z16.b\n"
+    "1:"  // Loop
+    ".inst 0x448843ed  // smlalb z13.s, p4/M, z31.h, z8.h\n"
+    "ldr x22, [x17, #0x40]\n"
+    "whilelt p0.h, x7, x4\n"
+    ".inst 0x448847f4  // smlalt z20.s, p4/M, z31.h, z8.h\n"
+    "ldr x21, [x17, #0x48]\n"
+    "inch x5\n"
+    ".inst 0x448643eb  // smlalb z11.s, p4/M, z31.h, z6.h\n"
+    "ldr x20, [x17, #0x50]\n"
+    ".inst 0x448647f2  // smlalt z18.s, p4/M, z31.h, z6.h\n"
+    "ldr x19, [x17, #0x58]\n"
+    ".inst 0x448243e9  // smlalb z9.s, p4/M, z31.h, z2.h\n"
+    "ldr x11, [x17, #0x60]\n"
+    ".inst 0x448247f3  // smlalt z19.s, p4/M, z31.h, z2.h\n"
+    "ldr x10, [x17, #0x68]\n"
+    ".inst 0x448043f7  // smlalb z23.s, p4/M, z31.h, z0.h\n"
+    "ldr x9, [x17, #0x70]\n"
+    ".inst 0x448047f5  // smlalt z21.s, p4/M, z31.h, z0.h\n"
+    "ldr x28, [x17, #0x78]\n"
+    ".inst 0x448043cd  // smlalb z13.s, p4/M, z30.h, z0.h\n"
+    "ldr x27, [x17, #0x80]\n"
+    ".inst 0x448047d4  // smlalt z20.s, p4/M, z30.h, z0.h\n"
+    "ldr x26, [x17, #0x88]\n"
+    ".inst 0x4481438b  // smlalb z11.s, p4/M, z28.h, z1.h\n"
+    "ldr x25, [x17, #0x90]\n"
+    ".inst 0x44814792  // smlalt z18.s, p4/M, z28.h, z1.h\n"
+    "ld1b { z28.h }, p3/Z, [x21, x6]\n"
+    ".inst 0x448143ad  // smlalb z13.s, p4/M, z29.h, z1.h\n"
+    "ldr x24, [x17, #0x98]\n"
+    ".inst 0x448147b4  // smlalt z20.s, p4/M, z29.h, z1.h\n"
+    "ld1b { z29.h }, p3/Z, [x22, x6]\n"
+    ".inst 0x4482436b  // smlalb z11.s, p4/M, z27.h, z2.h\n"
+    "ldr x23, [x17, #0xa0]\n"
+    ".inst 0x45501b9c  // usublb z28.h, z28.b, z16.b\n"
+    "ldr x22, [x17, #0xa8]\n"
+    ".inst 0x44824772  // smlalt z18.s, p4/M, z27.h, z2.h\n"
+    "ld1b { z27.h }, p3/Z, [x20, x6]\n"
+    ".inst 0x45501bbd  // usublb z29.h, z29.b, z16.b\n"
+    "ldr x21, [x17, #0xb0]\n"
+    ".inst 0x4483434d  // smlalb z13.s, p4/M, z26.h, z3.h\n"
+    "ldr x20, [x17, #0xb8]\n"
+    ".inst 0x44834754  // smlalt z20.s, p4/M, z26.h, z3.h\n"
+    "ld1b { z26.h }, p3/Z, [x19, x6]\n"
+    ".inst 0x45501b7b  // usublb z27.h, z27.b, z16.b\n"
+    "ldr x19, [x17, #0xc0]\n"
+    ".inst 0x4480430b  // smlalb z11.s, p4/M, z24.h, z0.h\n"
+    "ld1w { z10.s }, p2/Z, [x8]\n"
+    ".inst 0x4484432d  // smlalb z13.s, p4/M, z25.h, z4.h\n"
+    "ld1w { z22.s }, p1/Z, [x8, #1, MUL VL]\n"
+    "addvl x8, x8, #2\n"
+    ".inst 0x45501b5a  // usublb z26.h, z26.b, z16.b\n"
+    ".inst 0x44844734  // smlalt z20.s, p4/M, z25.h, z4.h\n"
+    "ld1b { z25.h }, p3/Z, [x11, x6]\n"
+    ".inst 0x44804712  // smlalt z18.s, p4/M, z24.h, z0.h\n"
+    "uzp1 z31.s, z10.s, z22.s\n"
+    "uzp2 z30.s, z10.s, z22.s\n"
+    "ld1w { z10.s }, p2/Z, [x16]\n"
+    ".inst 0x45501b39  // usublb z25.h, z25.b, z16.b\n"
+    "ld1w { z22.s }, p1/Z, [x16, #1, MUL VL]\n"
+    "addvl x16, x16, #2\n"
+    ".inst 0x4482430d  // smlalb z13.s, p4/M, z24.h, z2.h\n"
+    ".inst 0x44824714  // smlalt z20.s, p4/M, z24.h, z2.h\n"
+    "ld1b { z24.h }, p3/Z, [x9, x6]\n"
+    ".inst 0x448443ab  // smlalb z11.s, p4/M, z29.h, z4.h\n"
+    ".inst 0x448447b2  // smlalt z18.s, p4/M, z29.h, z4.h\n"
+    "ld1b { z29.h }, p3/Z, [x10, x6]\n"
+    ".inst 0x44834349  // smlalb z9.s, p4/M, z26.h, z3.h\n"
+    ".inst 0x45501b18  // usublb z24.h, z24.b, z16.b\n"
+    ".inst 0x4485438b  // smlalb z11.s, p4/M, z28.h, z5.h\n"
+    ".inst 0x45501bbd  // usublb z29.h, z29.b, z16.b\n"
+    ".inst 0x44854792  // smlalt z18.s, p4/M, z28.h, z5.h\n"
+    "ld1b { z28.h }, p3/Z, [x27, x6]\n"
+    ".inst 0x4485436d  // smlalb z13.s, p4/M, z27.h, z5.h\n"
+    ".inst 0x44854774  // smlalt z20.s, p4/M, z27.h, z5.h\n"
+    ".inst 0x4483436b  // smlalb z11.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x45501b9c  // usublb z28.h, z28.b, z16.b\n"
+    ".inst 0x44834772  // smlalt z18.s, p4/M, z27.h, z3.h\n"
+    "ld1b { z27.h }, p3/Z, [x28, x6]\n"
+    ".inst 0x44834753  // smlalt z19.s, p4/M, z26.h, z3.h\n"
+    "ld1b { z26.h }, p3/Z, [x26, x6]\n"
+    ".inst 0x4486432d  // smlalb z13.s, p4/M, z25.h, z6.h\n"
+    ".inst 0x44864734  // smlalt z20.s, p4/M, z25.h, z6.h\n"
+    ".inst 0x45501b7b  // usublb z27.h, z27.b, z16.b\n"
+    ".inst 0x45501b5a  // usublb z26.h, z26.b, z16.b\n"
+    ".inst 0x44804329  // smlalb z9.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x44804733  // smlalt z19.s, p4/M, z25.h, z0.h\n"
+    "ld1b { z25.h }, p3/Z, [x25, x6]\n"
+    "uzp1 z0.s, z10.s, z22.s\n"
+    "uzp2 z22.s, z10.s, z22.s\n"
+    ".inst 0x448443a9  // smlalb z9.s, p4/M, z29.h, z4.h\n"
+    ".inst 0x45501b39  // usublb z25.h, z25.b, z16.b\n"
+    ".inst 0x448447b3  // smlalt z19.s, p4/M, z29.h, z4.h\n"
+    "ld1b { z29.h }, p3/Z, [x24, x6]\n"
+    ".inst 0x4487430d  // smlalb z13.s, p4/M, z24.h, z7.h\n"
+    ".inst 0x44874714  // smlalt z20.s, p4/M, z24.h, z7.h\n"
+    ".inst 0x44814309  // smlalb z9.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x45501bbd  // usublb z29.h, z29.b, z16.b\n"
+    ".inst 0x04bf75ad  // sqrdmulh z13.s, z13.s, z31.s\n"
+    ".inst 0x04be7694  // sqrdmulh z20.s, z20.s, z30.s\n"
+    ".inst 0x44814713  // smlalt z19.s, p4/M, z24.h, z1.h\n"
+    "ld1b { z24.h }, p3/Z, [x22, x6]\n"
+    ".inst 0x44844377  // smlalb z23.s, p4/M, z27.h, z4.h\n"
+    "and z10.d, z13.d, z0.d\n"
+    ".inst 0x44844775  // smlalt z21.s, p4/M, z27.h, z4.h\n"
+    "ld1b { z27.h }, p3/Z, [x23, x6]\n"
+    ".inst 0x45501b18  // usublb z24.h, z24.b, z16.b\n"
+    "asr z10.s, z10.s, #0x1f\n"
+    "and z4.d, z20.d, z22.d\n"
+    ".inst 0x45501b7b  // usublb z27.h, z27.b, z16.b\n"
+    "sqadd z13.s, z13.s, z10.s\n"
+    "asr z4.s, z4.s, #0x1f\n"
+    ".inst 0x4487438b  // smlalb z11.s, p4/M, z28.h, z7.h\n"
+    ".inst 0x44874792  // smlalt z18.s, p4/M, z28.h, z7.h\n"
+    "sqadd z20.s, z20.s, z4.s\n"
+    ".inst 0x44814397  // smlalb z23.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x44814795  // smlalt z21.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x44864329  // smlalb z9.s, p4/M, z25.h, z6.h\n"
+    ".inst 0x44864733  // smlalt z19.s, p4/M, z25.h, z6.h\n"
+    "ld1b { z25.h }, p3/Z, [x20, x6]\n"
+    ".inst 0x44854357  // smlalb z23.s, p4/M, z26.h, z5.h\n"
+    ".inst 0x44854755  // smlalt z21.s, p4/M, z26.h, z5.h\n"
+    "ld1b { z26.h }, p3/Z, [x21, x6]\n"
+    ".inst 0x448843ab  // smlalb z11.s, p4/M, z29.h, z8.h\n"
+    ".inst 0x45501b39  // usublb z25.h, z25.b, z16.b\n"
+    ".inst 0x448847b2  // smlalt z18.s, p4/M, z29.h, z8.h\n"
+    ".inst 0x45501b5a  // usublb z26.h, z26.b, z16.b\n"
+    ".inst 0x04bf756b  // sqrdmulh z11.s, z11.s, z31.s\n"
+    ".inst 0x448243b7  // smlalb z23.s, p4/M, z29.h, z2.h\n"
+    ".inst 0x04be7652  // sqrdmulh z18.s, z18.s, z30.s\n"
+    ".inst 0x448247b5  // smlalt z21.s, p4/M, z29.h, z2.h\n"
+    "ld1b { z29.h }, p3/Z, [x19, x6]\n"
+    "inch x6\n"
+    "and z2.d, z11.d, z0.d\n"
+    "whilelt p2.s, x6, x4\n"
+    ".inst 0x44874369  // smlalb z9.s, p4/M, z27.h, z7.h\n"
+    "mov x19, x6\n"
+    "and z10.d, z18.d, z22.d\n"
+    "incw x19\n"
+    ".inst 0x45501bbd  // usublb z29.h, z29.b, z16.b\n"
+    "whilelt p1.s, x19, x4\n"
+    "asr z2.s, z2.s, #0x1f\n"
+    "whilelt p3.h, x6, x4\n"
+    "asr z10.s, z10.s, #0x1f\n"
+    ".inst 0x44874773  // smlalt z19.s, p4/M, z27.h, z7.h\n"
+    "sqadd z11.s, z11.s, z2.s\n"
+    "sqadd z18.s, z18.s, z10.s\n"
+    ".inst 0x44854309  // smlalb z9.s, p4/M, z24.h, z5.h\n"
+    ".inst 0x44854713  // smlalt z19.s, p4/M, z24.h, z5.h\n"
+    ".inst 0x44834317  // smlalb z23.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x44834715  // smlalt z21.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x44884329  // smlalb z9.s, p4/M, z25.h, z8.h\n"
+    ".inst 0x44884733  // smlalt z19.s, p4/M, z25.h, z8.h\n"
+    ".inst 0x44874357  // smlalb z23.s, p4/M, z26.h, z7.h\n"
+    ".inst 0x44874755  // smlalt z21.s, p4/M, z26.h, z7.h\n"
+    ".inst 0x04bf7529  // sqrdmulh z9.s, z9.s, z31.s\n"
+    ".inst 0x04be7673  // sqrdmulh z19.s, z19.s, z30.s\n"
+    ".inst 0x44864337  // smlalb z23.s, p4/M, z25.h, z6.h\n"
+    ".inst 0x44864735  // smlalt z21.s, p4/M, z25.h, z6.h\n"
+    "and z10.d, z9.d, z0.d\n"
+    "and z24.d, z19.d, z22.d\n"
+    ".inst 0x448843b7  // smlalb z23.s, p4/M, z29.h, z8.h\n"
+    "asr z10.s, z10.s, #0x1f\n"
+    "asr z24.s, z24.s, #0x1f\n"
+    ".inst 0x448847b5  // smlalt z21.s, p4/M, z29.h, z8.h\n"
+    "sqadd z9.s, z9.s, z10.s\n"
+    "sqadd z19.s, z19.s, z24.s\n"
+    ".inst 0x04bf76f7  // sqrdmulh z23.s, z23.s, z31.s\n"
+    ".inst 0x04be76b5  // sqrdmulh z21.s, z21.s, z30.s\n"
+    ".inst 0x4482900d  // srshl z13.s, p4/M, z13.s, z0.s\n"
+    ".inst 0x448292d4  // srshl z20.s, p4/M, z20.s, z22.s\n"
+    "and z30.d, z23.d, z0.d\n"
+    "and z28.d, z21.d, z22.d\n"
+    "add z13.s, z13.s, z14.s\n"
+    "add z20.s, z20.s, z14.s\n"
+    "asr z30.s, z30.s, #0x1f\n"
+    "asr z28.s, z28.s, #0x1f\n"
+    "smin z13.s, p4/M, z13.s, z15.s\n"
+    "sqadd z23.s, z23.s, z30.s\n"
+    "sqadd z21.s, z21.s, z28.s\n"
+    "smin z20.s, p4/M, z20.s, z15.s\n"
+    "smax z13.s, p4/M, z13.s, z17.s\n"
+    ".inst 0x4482900b  // srshl z11.s, p4/M, z11.s, z0.s\n"
+    ".inst 0x448292d2  // srshl z18.s, p4/M, z18.s, z22.s\n"
+    "smax z20.s, p4/M, z20.s, z17.s\n"
+    ".inst 0x44829009  // srshl z9.s, p4/M, z9.s, z0.s\n"
+    "add z11.s, z11.s, z14.s\n"
+    "add z18.s, z18.s, z14.s\n"
+    "trn1 z13.h, z13.h, z20.h\n"
+    "st1b { z13.h }, p0, [x15, x7]\n"
+    "add z9.s, z9.s, z14.s\n"
+    "smin z11.s, p4/M, z11.s, z15.s\n"
+    "smin z18.s, p4/M, z18.s, z15.s\n"
+    ".inst 0x448292d3  // srshl z19.s, p4/M, z19.s, z22.s\n"
+    "smin z9.s, p4/M, z9.s, z15.s\n"
+    "smax z11.s, p4/M, z11.s, z17.s\n"
+    "smax z18.s, p4/M, z18.s, z17.s\n"
+    "add z19.s, z19.s, z14.s\n"
+    "smax z9.s, p4/M, z9.s, z17.s\n"
+    ".inst 0x44829017  // srshl z23.s, p4/M, z23.s, z0.s\n"
+    "trn1 z11.h, z11.h, z18.h\n"
+    "st1b { z11.h }, p0, [x14, x7]\n"
+    "smin z19.s, p4/M, z19.s, z15.s\n"
+    ".inst 0x448292d5  // srshl z21.s, p4/M, z21.s, z22.s\n"
+    "add z23.s, z23.s, z14.s\n"
+    "add z21.s, z21.s, z14.s\n"
+    "smax z19.s, p4/M, z19.s, z17.s\n"
+    "smin z23.s, p4/M, z23.s, z15.s\n"
+    "smin z21.s, p4/M, z21.s, z15.s\n"
+    "trn1 z9.h, z9.h, z19.h\n"
+    "st1b { z9.h }, p0, [x13, x7]\n"
+    "smax z23.s, p4/M, z23.s, z17.s\n"
+    "smax z21.s, p4/M, z21.s, z17.s\n"
+    "trn1 z23.h, z23.h, z21.h\n"
+    "st1b { z23.h }, p0, [x12, x7]\n"
+    "inch x7\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z20.s }, p2/Z, [x19]\n"
+    "ld1w { z10.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z13.s, z20.s, z10.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z20.s, z20.s, z10.s\n"
+    "mov z11.d, z13.d\n"
+    "ld1sb { z0.h }, p4/Z, [x5]\n"
+    "mov z9.d, z13.d\n"
+    "ld1sb { z1.h }, p4/Z, [x5, #1, MUL VL]\n"
+    "mov z18.d, z20.d\n"
+    "ld1sb { z2.h }, p4/Z, [x5, #2, MUL VL]\n"
+    "mov z19.d, z20.d\n"
+    "ld1sb { z3.h }, p4/Z, [x5, #3, MUL VL]\n"
+    "mov z23.d, z13.d\n"
+    "ld1sb { z4.h }, p4/Z, [x5, #4, MUL VL]\n"
+    "mov z21.d, z20.d\n"
+    "ld1sb { z5.h }, p4/Z, [x5, #5, MUL VL]\n"
+    ".inst 0x454c1000  // ssublb z0.h, z0.b, z12.b\n"
+    "ld1sb { z6.h }, p4/Z, [x5, #6, MUL VL]\n"
+    ".inst 0x454c1021  // ssublb z1.h, z1.b, z12.b\n"
+    "ld1sb { z7.h }, p4/Z, [x5, #7, MUL VL]\n"
+    "inch x5, ALL, MUL #8\n"
+    ".inst 0x454c1042  // ssublb z2.h, z2.b, z12.b\n"
+    "ld1sb { z8.h }, p4/Z, [x5]\n"
+    ".inst 0x454c1063  // ssublb z3.h, z3.b, z12.b\n"
+    "ldp x26, x25, [x17, #0x0]\n"
+    ".inst 0x454c1084  // ssublb z4.h, z4.b, z12.b\n"
+    "ldp x24, x23, [x17, #0x10]\n"
+    ".inst 0x454c10a5  // ssublb z5.h, z5.b, z12.b\n"
+    ".inst 0x454c10c6  // ssublb z6.h, z6.b, z12.b\n"
+    "ldp x22, x21, [x17, #0x20]\n"
+    ".inst 0x454c10e7  // ssublb z7.h, z7.b, z12.b\n"
+    ".inst 0x454c1108  // ssublb z8.h, z8.b, z12.b\n"
+    "ldp x20, x19, [x17, #0x30]\n"
+    "ld1b { z31.h }, p3/Z, [x26, x6]\n"
+    ".inst 0x45501bff  // usublb z31.h, z31.b, z16.b\n"
+    "ld1b { z30.h }, p3/Z, [x25, x6]\n"
+    "ld1b { z29.h }, p3/Z, [x24, x6]\n"
+    ".inst 0x45501bde  // usublb z30.h, z30.b, z16.b\n"
+    "ld1b { z28.h }, p3/Z, [x23, x6]\n"
+    ".inst 0x45501bbd  // usublb z29.h, z29.b, z16.b\n"
+    "ld1b { z27.h }, p3/Z, [x22, x6]\n"
+    "ld1b { z26.h }, p3/Z, [x21, x6]\n"
+    ".inst 0x45501b9c  // usublb z28.h, z28.b, z16.b\n"
+    "ld1b { z25.h }, p3/Z, [x20, x6]\n"
+    "ld1b { z24.h }, p3/Z, [x19, x6]\n"
+    ".inst 0x45501b7b  // usublb z27.h, z27.b, z16.b\n"
+    ".inst 0x45501b5a  // usublb z26.h, z26.b, z16.b\n"
+    ".inst 0x45501b39  // usublb z25.h, z25.b, z16.b\n"
+    ".inst 0x45501b18  // usublb z24.h, z24.b, z16.b\n"
+    "b.any 1b\n"
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000..906ef36
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef int8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
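+
+  // 6x6 input tile: (output_rows - 1) * stride_rows + kernel_rows = (2 - 1) * 1 + 5 = 6.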
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_5x5_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_5x5_mla::get_packed_size;
+
+  kern_type kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+  sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000..6c321ef
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[36];
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
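+      // The first few pointers are reordered to match the assembly's access
+      // pattern; from index 13 onwards the mapping is the identity.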
+      inptrs[0] = inptrs_raw[0];
+      inptrs[1] = inptrs_raw[1];
+      inptrs[2] = inptrs_raw[6];
+      inptrs[3] = inptrs_raw[7];
+      inptrs[4] = inptrs_raw[2];
+      inptrs[5] = inptrs_raw[8];
+      inptrs[6] = inptrs_raw[3];
+      inptrs[7] = inptrs_raw[4];
+      inptrs[8] = inptrs_raw[11];
+      inptrs[9] = inptrs_raw[12];
+      inptrs[10] = inptrs_raw[9];
+      inptrs[11] = inptrs_raw[10];
+      inptrs[12] = inptrs_raw[5];
+      inptrs[13] = inptrs_raw[13];
+      inptrs[14] = inptrs_raw[14];
+      inptrs[15] = inptrs_raw[15];
+      inptrs[16] = inptrs_raw[16];
+      inptrs[17] = inptrs_raw[17];
+      inptrs[18] = inptrs_raw[18];
+      inptrs[19] = inptrs_raw[19];
+      inptrs[20] = inptrs_raw[20];
+      inptrs[21] = inptrs_raw[21];
+      inptrs[22] = inptrs_raw[22];
+      inptrs[23] = inptrs_raw[23];
+      inptrs[24] = inptrs_raw[24];
+      inptrs[25] = inptrs_raw[25];
+      inptrs[26] = inptrs_raw[26];
+      inptrs[27] = inptrs_raw[27];
+      inptrs[28] = inptrs_raw[28];
+      inptrs[29] = inptrs_raw[29];
+      inptrs[30] = inptrs_raw[30];
+      inptrs[31] = inptrs_raw[31];
+      inptrs[32] = inptrs_raw[32];
+      inptrs[33] = inptrs_raw[33];
+      inptrs[34] = inptrs_raw[34];
+      inptrs[35] = inptrs_raw[35];
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
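+  // Same overall scheme as the 3x3 kernels: widen weights and inputs against
+  // their zero points, multiply-accumulate into 32-bit lanes per output pixel,
+  // then requantise and store. Here the 25 weights do not fit in registers at
+  // once, so five at a time are cycled through z0-z4 as the loop progresses.
+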
+  __asm__ __volatile__(
+    "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "ptrue p4.b\n"
+    "ldr x1, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x2, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "mov x3, #0x0\n"
+    "ldr x4, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1rb { z9.b }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1rb { z14.b }, p4/Z, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1rw { z17.s }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1rw { z12.s }, p4/Z, [x20]\n"
+    "whilelt p3.h, x2, x0\n"
+    "ld1rw { z11.s }, p4/Z, [x19]\n"
+    "whilelt p2.s, x2, x0\n"
+    "ldp x7, x8, [x21, #0x0]\n"
+    "mov x19, x2\n"
+    "incw x19\n"
+    "ldp x17, x16, [x21, #0x10]\n"
+    "whilelt p1.s, x19, x0\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z4.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z15.s, z4.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z18.s, z4.s, z16.s\n"
+    "mov z21.d, z15.d\n"
+    "ld1sb { z0.h }, p4/Z, [x1]\n"
+    "mov z5.d, z15.d\n"
+    "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
+    "mov z13.d, z18.d\n"
+    "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
+    "mov z7.d, z18.d\n"
+    "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
+    "mov z6.d, z15.d\n"
+    "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
+    "mov z8.d, z18.d\n"
+    "ldp x28, x27, [x5, #0x0]\n"
+    ".inst 0x454e1000  // ssublb z0.h, z0.b, z14.b\n"
+    "ldp x26, x25, [x5, #0x10]\n"
+    ".inst 0x454e1021  // ssublb z1.h, z1.b, z14.b\n"
+    ".inst 0x454e1042  // ssublb z2.h, z2.b, z14.b\n"
+    "ldp x24, x23, [x5, #0x20]\n"
+    ".inst 0x454e1063  // ssublb z3.h, z3.b, z14.b\n"
+    ".inst 0x454e1084  // ssublb z4.h, z4.b, z14.b\n"
+    "ldp x22, x21, [x5, #0x30]\n"
+    "ldp x20, x19, [x5, #0x40]\n"
+    "ld1b { z31.h }, p3/Z, [x28, x2]\n"
+    ".inst 0x45491bff  // usublb z31.h, z31.b, z9.b\n"
+    "ld1b { z30.h }, p3/Z, [x27, x2]\n"
+    "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+    ".inst 0x45491bde  // usublb z30.h, z30.b, z9.b\n"
+    "ld1b { z28.h }, p3/Z, [x25, x2]\n"
+    ".inst 0x45491bbd  // usublb z29.h, z29.b, z9.b\n"
+    "ld1b { z27.h }, p3/Z, [x24, x2]\n"
+    "ld1b { z23.h }, p3/Z, [x23, x2]\n"
+    ".inst 0x45491b9c  // usublb z28.h, z28.b, z9.b\n"
+    "ld1b { z25.h }, p3/Z, [x22, x2]\n"
+    "ld1b { z24.h }, p3/Z, [x21, x2]\n"
+    ".inst 0x45491b7b  // usublb z27.h, z27.b, z9.b\n"
+    ".inst 0x45491af7  // usublb z23.h, z23.b, z9.b\n"
+    "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+    "ld1b { z22.h }, p3/Z, [x19, x2]\n"
+    ".inst 0x45491b39  // usublb z25.h, z25.b, z9.b\n"
+    ".inst 0x45491b18  // usublb z24.h, z24.b, z9.b\n"
+    ".inst 0x45491b5a  // usublb z26.h, z26.b, z9.b\n"
+    ".inst 0x45491ad6  // usublb z22.h, z22.b, z9.b\n"
+    "1:"  // Loop
+    ".inst 0x448043ef  // smlalb z15.s, p4/M, z31.h, z0.h\n"
+    "ldr x20, [x5, #0x50]\n"
+    "whilelt p0.h, x3, x0\n"
+    ".inst 0x448047f2  // smlalt z18.s, p4/M, z31.h, z0.h\n"
+    "ldr x19, [x5, #0x58]\n"
+    ".inst 0x448043d5  // smlalb z21.s, p4/M, z30.h, z0.h\n"
+    "ldr x25, [x5, #0x60]\n"
+    ".inst 0x448047cd  // smlalt z13.s, p4/M, z30.h, z0.h\n"
+    "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x448043a5  // smlalb z5.s, p4/M, z29.h, z0.h\n"
+    "ldr x24, [x5, #0x68]\n"
+    ".inst 0x448047a7  // smlalt z7.s, p4/M, z29.h, z0.h\n"
+    "ldr x23, [x5, #0x70]\n"
+    ".inst 0x44804386  // smlalb z6.s, p4/M, z28.h, z0.h\n"
+    "ldr x22, [x5, #0x78]\n"
+    ".inst 0x45491bff  // usublb z31.h, z31.b, z9.b\n"
+    "ldr x15, [x5, #0x80]\n"
+    ".inst 0x44804788  // smlalt z8.s, p4/M, z28.h, z0.h\n"
+    "ld1sb { z0.h }, p4/Z, [x1, #5, MUL VL]\n"
+    ".inst 0x448143cf  // smlalb z15.s, p4/M, z30.h, z1.h\n"
+    "ldr x21, [x5, #0x88]\n"
+    ".inst 0x448147d2  // smlalt z18.s, p4/M, z30.h, z1.h\n"
+    "ld1b { z30.h }, p3/Z, [x19, x2]\n"
+    ".inst 0x44814375  // smlalb z21.s, p4/M, z27.h, z1.h\n"
+    "ldr x20, [x5, #0x90]\n"
+    ".inst 0x454e1000  // ssublb z0.h, z0.b, z14.b\n"
+    "ldr x19, [x5, #0x98]\n"
+    ".inst 0x4481476d  // smlalt z13.s, p4/M, z27.h, z1.h\n"
+    "ldr x14, [x5, #0xa0]\n"
+    ".inst 0x45491bde  // usublb z30.h, z30.b, z9.b\n"
+    "ldr x13, [x5, #0xa8]\n"
+    ".inst 0x44814385  // smlalb z5.s, p4/M, z28.h, z1.h\n"
+    "ldr x12, [x5, #0xb0]\n"
+    ".inst 0x44814787  // smlalt z7.s, p4/M, z28.h, z1.h\n"
+    "ldr x11, [x5, #0xb8]\n"
+    ".inst 0x448142e6  // smlalb z6.s, p4/M, z23.h, z1.h\n"
+    "ldr x10, [x5, #0xc0]\n"
+    ".inst 0x448146e8  // smlalt z8.s, p4/M, z23.h, z1.h\n"
+    "ld1sb { z1.h }, p4/Z, [x1, #6, MUL VL]\n"
+    ".inst 0x4482436f  // smlalb z15.s, p4/M, z27.h, z2.h\n"
+    "ldr x9, [x5, #0xc8]\n"
+    ".inst 0x44824772  // smlalt z18.s, p4/M, z27.h, z2.h\n"
+    "ld1b { z27.h }, p3/Z, [x25, x2]\n"
+    ".inst 0x44824335  // smlalb z21.s, p4/M, z25.h, z2.h\n"
+    "ldr x28, [x5, #0xd0]\n"
+    ".inst 0x454e1021  // ssublb z1.h, z1.b, z14.b\n"
+    "ldr x27, [x5, #0xd8]\n"
+    ".inst 0x4482472d  // smlalt z13.s, p4/M, z25.h, z2.h\n"
+    "ldr x26, [x5, #0xe0]\n"
+    ".inst 0x45491b7b  // usublb z27.h, z27.b, z9.b\n"
+    "ld1w { z19.s }, p2/Z, [x4]\n"
+    ".inst 0x448242e5  // smlalb z5.s, p4/M, z23.h, z2.h\n"
+    "ld1w { z16.s }, p1/Z, [x4, #1, MUL VL]\n"
+    "addvl x4, x4, #2\n"
+    ".inst 0x448246e7  // smlalt z7.s, p4/M, z23.h, z2.h\n"
+    ".inst 0x448243e6  // smlalb z6.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448247e8  // smlalt z8.s, p4/M, z31.h, z2.h\n"
+    "ld1sb { z2.h }, p4/Z, [x1, #7, MUL VL]\n"
+    "inch x1, ALL, MUL #8\n"
+    "uzp1 z10.s, z19.s, z16.s\n"
+    "uzp2 z20.s, z19.s, z16.s\n"
+    "ld1w { z19.s }, p2/Z, [x6]\n"
+    ".inst 0x4483432f  // smlalb z15.s, p4/M, z25.h, z3.h\n"
+    "ld1w { z16.s }, p1/Z, [x6, #1, MUL VL]\n"
+    "addvl x6, x6, #2\n"
+    ".inst 0x454e1042  // ssublb z2.h, z2.b, z14.b\n"
+    ".inst 0x44834732  // smlalt z18.s, p4/M, z25.h, z3.h\n"
+    "ld1b { z25.h }, p3/Z, [x24, x2]\n"
+    ".inst 0x44834315  // smlalb z21.s, p4/M, z24.h, z3.h\n"
+    "ldr x25, [x5, #0xe8]\n"
+    ".inst 0x4483470d  // smlalt z13.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x448343e5  // smlalb z5.s, p4/M, z31.h, z3.h\n"
+    ".inst 0x45491b39  // usublb z25.h, z25.b, z9.b\n"
+    ".inst 0x448347e7  // smlalt z7.s, p4/M, z31.h, z3.h\n"
+    ".inst 0x448343c6  // smlalb z6.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x448347c8  // smlalt z8.s, p4/M, z30.h, z3.h\n"
+    "ld1sb { z3.h }, p4/Z, [x1]\n"
+    ".inst 0x4484430f  // smlalb z15.s, p4/M, z24.h, z4.h\n"
+    ".inst 0x44844712  // smlalt z18.s, p4/M, z24.h, z4.h\n"
+    "ld1b { z24.h }, p3/Z, [x23, x2]\n"
+    ".inst 0x44844375  // smlalb z21.s, p4/M, z27.h, z4.h\n"
+    "ldr x24, [x5, #0xf0]\n"
+    ".inst 0x454e1063  // ssublb z3.h, z3.b, z14.b\n"
+    ".inst 0x4484476d  // smlalt z13.s, p4/M, z27.h, z4.h\n"
+    "ld1b { z27.h }, p3/Z, [x22, x2]\n"
+    ".inst 0x45491b18  // usublb z24.h, z24.b, z9.b\n"
+    "ldr x23, [x5, #0xf8]\n"
+    ".inst 0x448443c5  // smlalb z5.s, p4/M, z30.h, z4.h\n"
+    ".inst 0x448447c7  // smlalt z7.s, p4/M, z30.h, z4.h\n"
+    ".inst 0x45491b7b  // usublb z27.h, z27.b, z9.b\n"
+    ".inst 0x44844346  // smlalb z6.s, p4/M, z26.h, z4.h\n"
+    ".inst 0x44844748  // smlalt z8.s, p4/M, z26.h, z4.h\n"
+    "ld1sb { z4.h }, p4/Z, [x1, #1, MUL VL]\n"
+    ".inst 0x448043af  // smlalb z15.s, p4/M, z29.h, z0.h\n"
+    ".inst 0x448047b2  // smlalt z18.s, p4/M, z29.h, z0.h\n"
+    "uzp1 z29.s, z19.s, z16.s\n"
+    ".inst 0x454e1084  // ssublb z4.h, z4.b, z14.b\n"
+    "uzp2 z19.s, z19.s, z16.s\n"
+    ".inst 0x44804395  // smlalb z21.s, p4/M, z28.h, z0.h\n"
+    ".inst 0x4480478d  // smlalt z13.s, p4/M, z28.h, z0.h\n"
+    ".inst 0x448042c5  // smlalb z5.s, p4/M, z22.h, z0.h\n"
+    ".inst 0x448046c7  // smlalt z7.s, p4/M, z22.h, z0.h\n"
+    ".inst 0x44804326  // smlalb z6.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x44804728  // smlalt z8.s, p4/M, z25.h, z0.h\n"
+    "ld1sb { z0.h }, p4/Z, [x1, #2, MUL VL]\n"
+    ".inst 0x4481438f  // smlalb z15.s, p4/M, z28.h, z1.h\n"
+    ".inst 0x44814792  // smlalt z18.s, p4/M, z28.h, z1.h\n"
+    "ld1b { z28.h }, p3/Z, [x21, x2]\n"
+    ".inst 0x448142f5  // smlalb z21.s, p4/M, z23.h, z1.h\n"
+    "ldr x22, [x5, #0x100]\n"
+    ".inst 0x454e1000  // ssublb z0.h, z0.b, z14.b\n"
+    ".inst 0x448146ed  // smlalt z13.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x45491b9c  // usublb z28.h, z28.b, z9.b\n"
+    ".inst 0x44814325  // smlalb z5.s, p4/M, z25.h, z1.h\n"
+    ".inst 0x44814727  // smlalt z7.s, p4/M, z25.h, z1.h\n"
+    ".inst 0x44814306  // smlalb z6.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x44814708  // smlalt z8.s, p4/M, z24.h, z1.h\n"
+    "ld1sb { z1.h }, p4/Z, [x1, #3, MUL VL]\n"
+    ".inst 0x448242ef  // smlalb z15.s, p4/M, z23.h, z2.h\n"
+    ".inst 0x448246f2  // smlalt z18.s, p4/M, z23.h, z2.h\n"
+    "ld1b { z23.h }, p3/Z, [x15, x2]\n"
+    ".inst 0x448243f5  // smlalb z21.s, p4/M, z31.h, z2.h\n"
+    "ldr x21, [x5, #0x108]\n"
+    ".inst 0x454e1021  // ssublb z1.h, z1.b, z14.b\n"
+    ".inst 0x448247ed  // smlalt z13.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x45491af7  // usublb z23.h, z23.b, z9.b\n"
+    ".inst 0x44824305  // smlalb z5.s, p4/M, z24.h, z2.h\n"
+    ".inst 0x44824707  // smlalt z7.s, p4/M, z24.h, z2.h\n"
+    ".inst 0x44824366  // smlalb z6.s, p4/M, z27.h, z2.h\n"
+    ".inst 0x44824768  // smlalt z8.s, p4/M, z27.h, z2.h\n"
+    "ld1sb { z2.h }, p4/Z, [x1, #4, MUL VL]\n"
+    ".inst 0x448343ef  // smlalb z15.s, p4/M, z31.h, z3.h\n"
+    ".inst 0x448347f2  // smlalt z18.s, p4/M, z31.h, z3.h\n"
+    "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x448343d5  // smlalb z21.s, p4/M, z30.h, z3.h\n"
+    "ldr x20, [x5, #0x110]\n"
+    ".inst 0x454e1042  // ssublb z2.h, z2.b, z14.b\n"
+    ".inst 0x448347cd  // smlalt z13.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x45491bff  // usublb z31.h, z31.b, z9.b\n"
+    ".inst 0x44834365  // smlalb z5.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x44834767  // smlalt z7.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x448342e6  // smlalb z6.s, p4/M, z23.h, z3.h\n"
+    ".inst 0x448346e8  // smlalt z8.s, p4/M, z23.h, z3.h\n"
+    "ld1sb { z3.h }, p4/Z, [x1, #5, MUL VL]\n"
+    ".inst 0x448443cf  // smlalb z15.s, p4/M, z30.h, z4.h\n"
+    ".inst 0x448447d2  // smlalt z18.s, p4/M, z30.h, z4.h\n"
+    "ld1b { z30.h }, p3/Z, [x19, x2]\n"
+    ".inst 0x44844355  // smlalb z21.s, p4/M, z26.h, z4.h\n"
+    "ldr x19, [x5, #0x118]\n"
+    ".inst 0x454e1063  // ssublb z3.h, z3.b, z14.b\n"
+    ".inst 0x4484474d  // smlalt z13.s, p4/M, z26.h, z4.h\n"
+    "ld1b { z26.h }, p3/Z, [x14, x2]\n"
+    ".inst 0x45491bde  // usublb z30.h, z30.b, z9.b\n"
+    ".inst 0x448442e5  // smlalb z5.s, p4/M, z23.h, z4.h\n"
+    ".inst 0x448446e7  // smlalt z7.s, p4/M, z23.h, z4.h\n"
+    ".inst 0x45491b5a  // usublb z26.h, z26.b, z9.b\n"
+    ".inst 0x44844386  // smlalb z6.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844788  // smlalt z8.s, p4/M, z28.h, z4.h\n"
+    "ld1sb { z4.h }, p4/Z, [x1, #6, MUL VL]\n"
+    ".inst 0x448042cf  // smlalb z15.s, p4/M, z22.h, z0.h\n"
+    ".inst 0x448046d2  // smlalt z18.s, p4/M, z22.h, z0.h\n"
+    "ld1b { z22.h }, p3/Z, [x11, x2]\n"
+    ".inst 0x44804335  // smlalb z21.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x454e1084  // ssublb z4.h, z4.b, z14.b\n"
+    ".inst 0x4480472d  // smlalt z13.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x45491ad6  // usublb z22.h, z22.b, z9.b\n"
+    ".inst 0x448043e5  // smlalb z5.s, p4/M, z31.h, z0.h\n"
+    ".inst 0x448047e7  // smlalt z7.s, p4/M, z31.h, z0.h\n"
+    ".inst 0x448043c6  // smlalb z6.s, p4/M, z30.h, z0.h\n"
+    ".inst 0x448047c8  // smlalt z8.s, p4/M, z30.h, z0.h\n"
+    "ld1sb { z0.h }, p4/Z, [x1, #7, MUL VL]\n"
+    "inch x1, ALL, MUL #8\n"
+    ".inst 0x4481432f  // smlalb z15.s, p4/M, z25.h, z1.h\n"
+    ".inst 0x44814732  // smlalt z18.s, p4/M, z25.h, z1.h\n"
+    "ld1b { z25.h }, p3/Z, [x13, x2]\n"
+    ".inst 0x44814315  // smlalb z21.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x454e1000  // ssublb z0.h, z0.b, z14.b\n"
+    ".inst 0x4481470d  // smlalt z13.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x45491b39  // usublb z25.h, z25.b, z9.b\n"
+    ".inst 0x448143c5  // smlalb z5.s, p4/M, z30.h, z1.h\n"
+    ".inst 0x448147c7  // smlalt z7.s, p4/M, z30.h, z1.h\n"
+    ".inst 0x44814346  // smlalb z6.s, p4/M, z26.h, z1.h\n"
+    ".inst 0x44814748  // smlalt z8.s, p4/M, z26.h, z1.h\n"
+    "ld1sb { z1.h }, p4/Z, [x1]\n"
+    ".inst 0x4482430f  // smlalb z15.s, p4/M, z24.h, z2.h\n"
+    ".inst 0x44824712  // smlalt z18.s, p4/M, z24.h, z2.h\n"
+    "ld1b { z24.h }, p3/Z, [x12, x2]\n"
+    ".inst 0x44824375  // smlalb z21.s, p4/M, z27.h, z2.h\n"
+    ".inst 0x454e1021  // ssublb z1.h, z1.b, z14.b\n"
+    ".inst 0x4482476d  // smlalt z13.s, p4/M, z27.h, z2.h\n"
+    ".inst 0x45491b18  // usublb z24.h, z24.b, z9.b\n"
+    ".inst 0x44824345  // smlalb z5.s, p4/M, z26.h, z2.h\n"
+    ".inst 0x44824747  // smlalt z7.s, p4/M, z26.h, z2.h\n"
+    ".inst 0x44824326  // smlalb z6.s, p4/M, z25.h, z2.h\n"
+    ".inst 0x44824728  // smlalt z8.s, p4/M, z25.h, z2.h\n"
+    "ld1sb { z2.h }, p4/Z, [x1, #1, MUL VL]\n"
+    ".inst 0x4483436f  // smlalb z15.s, p4/M, z27.h, z3.h\n"
+    ".inst 0x44834772  // smlalt z18.s, p4/M, z27.h, z3.h\n"
+    "ld1b { z27.h }, p3/Z, [x10, x2]\n"
+    ".inst 0x448342f5  // smlalb z21.s, p4/M, z23.h, z3.h\n"
+    ".inst 0x454e1042  // ssublb z2.h, z2.b, z14.b\n"
+    ".inst 0x448346ed  // smlalt z13.s, p4/M, z23.h, z3.h\n"
+    ".inst 0x45491b7b  // usublb z27.h, z27.b, z9.b\n"
+    ".inst 0x44834325  // smlalb z5.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x44834727  // smlalt z7.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x44834306  // smlalb z6.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x44834708  // smlalt z8.s, p4/M, z24.h, z3.h\n"
+    "ld1sb { z3.h }, p4/Z, [x1, #2, MUL VL]\n"
+    ".inst 0x448442ef  // smlalb z15.s, p4/M, z23.h, z4.h\n"
+    ".inst 0x448446f2  // smlalt z18.s, p4/M, z23.h, z4.h\n"
+    "ld1b { z23.h }, p3/Z, [x9, x2]\n"
+    ".inst 0x44844395  // smlalb z21.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x454e1063  // ssublb z3.h, z3.b, z14.b\n"
+    ".inst 0x4484478d  // smlalt z13.s, p4/M, z28.h, z4.h\n"
+    "ld1b { z28.h }, p3/Z, [x26, x2]\n"
+    ".inst 0x45491af7  // usublb z23.h, z23.b, z9.b\n"
+    ".inst 0x44844305  // smlalb z5.s, p4/M, z24.h, z4.h\n"
+    ".inst 0x44844707  // smlalt z7.s, p4/M, z24.h, z4.h\n"
+    ".inst 0x45491b9c  // usublb z28.h, z28.b, z9.b\n"
+    ".inst 0x448442c6  // smlalb z6.s, p4/M, z22.h, z4.h\n"
+    ".inst 0x448446c8  // smlalt z8.s, p4/M, z22.h, z4.h\n"
+    "ld1sb { z4.h }, p4/Z, [x1, #3, MUL VL]\n"
+    ".inst 0x448043ef  // smlalb z15.s, p4/M, z31.h, z0.h\n"
+    ".inst 0x448047f2  // smlalt z18.s, p4/M, z31.h, z0.h\n"
+    "ld1b { z31.h }, p3/Z, [x28, x2]\n"
+    ".inst 0x448043d5  // smlalb z21.s, p4/M, z30.h, z0.h\n"
+    ".inst 0x454e1084  // ssublb z4.h, z4.b, z14.b\n"
+    ".inst 0x448047cd  // smlalt z13.s, p4/M, z30.h, z0.h\n"
+    ".inst 0x45491bff  // usublb z31.h, z31.b, z9.b\n"
+    ".inst 0x44804365  // smlalb z5.s, p4/M, z27.h, z0.h\n"
+    ".inst 0x44804767  // smlalt z7.s, p4/M, z27.h, z0.h\n"
+    ".inst 0x448042e6  // smlalb z6.s, p4/M, z23.h, z0.h\n"
+    ".inst 0x448046e8  // smlalt z8.s, p4/M, z23.h, z0.h\n"
+    "ld1sb { z0.h }, p4/Z, [x1, #4, MUL VL]\n"
+    ".inst 0x448143cf  // smlalb z15.s, p4/M, z30.h, z1.h\n"
+    ".inst 0x448147d2  // smlalt z18.s, p4/M, z30.h, z1.h\n"
+    "ld1b { z30.h }, p3/Z, [x27, x2]\n"
+    ".inst 0x44814355  // smlalb z21.s, p4/M, z26.h, z1.h\n"
+    ".inst 0x454e1000  // ssublb z0.h, z0.b, z14.b\n"
+    ".inst 0x4481474d  // smlalt z13.s, p4/M, z26.h, z1.h\n"
+    ".inst 0x45491bde  // usublb z30.h, z30.b, z9.b\n"
+    ".inst 0x448142e5  // smlalb z5.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x448146e7  // smlalt z7.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x448143e6  // smlalb z6.s, p4/M, z31.h, z1.h\n"
+    ".inst 0x448147e8  // smlalt z8.s, p4/M, z31.h, z1.h\n"
+    "ld1sb { z1.h }, p4/Z, [x1, #5, MUL VL]\n"
+    ".inst 0x4482434f  // smlalb z15.s, p4/M, z26.h, z2.h\n"
+    ".inst 0x44824752  // smlalt z18.s, p4/M, z26.h, z2.h\n"
+    "ld1b { z26.h }, p3/Z, [x25, x2]\n"
+    ".inst 0x44824335  // smlalb z21.s, p4/M, z25.h, z2.h\n"
+    ".inst 0x454e1021  // ssublb z1.h, z1.b, z14.b\n"
+    ".inst 0x4482472d  // smlalt z13.s, p4/M, z25.h, z2.h\n"
+    ".inst 0x45491b5a  // usublb z26.h, z26.b, z9.b\n"
+    ".inst 0x448243e5  // smlalb z5.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448247e7  // smlalt z7.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448243c6  // smlalb z6.s, p4/M, z30.h, z2.h\n"
+    ".inst 0x448247c8  // smlalt z8.s, p4/M, z30.h, z2.h\n"
+    "ld1sb { z2.h }, p4/Z, [x1, #6, MUL VL]\n"
+    ".inst 0x4483432f  // smlalb z15.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x44834732  // smlalt z18.s, p4/M, z25.h, z3.h\n"
+    "ld1b { z25.h }, p3/Z, [x24, x2]\n"
+    ".inst 0x44834315  // smlalb z21.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x454e1042  // ssublb z2.h, z2.b, z14.b\n"
+    ".inst 0x4483470d  // smlalt z13.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x45491b39  // usublb z25.h, z25.b, z9.b\n"
+    ".inst 0x448343c5  // smlalb z5.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x448347c7  // smlalt z7.s, p4/M, z30.h, z3.h\n"
+    ".inst 0x44834386  // smlalb z6.s, p4/M, z28.h, z3.h\n"
+    ".inst 0x44834788  // smlalt z8.s, p4/M, z28.h, z3.h\n"
+    "ld1sb { z3.h }, p4/Z, [x1, #7, MUL VL]\n"
+    "inch x1, ALL, MUL #8\n"
+    ".inst 0x4484430f  // smlalb z15.s, p4/M, z24.h, z4.h\n"
+    ".inst 0x44844712  // smlalt z18.s, p4/M, z24.h, z4.h\n"
+    "ld1b { z24.h }, p3/Z, [x23, x2]\n"
+    ".inst 0x448442d5  // smlalb z21.s, p4/M, z22.h, z4.h\n"
+    ".inst 0x454e1063  // ssublb z3.h, z3.b, z14.b\n"
+    ".inst 0x448446cd  // smlalt z13.s, p4/M, z22.h, z4.h\n"
+    ".inst 0x45491b18  // usublb z24.h, z24.b, z9.b\n"
+    ".inst 0x44844385  // smlalb z5.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844787  // smlalt z7.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844346  // smlalb z6.s, p4/M, z26.h, z4.h\n"
+    ".inst 0x44844748  // smlalt z8.s, p4/M, z26.h, z4.h\n"
+    "ld1sb { z4.h }, p4/Z, [x1]\n"
+    "inch x1\n"
+    ".inst 0x4480436f  // smlalb z15.s, p4/M, z27.h, z0.h\n"
+    ".inst 0x44804772  // smlalt z18.s, p4/M, z27.h, z0.h\n"
+    "ld1b { z27.h }, p3/Z, [x22, x2]\n"
+    ".inst 0x448042f5  // smlalb z21.s, p4/M, z23.h, z0.h\n"
+    ".inst 0x454e1084  // ssublb z4.h, z4.b, z14.b\n"
+    ".inst 0x448046ed  // smlalt z13.s, p4/M, z23.h, z0.h\n"
+    ".inst 0x45491b7b  // usublb z27.h, z27.b, z9.b\n"
+    ".inst 0x44804325  // smlalb z5.s, p4/M, z25.h, z0.h\n"
+    ".inst 0x44804727  // smlalt z7.s, p4/M, z25.h, z0.h\n"
+    "ld1b { z25.h }, p3/Z, [x21, x2]\n"
+    ".inst 0x44804306  // smlalb z6.s, p4/M, z24.h, z0.h\n"
+    ".inst 0x44804708  // smlalt z8.s, p4/M, z24.h, z0.h\n"
+    ".inst 0x448142ef  // smlalb z15.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x45491b39  // usublb z25.h, z25.b, z9.b\n"
+    ".inst 0x448146f2  // smlalt z18.s, p4/M, z23.h, z1.h\n"
+    ".inst 0x448143f5  // smlalb z21.s, p4/M, z31.h, z1.h\n"
+    ".inst 0x448147ed  // smlalt z13.s, p4/M, z31.h, z1.h\n"
+    ".inst 0x44814305  // smlalb z5.s, p4/M, z24.h, z1.h\n"
+    ".inst 0x44814707  // smlalt z7.s, p4/M, z24.h, z1.h\n"
+    "ld1b { z24.h }, p3/Z, [x20, x2]\n"
+    ".inst 0x44814366  // smlalb z6.s, p4/M, z27.h, z1.h\n"
+    ".inst 0x44814768  // smlalt z8.s, p4/M, z27.h, z1.h\n"
+    ".inst 0x448243ef  // smlalb z15.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x45491b18  // usublb z24.h, z24.b, z9.b\n"
+    ".inst 0x448247f2  // smlalt z18.s, p4/M, z31.h, z2.h\n"
+    ".inst 0x448243d5  // smlalb z21.s, p4/M, z30.h, z2.h\n"
+    ".inst 0x448247cd  // smlalt z13.s, p4/M, z30.h, z2.h\n"
+    ".inst 0x44824365  // smlalb z5.s, p4/M, z27.h, z2.h\n"
+    ".inst 0x44824767  // smlalt z7.s, p4/M, z27.h, z2.h\n"
+    "ld1b { z27.h }, p3/Z, [x19, x2]\n"
+    "inch x2\n"
+    ".inst 0x44824326  // smlalb z6.s, p4/M, z25.h, z2.h\n"
+    "whilelt p2.s, x2, x0\n"
+    ".inst 0x44824728  // smlalt z8.s, p4/M, z25.h, z2.h\n"
+    "mov x19, x2\n"
+    ".inst 0x448343cf  // smlalb z15.s, p4/M, z30.h, z3.h\n"
+    "incw x19\n"
+    ".inst 0x45491b7b  // usublb z27.h, z27.b, z9.b\n"
+    "whilelt p1.s, x19, x0\n"
+    ".inst 0x448347d2  // smlalt z18.s, p4/M, z30.h, z3.h\n"
+    "whilelt p3.h, x2, x0\n"
+    ".inst 0x44834395  // smlalb z21.s, p4/M, z28.h, z3.h\n"
+    ".inst 0x4483478d  // smlalt z13.s, p4/M, z28.h, z3.h\n"
+    ".inst 0x44834325  // smlalb z5.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x44834727  // smlalt z7.s, p4/M, z25.h, z3.h\n"
+    ".inst 0x44834306  // smlalb z6.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x44834708  // smlalt z8.s, p4/M, z24.h, z3.h\n"
+    ".inst 0x4484438f  // smlalb z15.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844792  // smlalt z18.s, p4/M, z28.h, z4.h\n"
+    ".inst 0x44844355  // smlalb z21.s, p4/M, z26.h, z4.h\n"
+    ".inst 0x4484474d  // smlalt z13.s, p4/M, z26.h, z4.h\n"
+    ".inst 0x04aa75ef  // sqrdmulh z15.s, z15.s, z10.s\n"
+    ".inst 0x04b47652  // sqrdmulh z18.s, z18.s, z20.s\n"
+    ".inst 0x04aa76b5  // sqrdmulh z21.s, z21.s, z10.s\n"
+    ".inst 0x04b475ad  // sqrdmulh z13.s, z13.s, z20.s\n"
+    "and z28.d, z15.d, z29.d\n"
+    "and z26.d, z18.d, z19.d\n"
+    "and z16.d, z21.d, z29.d\n"
+    "asr z28.s, z28.s, #0x1f\n"
+    "asr z26.s, z26.s, #0x1f\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "sqadd z15.s, z15.s, z28.s\n"
+    "sqadd z18.s, z18.s, z26.s\n"
+    "sqadd z21.s, z21.s, z16.s\n"
+    "and z16.d, z13.d, z19.d\n"
+    ".inst 0x44844305  // smlalb z5.s, p4/M, z24.h, z4.h\n"
+    ".inst 0x44844707  // smlalt z7.s, p4/M, z24.h, z4.h\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    ".inst 0x44844366  // smlalb z6.s, p4/M, z27.h, z4.h\n"
+    ".inst 0x04aa74a5  // sqrdmulh z5.s, z5.s, z10.s\n"
+    "sqadd z13.s, z13.s, z16.s\n"
+    ".inst 0x04b474e7  // sqrdmulh z7.s, z7.s, z20.s\n"
+    ".inst 0x04aa74c6  // sqrdmulh z6.s, z6.s, z10.s\n"
+    "and z16.d, z5.d, z29.d\n"
+    ".inst 0x44844768  // smlalt z8.s, p4/M, z27.h, z4.h\n"
+    "and z25.d, z7.d, z19.d\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "and z26.d, z6.d, z29.d\n"
+    "asr z25.s, z25.s, #0x1f\n"
+    "sqadd z5.s, z5.s, z16.s\n"
+    "asr z26.s, z26.s, #0x1f\n"
+    "sqadd z7.s, z7.s, z25.s\n"
+    ".inst 0x04b47508  // sqrdmulh z8.s, z8.s, z20.s\n"
+    "sqadd z6.s, z6.s, z26.s\n"
+    ".inst 0x448293af  // srshl z15.s, p4/M, z15.s, z29.s\n"
+    ".inst 0x44829272  // srshl z18.s, p4/M, z18.s, z19.s\n"
+    "and z16.d, z8.d, z19.d\n"
+    ".inst 0x448293b5  // srshl z21.s, p4/M, z21.s, z29.s\n"
+    "add z15.s, z15.s, z17.s\n"
+    "add z18.s, z18.s, z17.s\n"
+    "asr z16.s, z16.s, #0x1f\n"
+    "add z21.s, z21.s, z17.s\n"
+    "smin z15.s, p4/M, z15.s, z11.s\n"
+    "sqadd z8.s, z8.s, z16.s\n"
+    "smin z18.s, p4/M, z18.s, z11.s\n"
+    "smin z21.s, p4/M, z21.s, z11.s\n"
+    "smax z15.s, p4/M, z15.s, z12.s\n"
+    ".inst 0x4482926d  // srshl z13.s, p4/M, z13.s, z19.s\n"
+    "smax z18.s, p4/M, z18.s, z12.s\n"
+    "smax z21.s, p4/M, z21.s, z12.s\n"
+    ".inst 0x448293a5  // srshl z5.s, p4/M, z5.s, z29.s\n"
+    "add z13.s, z13.s, z17.s\n"
+    "trn1 z15.h, z15.h, z18.h\n"
+    "st1b { z15.h }, p0, [x7, x3]\n"
+    "add z5.s, z5.s, z17.s\n"
+    "smin z13.s, p4/M, z13.s, z11.s\n"
+    ".inst 0x44829267  // srshl z7.s, p4/M, z7.s, z19.s\n"
+    ".inst 0x448293a6  // srshl z6.s, p4/M, z6.s, z29.s\n"
+    "smin z5.s, p4/M, z5.s, z11.s\n"
+    "smax z13.s, p4/M, z13.s, z12.s\n"
+    "add z7.s, z7.s, z17.s\n"
+    "add z6.s, z6.s, z17.s\n"
+    "smax z5.s, p4/M, z5.s, z12.s\n"
+    "trn1 z21.h, z21.h, z13.h\n"
+    "st1b { z21.h }, p0, [x8, x3]\n"
+    "smin z7.s, p4/M, z7.s, z11.s\n"
+    "smin z6.s, p4/M, z6.s, z11.s\n"
+    ".inst 0x44829268  // srshl z8.s, p4/M, z8.s, z19.s\n"
+    "smax z7.s, p4/M, z7.s, z12.s\n"
+    "smax z6.s, p4/M, z6.s, z12.s\n"
+    "add z8.s, z8.s, z17.s\n"
+    "trn1 z5.h, z5.h, z7.h\n"
+    "st1b { z5.h }, p0, [x17, x3]\n"
+    "smin z8.s, p4/M, z8.s, z11.s\n"
+    "smax z8.s, p4/M, z8.s, z12.s\n"
+    "trn1 z6.h, z6.h, z8.h\n"
+    "st1b { z6.h }, p0, [x16, x3]\n"
+    "inch x3\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z4.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z15.s, z4.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z18.s, z4.s, z16.s\n"
+    "mov z21.d, z15.d\n"
+    "ld1sb { z0.h }, p4/Z, [x1]\n"
+    "mov z5.d, z15.d\n"
+    "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n"
+    "mov z13.d, z18.d\n"
+    "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n"
+    "mov z7.d, z18.d\n"
+    "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n"
+    "mov z6.d, z15.d\n"
+    "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n"
+    "mov z8.d, z18.d\n"
+    "ldp x28, x27, [x5, #0x0]\n"
+    ".inst 0x454e1000  // ssublb z0.h, z0.b, z14.b\n"
+    "ldp x26, x25, [x5, #0x10]\n"
+    ".inst 0x454e1021  // ssublb z1.h, z1.b, z14.b\n"
+    ".inst 0x454e1042  // ssublb z2.h, z2.b, z14.b\n"
+    "ldp x24, x23, [x5, #0x20]\n"
+    ".inst 0x454e1063  // ssublb z3.h, z3.b, z14.b\n"
+    ".inst 0x454e1084  // ssublb z4.h, z4.b, z14.b\n"
+    "ldp x22, x21, [x5, #0x30]\n"
+    "ldp x20, x19, [x5, #0x40]\n"
+    "ld1b { z31.h }, p3/Z, [x28, x2]\n"
+    ".inst 0x45491bff  // usublb z31.h, z31.b, z9.b\n"
+    "ld1b { z30.h }, p3/Z, [x27, x2]\n"
+    "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+    ".inst 0x45491bde  // usublb z30.h, z30.b, z9.b\n"
+    "ld1b { z28.h }, p3/Z, [x25, x2]\n"
+    ".inst 0x45491bbd  // usublb z29.h, z29.b, z9.b\n"
+    "ld1b { z27.h }, p3/Z, [x24, x2]\n"
+    "ld1b { z23.h }, p3/Z, [x23, x2]\n"
+    ".inst 0x45491b9c  // usublb z28.h, z28.b, z9.b\n"
+    "ld1b { z25.h }, p3/Z, [x22, x2]\n"
+    "ld1b { z24.h }, p3/Z, [x21, x2]\n"
+    ".inst 0x45491b7b  // usublb z27.h, z27.b, z9.b\n"
+    ".inst 0x45491af7  // usublb z23.h, z23.b, z9.b\n"
+    "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+    "ld1b { z22.h }, p3/Z, [x19, x2]\n"
+    ".inst 0x45491b39  // usublb z25.h, z25.b, z9.b\n"
+    ".inst 0x45491b18  // usublb z24.h, z24.b, z9.b\n"
+    ".inst 0x45491b5a  // usublb z26.h, z26.b, z9.b\n"
+    ".inst 0x45491ad6  // usublb z22.h, z22.b, z9.b\n"
+    "b.any 1b\n"
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+}  // namespace depthwise
+}  // namespace arm_conv
+
+#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index 6d483a3..1269ef6 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -175,6 +175,18 @@
 #endif
 }
 
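+// Return the number of elements of type T which fit in a single vector
+// register: the runtime SVE vector length when SVE is enabled, otherwise a
+// 128-bit NEON register.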
+template <typename T>
+inline unsigned long get_vector_length(VLType vl_type) {
+  switch (vl_type) {
+#ifdef __ARM_FEATURE_SVE
+    case VLType::SVE:
+      return get_vector_length_sz<sizeof(T)>();
+#endif
+    default:
+      return 16 / sizeof(T);
+  }
+}
+
 } // utils namespace
 } // arm_gemm namespace
 
diff --git a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
deleted file mode 100644
index a956898..0000000
--- a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-#define SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-
-#include "src/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** This class is a wrapper for the depthwise convolution assembly kernels.  */
-class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDepthwiseConvolutionAssemblyKernelWrapper";
-    }
-
-    /** Default constructor */
-    NEDepthwiseConvolutionAssemblyKernelWrapper()
-        : _kernel(nullptr)
-    {
-    }
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionAssemblyKernelWrapper(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
-    /** Default Move Constructor. */
-    NEDepthwiseConvolutionAssemblyKernelWrapper(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
-    /** Default move assignment operator */
-    NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in] kernel Pointer to an assembly kernel implementation.
-     */
-    void configure(depthwise::IDepthwiseConvolution *kernel)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
-        _kernel = kernel;
-        Window win;
-        win.set(Window::DimX, Window::Dimension(0, _kernel->get_window(), 1));
-        INEKernel::configure(win);
-    }
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
-        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-        auto first = window.x().start();
-        auto last  = window.x().end();
-        _kernel->run(first, last, info.thread_id);
-    }
-
-private:
-    depthwise::IDepthwiseConvolution *_kernel;
-};
-} // namespace arm_compute
-#endif /* SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H */
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/assembly/common.hpp
similarity index 75%
rename from src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp
rename to src/core/NEON/kernels/assembly/common.hpp
index c13dd70..d82d11c 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp
+++ b/src/core/NEON/kernels/assembly/common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "impl_fp32_fp32.hpp"
 
-namespace depthwise
+#pragma once
+
+namespace arm_conv
 {
-template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>;
-template class DepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>;
-template class DepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>;
-}  // namespace depthwise
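+// Padding applied to each edge of a tensor; shared by the depthwise and
+// pooling assembly interfaces.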
+struct PaddingValues
+{
+    unsigned int left, top, right, bottom;
+};
+
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
new file mode 100644
index 0000000..eadf48d
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "arm_gemm_local.hpp"
+#include "depthwise_common.hpp"
+
+namespace arm_conv
+{
+namespace depthwise
+{
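+// Hints used to steer the selection of a depthwise implementation.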
+struct DepthwiseConfig
+{
+    DepthwiseMethod method = DepthwiseMethod::DEFAULT;
+    std::string     filter = "";
+
+    DepthwiseConfig(DepthwiseMethod method)
+        : method(method) {}
+    DepthwiseConfig() {}
+};
+
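+// Complete description of a depthwise convolution problem, passed to the
+// engine at construction time.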
+struct DepthwiseArgs
+{
+    const CPUInfo *cpu_info;
+
+    unsigned int kernel_rows, kernel_cols;
+    unsigned int stride_rows, stride_cols;
+
+    unsigned int n_batches, input_rows, input_cols, input_channels;
+    unsigned int output_rows, output_cols;
+    unsigned int channel_multiplier;
+
+    PaddingValues padding;
+
+    arm_gemm::Activation activation;
+
+    const DepthwiseConfig *config;
+
+    DepthwiseArgs(
+        const CPUInfo *cpu_info,
+        unsigned int kernel_rows, unsigned int kernel_cols,
+        unsigned int stride_rows, unsigned int stride_cols,
+        unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
+        unsigned int input_channels,
+        unsigned int output_rows, unsigned int output_cols,
+        unsigned int  channel_multiplier,
+        PaddingValues padding, arm_gemm::Activation activation,
+        const DepthwiseConfig *config)
+        : cpu_info(cpu_info), kernel_rows(kernel_rows), kernel_cols(kernel_cols), stride_rows(stride_rows), stride_cols(stride_cols), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
+          input_channels(input_channels), output_rows(output_rows), output_cols(output_cols), channel_multiplier(channel_multiplier), padding(padding), activation(activation), config(config)
+    {
+    }
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthwiseCommon : public IDepthwiseCommon
+{
+protected:
+    const DepthwiseArgs m_args; // Copy of arguments
+
+public:
+    DepthwiseCommon(const DepthwiseArgs &args)
+        : m_args(args) {}
+    DepthwiseCommon(DepthwiseCommon &) = delete;
+    DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
+
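+    // Execute the convolution, assuming densely-packed NHWC input and output
+    // tensors.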
+    void execute(
+        const void *const  input,
+        const void *const  parameters,
+        void *const        output,
+        void *const        working_space,
+        const unsigned int thread_id,
+        const unsigned int n_threads) const override
+    {
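+        // Derive dense NHWC strides from the problem description, then defer
+        // to the strided overload.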
+        const size_t ld_input_col    = m_args.input_channels;
+        const size_t ld_input_row    = ld_input_col * m_args.input_cols;
+        const size_t ld_input_batch  = ld_input_row * m_args.input_rows;
+        const size_t ld_output_col   = m_args.input_channels * m_args.channel_multiplier;
+        const size_t ld_output_row   = ld_output_col * m_args.output_cols;
+        const size_t ld_output_batch = ld_output_row * m_args.output_rows;
+
+        execute(
+            input, ld_input_col, ld_input_row, ld_input_batch,
+            parameters, output, ld_output_col, ld_output_row, ld_output_batch,
+            working_space, thread_id, n_threads);
+    }
+
+    void execute(
+        const void *const  input,
+        size_t             ld_input_col,
+        size_t             ld_input_row,
+        size_t             ld_input_batch,
+        const void *const  parameters,
+        void *const        output,
+        size_t             ld_output_col,
+        size_t             ld_output_row,
+        size_t             ld_output_batch,
+        void *const        working_space,
+        const unsigned int thread_id,
+        const unsigned int n_threads) const override
+    {
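+        // Supply the problem size captured at construction and defer to the
+        // fully-specified overload.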
+        execute(
+            m_args.n_batches, m_args.input_rows, m_args.input_cols,
+            m_args.input_channels, m_args.padding,
+            input, ld_input_col, ld_input_row, ld_input_batch,
+            parameters,
+            m_args.output_rows, m_args.output_cols,
+            output, ld_output_col, ld_output_row, ld_output_batch,
+            working_space, thread_id, n_threads);
+    }
+
+    virtual void execute(
+        unsigned int batches,
+        unsigned int input_height,
+        unsigned int input_width,
+        unsigned int channels,
+        const PaddingValues &,
+        const void *input,
+        size_t       ld_input_col,
+        size_t       ld_input_row,
+        size_t       ld_input_batch,
+        const void *parameters,
+        unsigned int output_height,
+        unsigned int output_width,
+        void        *output,
+        size_t       ld_output_col,
+        size_t       ld_output_row,
+        size_t       ld_output_batch,
+        void        *working_space,
+        unsigned int thread_id,
+        unsigned int n_threads) const override = 0;
+};
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput>
+using UniqueDepthwiseCommon = std::unique_ptr<DepthwiseCommon<TInput, TWeight, TOutput>>;
+
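+// Determine which depthwise implementation would be selected for the given
+// arguments.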
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+KernelDescription get_depthwise_method(const DepthwiseArgs &, const OutputStage & = {});
+
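+// Construct a depthwise convolution engine appropriate for the given
+// arguments.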
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &, const OutputStage & = {});
+
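+// List all kernels compatible with the given problem, along with their cycle
+// estimates.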
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+std::vector<KernelDescription> get_compatible_kernels(const DepthwiseArgs &, const OutputStage & = {});
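+
+// A minimal usage sketch (illustrative only; the argument values and the
+// surrounding buffers are placeholders, not part of this interface):
+//
+//   DepthwiseArgs args(cpu_info, 3, 3, 1, 1, n_batches, in_rows, in_cols,
+//                      n_channels, out_rows, out_cols, 1, padding, act, nullptr);
+//   auto dwc = depthwise<float>(args);
+//   std::vector<uint8_t> params(dwc->get_storage_size());
+//   dwc->pack_parameters(params.data(), bias_ptr, weight_ptr);
+//   std::vector<uint8_t> scratch(dwc->get_working_size(1, n_channels));
+//   dwc->execute(in_ptr, params.data(), out_ptr, scratch.data(), 0, 1);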
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
new file mode 100644
index 0000000..52963ab
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "common.hpp"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+using arm_gemm::Nothing;
+
+enum class DepthwiseMethod
+{
+    DEFAULT,
+    DEPTHFIRST,
+    PLANAR,
+};
+
+struct KernelDescription
+{
+    DepthwiseMethod method         = DepthwiseMethod::DEFAULT;
+    std::string     name           = "";
+    bool            is_default     = false;
+    uint64_t        cycle_estimate = 0;
+
+    KernelDescription(
+        DepthwiseMethod method,
+        std::string     name,
+        bool            is_default,
+        uint64_t        cycle_estimate)
+        : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate)
+    {
+    }
+
+    KernelDescription() noexcept {}
+};
+
+class IDepthwiseCommon
+{
+public:
+    virtual ~IDepthwiseCommon() = default;
+
+    // Determine the amount of storage space required for the rearranged weights
+    // and bias.
+    virtual size_t get_storage_size(void) const = 0;
+
+    // Rearrange the weights and biases into a storage buffer.
+    // Accepts a pointer to a buffer into which to store the packed parameters, a
+    // pointer to the bias vector (which may be nullptr in the case of no bias) and
+    // a pointer to the array of weights (stored in HWIO order).
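+    // Leading dimensions of zero (the default) are assumed to denote a
+    // densely-packed weight array.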
+    virtual void pack_parameters(
+        void       *buffer,
+        const void *biases,
+        const void *weights,
+        size_t      ld_weight_col = 0,
+        size_t      ld_weight_row = 0) = 0;
+
+    // Determine the amount of working space required
+    virtual size_t get_working_size(unsigned int n_threads, unsigned int n_input_channels) const = 0;
+
+    // Execute the convolution over the specified area of memory.
+    virtual void execute(
+        const void *input,       // Pointer to input tensor
+        const void *parameters,  // Packed parameters buffer
+        void        *output,
+        void        *working_space,
+        unsigned int thread_id,
+        unsigned int n_threads) const = 0;
+
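+    // As above, but with explicit leading dimensions (in elements) for the
+    // input and output tensors.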
+    virtual void execute(
+        const void *input,
+        size_t       ld_input_col,
+        size_t       ld_input_row,
+        size_t       ld_input_batch,
+        const void *parameters,
+        void        *output,
+        size_t       ld_output_col,
+        size_t       ld_output_row,
+        size_t       ld_output_batch,
+        void        *working_space,
+        unsigned int thread_id,
+        unsigned int n_threads) const = 0;
+
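+    // As above, but additionally overriding the problem size captured at
+    // construction.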
+    virtual void execute(
+        unsigned int batches,
+        unsigned int input_height,
+        unsigned int input_width,
+        unsigned int channels,
+        const PaddingValues &,
+        const void *input,
+        size_t       ld_input_col,
+        size_t       ld_input_row,
+        size_t       ld_input_batch,
+        const void *parameters,
+        unsigned int output_height,
+        unsigned int output_width,
+        void        *output,
+        size_t       ld_output_col,
+        size_t       ld_output_row,
+        size_t       ld_output_batch,
+        void        *working_space,
+        unsigned int thread_id,
+        unsigned int n_threads) const = 0;
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp
index fdc18ae..b6a0a0a 100644
--- a/src/core/NEON/kernels/assembly/pool_common.hpp
+++ b/src/core/NEON/kernels/assembly/pool_common.hpp
@@ -24,9 +24,7 @@
 
 #pragma once
 
-#ifdef CYCLE_PROFILING
-#include "profiler.hpp"
-#endif // CYCLE_PROFILING
+#include "common.hpp"
 
 namespace arm_conv
 {
@@ -55,11 +53,6 @@
     unsigned int rows, cols;
 };
 
-struct PaddingValues
-{
-    unsigned int left, top, right, bottom;
-};
-
 class IPoolingCommon
 {
 public:
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp
deleted file mode 100644
index 70d6689..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <arm_neon.h>
-#include "activation.hpp"
-#include "padding.hpp"
-
-namespace depthwise
-{
-
-namespace nck = neon_convolution_kernels;
-
-class IDepthwiseConvolution
-{
-  public:
-    virtual ~IDepthwiseConvolution() = default;
-
-    virtual int output_size(
-      int dim_size,
-      unsigned int padding_before,
-      unsigned int padding_after
-    ) const = 0;
-
-    /* Set input tensor and stride. */
-    virtual void set_input(const void *inptr) = 0;
-    virtual void set_input(const void *inptr, int column_stride) = 0;
-    virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0;
-    virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0;
-
-    /* Set output tensor and stride. */
-    virtual void set_output(void *outptr) = 0;
-    virtual void set_output(void *outptr, int column_stride) = 0;
-    virtual void set_output(void *outptr, int row_stride, int column_stride) = 0;
-    virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0;
-
-    /* Weights and biases are re-ordered to improve memory access patterns. Use
-     * these methods to determine the size of the re-pack buffer and to set the
-     * address (and implicitly reorder the weights and biases into) the buffer.
-     */
-    virtual size_t get_packed_params_size(void) const = 0;
-    virtual void set_packed_params_buffer(void *) = 0;
-
-    virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0;
-    virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0;
-    virtual void pack_params(
-      void *buffer,
-      const void* weights,
-      unsigned int weight_row_stride,
-      unsigned int weight_col_stride,
-      const void *biases=nullptr
-    ) const = 0;
-
-    /* Working space is used to pad tensors on the fly. Before running any
-     * inference check the amount of space required, allocate and provide a
-     * pointer to the convolution engine.
-     */
-    virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
-    virtual void set_working_space(void *) = 0;
-
-    virtual unsigned int get_window(void) const = 0;
-    virtual void run(
-      unsigned int start,
-      unsigned int stop,
-      unsigned int threadid=0
-    ) = 0;
-};
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename TIn, typename TBias, typename TOut,
-  typename Derived
->
-class DepthwiseConvolutionBase : public IDepthwiseConvolution
-{
-  public:
-    // Information about the specific convolution instance
-    using InputType = TIn;
-    using BiasType = TBias;
-    using OutputType = TOut;
-    static constexpr int output_tile_rows = OutputTileRows;
-    static constexpr int output_tile_cols = OutputTileCols;
-    static constexpr int kernel_rows = KernelRows;
-    static constexpr int kernel_cols = KernelCols;
-    static constexpr int stride_rows = StrideRows;
-    static constexpr int stride_cols = StrideCols;
-    static constexpr int inner_tile_rows = stride_rows * (output_tile_rows - 1) + kernel_rows;
-    static constexpr int inner_tile_cols = stride_cols * (output_tile_cols - 1) + kernel_cols;
-
-    /** Create a new depthwise convolution engine.
-     *
-     * @param[in] n_batches Number of batches tensors.
-     * @param[in] n_input_rows Number of rows in input tensor.
-     * @param[in] n_input_cols Number of columns in input tensor.
-     * @param[in] n_channels Number of channels in input and output tensors.
-     */
-    DepthwiseConvolutionBase(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    /** Create a new depthwise convolution engine.
-     *
-     * @param[in] n_batches Number of batches tensors.
-     * @param[in] n_input_rows Number of rows in input tensor.
-     * @param[in] n_input_cols Number of columns in input tensor.
-     * @param[in] n_channels Number of channels in input and output tensors.
-     */
-    DepthwiseConvolutionBase(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    // Cannot copy or move a DepthwiseConvolution.
-    DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
-    DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;
-
-    /* Set input tensor and stride. */
-    void set_input(const void *inptr) override;
-    void set_input(const void *inptr, int column_stride) override;
-    void set_input(const void *inptr, int row_stride, int column_stride) override;
-    void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
-
-    /* Set output tensor and stride. */
-    void set_output(void *outptr) override;
-    void set_output(void *outptr, int column_stride) override;
-    void set_output(void *outptr, int row_stride, int column_stride) override;
-    void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
-
-    /** Get the number of output rows/columns.
-     *
-     * @param[in] dim_size Number of elements in the dimension (rows/columns)
-     * @param[in] same_padding True if the padding is SAME, otherwise false.
-     */
-    static int get_output_size(
-      int dim_size, unsigned int padding_before, unsigned int padding_after
-    );
-
-    int output_size(
-      int dim_size, unsigned int padding_before, unsigned int padding_after
-    ) const override;
-
-    /* Determine how much memory is required to store the packed weights and
-     * biases.
-     */
-    size_t get_packed_params_size(void) const override;
-
-    /* Set the buffer for the packed weights and biases, and perform the
-     * packing.
-     */
-    void set_packed_params_buffer(void *buffer) override;
-
-    void pack_params(const void *weights, const void *biases=nullptr) const override;
-
-    void pack_params(
-      void *buffer,
-      const void *weights,
-      const void *biases=nullptr
-    ) const override;
-
-    void pack_params(
-      void *buffer,
-      const void *weights,
-      unsigned int weight_row_stride,
-      unsigned int weight_col_stride,
-      const void *biases=nullptr
-    ) const override;
-
-    /** Query the amount of working space required.
-     * @param[in] The largest number of threads which will be used to execute
-     *            the kernel.
-     */
-    size_t get_working_space_size(unsigned int n_threads=1) const override;
-
-    /** Set the working space buffer.
-     */
-    void set_working_space(void *buffer) override;
-
-    /** Get the window of work to be performed by an instance of the operator.
-     */
-    unsigned int get_window(void) const override;
-
-    /** Perform a portion of the work associated with the operator.
-     *
-     * Will perform the window of work described by $[start, stop)$.
-     *
-     * @param[in] start Start of the window of work to perform.
-     * @param[in] stop End of the work to perform.
-     * @param[in] ID of the thread performing the work.
-     */
-    void run(
-      unsigned int start,
-      unsigned int stop,
-      unsigned int threadid=0
-    ) override;
-
-  protected:
-    /** Get the value to use to pad the tensor.
-     */
-    TIn _input_padding_value(void) const;
-
-    /** Implementation of the parameter packing.
-     */
-    void _pack_params(
-      void *buffer,
-      const void *weights,
-      unsigned int weight_row_stride,
-      unsigned int weight_col_stride,
-      const void *biases=nullptr
-    ) const;
-
-    /** Process a tile-row of the tensors.
-     */
-    void process_tile_row(
-      unsigned int threadid,
-      int n_channels,
-      const void* packed_params,
-      const InputType* inptr,
-      OutputType* outptr,
-      int row_pad_in_top,
-      int row_pad_in_left,
-      int row_pad_in_bottom,
-      int row_pad_out_bottom,
-      int n_tiles,
-      int n_input_cols,
-      int n_output_cols
-    );
-
-    /** Process a single tile of the tensor.
-     *
-     * This method will apply input/output padding (if required) and call the
-     * depthwise tile implementation.
-     */
-    void process_tile(
-      unsigned int threadid,
-      int n_channels,
-      const void* packed_params,
-      const InputType* inptr,
-      OutputType* outptr,
-      int pad_in_top,
-      int pad_in_left,
-      int pad_in_bottom,
-      int pad_in_right,
-      int pad_out_bottom,
-      int pad_out_right
-    );
-
-    /** Perform depthwise convolution on a single tile.
-     */
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const InputType* inptr,
-      unsigned int in_row_stride,
-      unsigned int in_col_stride,
-      OutputType* outptr,
-      unsigned int out_row_stride,
-      unsigned int out_col_stride
-    );
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const InputType* inptrs[inner_tile_rows][inner_tile_cols],
-      OutputType* outptrs[output_tile_rows][output_tile_cols]
-    );
-
-    int n_channels(void) const;
-
-  private:
-    // Member variables of instances of a convolution engine.
-    const InputType* _input;
-    OutputType* _output;
-    void* _packed_parameters;
-    void* _working_space;  // Per-thread working space
-    const int _n_batches, _n_input_rows, _n_input_cols, _n_channels,
-              _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols;
-    const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right;
-    const nck::ActivationFunction _activation;
-
-    // Stride information for a convolution instance
-    int _input_col_stride, _input_row_stride, _input_batch_stride;
-    int _output_col_stride, _output_row_stride, _output_batch_stride;
-
-    // Methods for getting access to working space
-    size_t _get_input_working_space_size(void) const;
-    size_t _get_output_working_space_size(void) const;
-
-    void *_get_input_working_space(unsigned int threadid) const;
-    void *_get_output_working_space(unsigned int threadid) const;
-};
-
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename TIn, typename TBias, typename TOut
->
-class DepthwiseConvolution : public DepthwiseConvolutionBase<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  TIn, TBias, TOut,
-  DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    TIn, TBias, TOut
-  >
->
-{
-  using Base = DepthwiseConvolutionBase<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    TIn, TBias, TOut,
-    DepthwiseConvolution<
-      OutputTileRows, OutputTileCols,
-      KernelRows, KernelCols,
-      StrideRows, StrideCols,
-      TIn, TBias, TOut
-  > >;
-  friend Base;
-  using InputType = typename Base::InputType;
-  using OutputType = typename Base::OutputType;
-
-  public:
-    using Base::DepthwiseConvolutionBase;
-
-  protected:
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const TIn* inptr,
-      unsigned int in_row_stride,
-      unsigned int in_col_stride,
-      TOut* outptr,
-      unsigned int out_row_stride,
-      unsigned int out_col_stride
-    );
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const InputType* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-      OutputType* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-    );
-};
-
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-class DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  float, float, float
-> : public DepthwiseConvolutionBase<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  float, float, float,
-  DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    float, float, float
-  >
->
-{
-  using Base = DepthwiseConvolutionBase<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    float, float, float,
-    DepthwiseConvolution<
-      OutputTileRows, OutputTileCols,
-      KernelRows, KernelCols,
-      StrideRows, StrideCols,
-      float, float, float
-  > >;
-  friend Base;
-  using InputType = typename Base::InputType;
-  using OutputType = typename Base::OutputType;
-
-  public:
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-  protected:
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const float* inptr,
-      unsigned int in_row_stride,
-      unsigned int in_col_stride,
-      float* outptr,
-      unsigned int out_row_stride,
-      unsigned int out_col_stride
-    );
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const float* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-      float* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-    );
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-class DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  float16_t, float16_t, float16_t
-> : public DepthwiseConvolutionBase<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  float16_t, float16_t, float16_t,
-  DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    float16_t, float16_t, float16_t
-  >
->
-{
-  using Base = DepthwiseConvolutionBase<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    float16_t, float16_t, float16_t,
-    DepthwiseConvolution<
-      OutputTileRows, OutputTileCols,
-      KernelRows, KernelCols,
-      StrideRows, StrideCols,
-      float16_t, float16_t, float16_t
-  > >;
-  friend Base;
-  using InputType = typename Base::InputType;
-  using OutputType = typename Base::OutputType;
-
-  public:
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-  protected:
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const float16_t* inptr,
-      unsigned int in_row_stride,
-      unsigned int in_col_stride,
-      float16_t* outptr,
-      unsigned int out_row_stride,
-      unsigned int out_col_stride
-    );
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const float16_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-      float16_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-    );
-};
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
deleted file mode 100644
index 864c6e2..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ /dev/null
@@ -1,1168 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x26, %[inptr0], %[input_row_stride]\n"
-    "add x21, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x23, %[outptr0], %[output_row_stride]\n"
-    "add x27, x26, %[input_row_stride]\n"
-    "add x22, x21, %[input_col_stride1]\n"
-    "and x24, %[n_channels], #3\n"
-    "add x28, x27, %[input_row_stride]\n"
-    "lsr x25, %[n_channels], #2\n"
-    "cbz x25, 4f\n"
-    "1:\n"
-    "ldr q15, [%[wbptr]]\n"
-    "subs x25, x25, #1\n"
-    "mov v3.16b, v15.16b\n"
-    "ldr q14, [%[wbptr], #16]\n"
-    "mov v1.16b, v15.16b\n"
-    "ldr q13, [%[wbptr], #32]\n"
-    "mov v2.16b, v15.16b\n"
-    "ldr q12, [%[wbptr], #48]\n"
-    "mov v0.16b, v15.16b\n"
-    "ldr q11, [%[wbptr], #64]\n"
-    "ldr q10, [%[wbptr], #80]\n"
-    "ldr q9, [%[wbptr], #96]\n"
-    "ldr q8, [%[wbptr], #112]\n"
-    "ldr q7, [%[wbptr], #128]\n"
-    "ldr q6, [%[wbptr], #144]\n"
-    "ldr q24, [%[inptr0]]\n"
-    "fmla v3.4s, v24.4s, v14.4s\n"
-    "ldr q22, [x26]\n"
-    "fmla v1.4s, v22.4s, v14.4s\n"
-    "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v2.4s, v19.4s, v14.4s\n"
-    "ldr q18, [x27]\n"
-    "fmla v3.4s, v22.4s, v11.4s\n"
-    "ldr q21, [x26, %[input_col_stride1]]\n"
-    "fmla v1.4s, v18.4s, v11.4s\n"
-    "ldr q17, [%[inptr0], x21]\n"
-    "ldr q20, [x28]\n"
-    "ldr q5, [x27, %[input_col_stride1]]\n"
-    "fmla v3.4s, v19.4s, v13.4s\n"
-    "fmla v3.4s, v18.4s, v8.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v3.4s, v21.4s, v10.4s\n"
-    "ldr q19, [x26, x21]\n"
-    "fmla v1.4s, v21.4s, v13.4s\n"
-    "ldr q23, [%[inptr0], x22]\n"
-    "fmla v2.4s, v21.4s, v11.4s\n"
-    "ldr q22, [x28, %[input_col_stride1]]\n"
-    "fmla v0.4s, v21.4s, v14.4s\n"
-    "ldr q21, [x27, x21]\n"
-    "fmla v3.4s, v17.4s, v12.4s\n"
-    "ldr q18, [x26, x22]\n"
-    "fmla v2.4s, v17.4s, v13.4s\n"
-    "ldr q16, [x28, x21]\n"
-    "fmla v1.4s, v20.4s, v8.4s\n"
-    "ldr q20, [x27, x22]\n"
-    "fmla v3.4s, v5.4s, v7.4s\n"
-    "ldr q4, [x28, x22]\n"
-    "fmla v2.4s, v5.4s, v8.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v1.4s, v5.4s, v10.4s\n"
-    "ldr q15, [%[wbptr]]\n"
-    "fmla v0.4s, v5.4s, v11.4s\n"
-    "ldr q14, [%[wbptr], #16]\n"
-    "fmla v3.4s, v19.4s, v9.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v1.4s, v19.4s, v12.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v2.4s, v19.4s, v10.4s\n"
-    "ldr q11, [%[wbptr], #64]\n"
-    "fmla v0.4s, v19.4s, v13.4s\n"
-    "ldr q24, [%[inptr0]]\n"
-    "fmla v1.4s, v22.4s, v7.4s\n"
-    "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v2.4s, v23.4s, v12.4s\n"
-    "ldr q17, [%[inptr0], x21]\n"
-    "fmla v0.4s, v22.4s, v8.4s\n"
-    "ldr q13, [%[wbptr], #32]\n"
-    "fmla v3.4s, v21.4s, v6.4s\n"
-    "add x26, x26, #16\n"
-    "fmla v1.4s, v21.4s, v9.4s\n"
-    "ldr q22, [x26]\n"
-    "fmla v2.4s, v21.4s, v7.4s\n"
-    "ldr q8, [%[wbptr], #112]\n"
-    "str q3, [%[outptr0]]\n"
-    "fmla v0.4s, v21.4s, v10.4s\n"
-    "fmla v1.4s, v16.4s, v6.4s\n"
-    "ldr q21, [x26, %[input_col_stride1]]\n"
-    "fmla v2.4s, v18.4s, v9.4s\n"
-    "add x27, x27, #16\n"
-    "fmla v0.4s, v18.4s, v12.4s\n"
-    "ldr q10, [%[wbptr], #80]\n"
-    "str q1, [x23]\n"
-    "mov v3.16b, v15.16b\n"
-    "fmla v2.4s, v20.4s, v6.4s\n"
-    "ldr q18, [x27]\n"
-    "fmla v0.4s, v16.4s, v7.4s\n"
-    "ldr q12, [%[wbptr], #48]\n"
-    "mov v1.16b, v15.16b\n"
-    "ldr q5, [x27, %[input_col_stride1]]\n"
-    "str q2, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v3.4s, v24.4s, v14.4s\n"
-    "fmla v0.4s, v20.4s, v9.4s\n"
-    "ldr q7, [%[wbptr], #128]\n"
-    "mov v2.16b, v15.16b\n"
-    "add x28, x28, #16\n"
-    "fmla v3.4s, v22.4s, v11.4s\n"
-    "ldr q20, [x28]\n"
-    "fmla v0.4s, v4.4s, v6.4s\n"
-    "ldr q9, [%[wbptr], #96]\n"
-    "fmla v1.4s, v22.4s, v14.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v3.4s, v19.4s, v13.4s\n"
-    "subs x25, x25, #1\n"
-    "str q0, [x23, %[output_col_stride1]]\n"
-    "fmla v2.4s, v19.4s, v14.4s\n"
-    "ldr q6, [%[wbptr], #144]\n"
-    "add x23, x23, #16\n"
-    "fmla v3.4s, v18.4s, v8.4s\n"
-    "fmla v1.4s, v18.4s, v11.4s\n"
-    "mov v0.16b, v15.16b\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v3.4s, v21.4s, v10.4s\n"
-    "ldr q19, [x26, x21]\n"
-    "fmla v1.4s, v21.4s, v13.4s\n"
-    "ldr q23, [%[inptr0], x22]\n"
-    "fmla v2.4s, v21.4s, v11.4s\n"
-    "ldr q22, [x28, %[input_col_stride1]]\n"
-    "fmla v0.4s, v21.4s, v14.4s\n"
-    "ldr q21, [x27, x21]\n"
-    "fmla v3.4s, v17.4s, v12.4s\n"
-    "ldr q18, [x26, x22]\n"
-    "fmla v2.4s, v17.4s, v13.4s\n"
-    "ldr q16, [x28, x21]\n"
-    "fmla v1.4s, v20.4s, v8.4s\n"
-    "ldr q20, [x27, x22]\n"
-    "fmla v3.4s, v5.4s, v7.4s\n"
-    "ldr q4, [x28, x22]\n"
-    "fmla v2.4s, v5.4s, v8.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v1.4s, v5.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v0.4s, v5.4s, v11.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v3.4s, v19.4s, v9.4s\n"
-    "add x26, x26, #16\n"
-    "fmla v1.4s, v19.4s, v12.4s\n"
-    "add x27, x27, #16\n"
-    "fmla v2.4s, v19.4s, v10.4s\n"
-    "add x28, x28, #16\n"
-    "fmla v0.4s, v19.4s, v13.4s\n"
-    "fmla v3.4s, v21.4s, v6.4s\n"
-    "fmla v1.4s, v22.4s, v7.4s\n"
-    "fmla v2.4s, v23.4s, v12.4s\n"
-    "str q3, [%[outptr0]]\n"
-    "fmla v0.4s, v22.4s, v8.4s\n"
-    "fmla v1.4s, v21.4s, v9.4s\n"
-    "fmla v2.4s, v21.4s, v7.4s\n"
-    "fmla v0.4s, v21.4s, v10.4s\n"
-    "fmla v1.4s, v16.4s, v6.4s\n"
-    "fmla v2.4s, v18.4s, v9.4s\n"
-    "fmla v0.4s, v18.4s, v12.4s\n"
-    "str q1, [x23]\n"
-    "fmla v2.4s, v20.4s, v6.4s\n"
-    "fmla v0.4s, v16.4s, v7.4s\n"
-    "str q2, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v0.4s, v20.4s, v9.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v0.4s, v4.4s, v6.4s\n"
-    "str q0, [x23, %[output_col_stride1]]\n"
-    "add x23, x23, #16\n"
-    "4:\n"
-    "cbz x24, 7f\n"
-    "ldr s15, [%[wbptr]]\n"
-    "mov v3.16b, v15.16b\n"
-    "ldr s14, [%[wbptr], #4]\n"
-    "mov v1.16b, v15.16b\n"
-    "ldr s13, [%[wbptr], #8]\n"
-    "mov v2.16b, v15.16b\n"
-    "ldr s12, [%[wbptr], #12]\n"
-    "mov v0.16b, v15.16b\n"
-    "ldr s11, [%[wbptr], #16]\n"
-    "ldr s10, [%[wbptr], #20]\n"
-    "subs x24, x24, #1\n"
-    "ldr s9, [%[wbptr], #24]\n"
-    "ldr s8, [%[wbptr], #28]\n"
-    "ldr s7, [%[wbptr], #32]\n"
-    "ldr s6, [%[wbptr], #36]\n"
-    "ldr s24, [%[inptr0]]\n"
-    "ldr s22, [x26]\n"
-    "fmla v3.4s, v24.4s, v14.4s\n"
-    "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v1.4s, v22.4s, v14.4s\n"
-    "ldr s18, [x27]\n"
-    "fmla v2.4s, v19.4s, v14.4s\n"
-    "ldr s21, [x26, %[input_col_stride1]]\n"
-    "fmla v3.4s, v22.4s, v11.4s\n"
-    "ldr s17, [%[inptr0], x21]\n"
-    "fmla v1.4s, v18.4s, v11.4s\n"
-    "ldr s20, [x28]\n"
-    "ldr s5, [x27, %[input_col_stride1]]\n"
-    "fmla v3.4s, v19.4s, v13.4s\n"
-    "fmla v3.4s, v18.4s, v8.4s\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v3.4s, v21.4s, v10.4s\n"
-    "ldr s19, [x26, x21]\n"
-    "fmla v1.4s, v21.4s, v13.4s\n"
-    "ldr s23, [%[inptr0], x22]\n"
-    "fmla v2.4s, v21.4s, v11.4s\n"
-    "ldr s22, [x28, %[input_col_stride1]]\n"
-    "fmla v0.4s, v21.4s, v14.4s\n"
-    "ldr s21, [x27, x21]\n"
-    "fmla v3.4s, v17.4s, v12.4s\n"
-    "ldr s18, [x26, x22]\n"
-    "fmla v2.4s, v17.4s, v13.4s\n"
-    "ldr s16, [x28, x21]\n"
-    "fmla v1.4s, v20.4s, v8.4s\n"
-    "ldr s20, [x27, x22]\n"
-    "fmla v3.4s, v5.4s, v7.4s\n"
-    "ldr s4, [x28, x22]\n"
-    "fmla v2.4s, v5.4s, v8.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v1.4s, v5.4s, v10.4s\n"
-    "ldr s15, [%[wbptr]]\n"
-    "fmla v0.4s, v5.4s, v11.4s\n"
-    "ldr s14, [%[wbptr], #4]\n"
-    "fmla v3.4s, v19.4s, v9.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v1.4s, v19.4s, v12.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v2.4s, v19.4s, v10.4s\n"
-    "ldr s11, [%[wbptr], #16]\n"
-    "fmla v0.4s, v19.4s, v13.4s\n"
-    "ldr s24, [%[inptr0]]\n"
-    "fmla v1.4s, v22.4s, v7.4s\n"
-    "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v2.4s, v23.4s, v12.4s\n"
-    "ldr s17, [%[inptr0], x21]\n"
-    "fmla v0.4s, v22.4s, v8.4s\n"
-    "ldr s13, [%[wbptr], #8]\n"
-    "fmla v3.4s, v21.4s, v6.4s\n"
-    "add x26, x26, #4\n"
-    "fmla v1.4s, v21.4s, v9.4s\n"
-    "ldr s22, [x26]\n"
-    "fmla v2.4s, v21.4s, v7.4s\n"
-    "ldr s8, [%[wbptr], #28]\n"
-    "str s3, [%[outptr0]]\n"
-    "fmla v0.4s, v21.4s, v10.4s\n"
-    "fmla v1.4s, v16.4s, v6.4s\n"
-    "ldr s21, [x26, %[input_col_stride1]]\n"
-    "fmla v2.4s, v18.4s, v9.4s\n"
-    "add x27, x27, #4\n"
-    "fmla v0.4s, v18.4s, v12.4s\n"
-    "ldr s10, [%[wbptr], #20]\n"
-    "str s1, [x23]\n"
-    "mov v3.16b, v15.16b\n"
-    "fmla v2.4s, v20.4s, v6.4s\n"
-    "ldr s18, [x27]\n"
-    "fmla v0.4s, v16.4s, v7.4s\n"
-    "ldr s12, [%[wbptr], #12]\n"
-    "mov v1.16b, v15.16b\n"
-    "ldr s5, [x27, %[input_col_stride1]]\n"
-    "str s2, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v3.4s, v24.4s, v14.4s\n"
-    "fmla v0.4s, v20.4s, v9.4s\n"
-    "ldr s7, [%[wbptr], #32]\n"
-    "mov v2.16b, v15.16b\n"
-    "add x28, x28, #4\n"
-    "fmla v3.4s, v22.4s, v11.4s\n"
-    "ldr s20, [x28]\n"
-    "fmla v0.4s, v4.4s, v6.4s\n"
-    "ldr s9, [%[wbptr], #24]\n"
-    "fmla v1.4s, v22.4s, v14.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v3.4s, v19.4s, v13.4s\n"
-    "subs x24, x24, #1\n"
-    "str s0, [x23, %[output_col_stride1]]\n"
-    "fmla v2.4s, v19.4s, v14.4s\n"
-    "ldr s6, [%[wbptr], #36]\n"
-    "add x23, x23, #4\n"
-    "fmla v3.4s, v18.4s, v8.4s\n"
-    "fmla v1.4s, v18.4s, v11.4s\n"
-    "mov v0.16b, v15.16b\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v3.4s, v21.4s, v10.4s\n"
-    "ldr s19, [x26, x21]\n"
-    "fmla v1.4s, v21.4s, v13.4s\n"
-    "ldr s23, [%[inptr0], x22]\n"
-    "fmla v2.4s, v21.4s, v11.4s\n"
-    "ldr s22, [x28, %[input_col_stride1]]\n"
-    "fmla v0.4s, v21.4s, v14.4s\n"
-    "ldr s21, [x27, x21]\n"
-    "fmla v3.4s, v17.4s, v12.4s\n"
-    "ldr s18, [x26, x22]\n"
-    "fmla v2.4s, v17.4s, v13.4s\n"
-    "ldr s16, [x28, x21]\n"
-    "fmla v1.4s, v20.4s, v8.4s\n"
-    "ldr s20, [x27, x22]\n"
-    "fmla v3.4s, v5.4s, v7.4s\n"
-    "ldr s4, [x28, x22]\n"
-    "fmla v2.4s, v5.4s, v8.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v1.4s, v5.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v0.4s, v5.4s, v11.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v3.4s, v19.4s, v9.4s\n"
-    "add x26, x26, #4\n"
-    "fmla v1.4s, v19.4s, v12.4s\n"
-    "add x27, x27, #4\n"
-    "fmla v2.4s, v19.4s, v10.4s\n"
-    "add x28, x28, #4\n"
-    "fmla v0.4s, v19.4s, v13.4s\n"
-    "fmla v3.4s, v21.4s, v6.4s\n"
-    "fmla v1.4s, v22.4s, v7.4s\n"
-    "fmla v2.4s, v23.4s, v12.4s\n"
-    "str s3, [%[outptr0]]\n"
-    "fmla v0.4s, v22.4s, v8.4s\n"
-    "fmla v1.4s, v21.4s, v9.4s\n"
-    "fmla v2.4s, v21.4s, v7.4s\n"
-    "fmla v0.4s, v21.4s, v10.4s\n"
-    "fmla v1.4s, v16.4s, v6.4s\n"
-    "fmla v2.4s, v18.4s, v9.4s\n"
-    "fmla v0.4s, v18.4s, v12.4s\n"
-    "str s1, [x23]\n"
-    "fmla v2.4s, v20.4s, v6.4s\n"
-    "fmla v0.4s, v16.4s, v7.4s\n"
-    "str s2, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v0.4s, v20.4s, v9.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v0.4s, v4.4s, v6.4s\n"
-    "str s0, [x23, %[output_col_stride1]]\n"
-    "add x23, x23, #4\n"
-    "7:\n"
-    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
-    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
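
[Editor's note] For reference, the arithmetic this tile kernel performs is a 3x3 depthwise convolution over a 4x4 input window (rows inptr0, x26, x27, x28; columns 0, %[input_col_stride1], x21, x22), producing a 2x2 output tile per channel. Below is a minimal scalar sketch of one channel's work. It assumes the per-channel weight/bias packing visible in the scalar tail above (one bias at offset 0 followed by nine weights, 40 bytes per channel) with the nine weights in kernel row-major order, and element-unit strides; every name is illustrative, not library API.

    // Scalar sketch (hypothetical helper): one channel, 2x2 output tile,
    // 3x3 kernel, stride 1, no activation.
    static void tile_2x2_3x3_s1_reference_channel(
        const float *wb,            // 10 floats: bias, then w(0,0)..w(2,2) (assumed order)
        const float *input,         // top-left of the 4x4 input window
        unsigned int in_row_stride, unsigned int in_col_stride,
        float *output,
        unsigned int out_row_stride, unsigned int out_col_stride)
    {
      for (int oi = 0; oi < 2; oi++)
      {
        for (int oj = 0; oj < 2; oj++)
        {
          float acc = wb[0];  // bias is loaded first (q15/s15 above)
          for (int ki = 0; ki < 3; ki++)
            for (int kj = 0; kj < 3; kj++)
              acc += wb[1 + 3*ki + kj]
                   * input[(oi + ki)*in_row_stride + (oj + kj)*in_col_stride];
          output[oi*out_row_stride + oj*out_col_stride] = acc;
        }
      }
    }

The vector body above computes this for four channels at once, with the fmla chains of the four accumulators (v3, v1, v2, v0) interleaved with the next iteration's loads.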
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x21, %[inptr0], %[input_row_stride]\n"
-    "add x24, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x22, %[outptr0], %[output_row_stride]\n"
-    "add x23, x21, %[input_row_stride]\n"
-    "add x27, x24, %[input_col_stride1]\n"
-    "and x25, %[n_channels], #3\n"
-    "add x28, x23, %[input_row_stride]\n"
-    "lsr x26, %[n_channels], #2\n"
-    "cbz x26, 4f\n"
-    "1:\n"
-    "ldr q11, [%[wbptr]]\n"
-    "subs x26, x26, #1\n"
-    "mov v17.16b, v11.16b\n"
-    "ldr q13, [%[wbptr], #16]\n"
-    "mov v15.16b, v11.16b\n"
-    "ldr q4, [%[wbptr], #32]\n"
-    "mov v16.16b, v11.16b\n"
-    "ldr q2, [%[wbptr], #48]\n"
-    "mov v14.16b, v11.16b\n"
-    "ldr q5, [%[wbptr], #64]\n"
-    "ldr q10, [%[wbptr], #80]\n"
-    "ldr q1, [%[wbptr], #96]\n"
-    "ldr q12, [%[wbptr], #112]\n"
-    "ldr q0, [%[wbptr], #128]\n"
-    "ldr q3, [%[wbptr], #144]\n"
-    "ldr q6, [%[inptr0]]\n"
-    "fmla v17.4s, v6.4s, v13.4s\n"
-    "ldr q27, [x21]\n"
-    "fmla v15.4s, v27.4s, v13.4s\n"
-    "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v16.4s, v23.4s, v13.4s\n"
-    "ldr q24, [x23]\n"
-    "fmla v17.4s, v27.4s, v5.4s\n"
-    "ldr q22, [x21, %[input_col_stride1]]\n"
-    "ldr q9, [%[inptr0], x24]\n"
-    "ldr q8, [x28]\n"
-    "ldr q20, [x23, %[input_col_stride1]]\n"
-    "fmla v17.4s, v23.4s, v4.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v17.4s, v24.4s, v12.4s\n"
-    "ldr q26, [x21, x24]\n"
-    "fmla v15.4s, v24.4s, v5.4s\n"
-    "ldr q27, [%[inptr0], x27]\n"
-    "fmla v16.4s, v22.4s, v5.4s\n"
-    "ldr q25, [x28, %[input_col_stride1]]\n"
-    "fmla v17.4s, v22.4s, v10.4s\n"
-    "ldr q24, [x23, x24]\n"
-    "fmla v15.4s, v22.4s, v4.4s\n"
-    "ldr q21, [x21, x27]\n"
-    "fmla v14.4s, v22.4s, v13.4s\n"
-    "ldr q7, [x28, x24]\n"
-    "fmla v17.4s, v9.4s, v2.4s\n"
-    "ldr q19, [x23, x27]\n"
-    "fmla v16.4s, v9.4s, v4.4s\n"
-    "ldr q18, [x28, x27]\n"
-    "fmla v15.4s, v8.4s, v12.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v17.4s, v20.4s, v0.4s\n"
-    "ldr q11, [%[wbptr]]\n"
-    "fmla v16.4s, v20.4s, v12.4s\n"
-    "ldr q13, [%[wbptr], #16]\n"
-    "fmla v15.4s, v20.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v14.4s, v20.4s, v5.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v17.4s, v26.4s, v1.4s\n"
-    "ldr q6, [%[inptr0]]\n"
-    "fmla v15.4s, v26.4s, v2.4s\n"
-    "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v16.4s, v26.4s, v10.4s\n"
-    "ldr q5, [%[wbptr], #64]\n"
-    "fmla v14.4s, v26.4s, v4.4s\n"
-    "ldr q9, [%[inptr0], x24]\n"
-    "fmla v15.4s, v25.4s, v0.4s\n"
-    "add x21, x21, #16\n"
-    "fmla v16.4s, v27.4s, v2.4s\n"
-    "ldr q27, [x21]\n"
-    "fmla v14.4s, v25.4s, v12.4s\n"
-    "ldr q4, [%[wbptr], #32]\n"
-    "fmla v17.4s, v24.4s, v3.4s\n"
-    "ldr q22, [x21, %[input_col_stride1]]\n"
-    "fmla v15.4s, v24.4s, v1.4s\n"
-    "add x23, x23, #16\n"
-    "fmla v16.4s, v24.4s, v0.4s\n"
-    "ldr q12, [%[wbptr], #112]\n"
-    "fmla v14.4s, v24.4s, v10.4s\n"
-    "ldr q24, [x23]\n"
-    "fmla v15.4s, v7.4s, v3.4s\n"
-    "ldr q20, [x23, %[input_col_stride1]]\n"
-    "fmla v16.4s, v21.4s, v1.4s\n"
-    "add x28, x28, #16\n"
-    "fmla v14.4s, v21.4s, v2.4s\n"
-    "ldr q10, [%[wbptr], #80]\n"
-    "movi v26.16b, #0\n"
-    "ldr q8, [x28]\n"
-    "fmla v16.4s, v19.4s, v3.4s\n"
-    "subs x26, x26, #1\n"
-    "fmla v14.4s, v7.4s, v0.4s\n"
-    "ldr q2, [%[wbptr], #48]\n"
-    "fmax v17.4s, v17.4s, v26.4s\n"
-    "fmax v15.4s, v15.4s, v26.4s\n"
-    "fmax v16.4s, v16.4s, v26.4s\n"
-    "str q17, [%[outptr0]]\n"
-    "str q16, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v14.4s, v19.4s, v1.4s\n"
-    "str q15, [x22]\n"
-    "mov v17.16b, v11.16b\n"
-    "mov v15.16b, v11.16b\n"
-    "ldr q0, [%[wbptr], #128]\n"
-    "fmla v14.4s, v18.4s, v3.4s\n"
-    "ldr q1, [%[wbptr], #96]\n"
-    "mov v16.16b, v11.16b\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v17.4s, v6.4s, v13.4s\n"
-    "fmla v15.4s, v27.4s, v13.4s\n"
-    "fmax v14.4s, v14.4s, v26.4s\n"
-    "ldr q3, [%[wbptr], #144]\n"
-    "fmla v16.4s, v23.4s, v13.4s\n"
-    "str q14, [x22, %[output_col_stride1]]\n"
-    "mov v14.16b, v11.16b\n"
-    "add x22, x22, #16\n"
-    "fmla v17.4s, v27.4s, v5.4s\n"
-    "fmla v17.4s, v23.4s, v4.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v17.4s, v24.4s, v12.4s\n"
-    "ldr q26, [x21, x24]\n"
-    "fmla v15.4s, v24.4s, v5.4s\n"
-    "ldr q27, [%[inptr0], x27]\n"
-    "fmla v16.4s, v22.4s, v5.4s\n"
-    "ldr q25, [x28, %[input_col_stride1]]\n"
-    "fmla v17.4s, v22.4s, v10.4s\n"
-    "ldr q24, [x23, x24]\n"
-    "fmla v15.4s, v22.4s, v4.4s\n"
-    "ldr q21, [x21, x27]\n"
-    "fmla v14.4s, v22.4s, v13.4s\n"
-    "ldr q7, [x28, x24]\n"
-    "fmla v17.4s, v9.4s, v2.4s\n"
-    "ldr q19, [x23, x27]\n"
-    "fmla v16.4s, v9.4s, v4.4s\n"
-    "ldr q18, [x28, x27]\n"
-    "fmla v15.4s, v8.4s, v12.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v17.4s, v20.4s, v0.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v16.4s, v20.4s, v12.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v15.4s, v20.4s, v10.4s\n"
-    "add x21, x21, #16\n"
-    "fmla v14.4s, v20.4s, v5.4s\n"
-    "add x23, x23, #16\n"
-    "fmla v17.4s, v26.4s, v1.4s\n"
-    "add x28, x28, #16\n"
-    "fmla v15.4s, v26.4s, v2.4s\n"
-    "fmla v16.4s, v26.4s, v10.4s\n"
-    "fmla v14.4s, v26.4s, v4.4s\n"
-    "movi v26.16b, #0\n"
-    "fmla v17.4s, v24.4s, v3.4s\n"
-    "fmla v16.4s, v27.4s, v2.4s\n"
-    "fmla v15.4s, v25.4s, v0.4s\n"
-    "fmla v14.4s, v25.4s, v12.4s\n"
-    "fmax v17.4s, v17.4s, v26.4s\n"
-    "fmla v16.4s, v24.4s, v0.4s\n"
-    "str q17, [%[outptr0]]\n"
-    "fmla v15.4s, v24.4s, v1.4s\n"
-    "fmla v14.4s, v24.4s, v10.4s\n"
-    "fmla v16.4s, v21.4s, v1.4s\n"
-    "fmla v15.4s, v7.4s, v3.4s\n"
-    "fmla v14.4s, v21.4s, v2.4s\n"
-    "fmla v16.4s, v19.4s, v3.4s\n"
-    "fmax v15.4s, v15.4s, v26.4s\n"
-    "fmla v14.4s, v7.4s, v0.4s\n"
-    "str q15, [x22]\n"
-    "fmax v16.4s, v16.4s, v26.4s\n"
-    "fmla v14.4s, v19.4s, v1.4s\n"
-    "str q16, [%[outptr0], %[output_col_stride1]]\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v14.4s, v18.4s, v3.4s\n"
-    "fmax v14.4s, v14.4s, v26.4s\n"
-    "str q14, [x22, %[output_col_stride1]]\n"
-    "add x22, x22, #16\n"
-    "4:\n"
-    "cbz x25, 7f\n"
-    "ldr s11, [%[wbptr]]\n"
-    "mov v17.16b, v11.16b\n"
-    "ldr s13, [%[wbptr], #4]\n"
-    "mov v15.16b, v11.16b\n"
-    "ldr s4, [%[wbptr], #8]\n"
-    "mov v16.16b, v11.16b\n"
-    "ldr s2, [%[wbptr], #12]\n"
-    "mov v14.16b, v11.16b\n"
-    "ldr s5, [%[wbptr], #16]\n"
-    "ldr s10, [%[wbptr], #20]\n"
-    "subs x25, x25, #1\n"
-    "ldr s1, [%[wbptr], #24]\n"
-    "ldr s12, [%[wbptr], #28]\n"
-    "ldr s0, [%[wbptr], #32]\n"
-    "ldr s3, [%[wbptr], #36]\n"
-    "ldr s6, [%[inptr0]]\n"
-    "ldr s27, [x21]\n"
-    "fmla v17.4s, v6.4s, v13.4s\n"
-    "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v15.4s, v27.4s, v13.4s\n"
-    "ldr s24, [x23]\n"
-    "fmla v16.4s, v23.4s, v13.4s\n"
-    "ldr s22, [x21, %[input_col_stride1]]\n"
-    "fmla v17.4s, v27.4s, v5.4s\n"
-    "ldr s9, [%[inptr0], x24]\n"
-    "ldr s8, [x28]\n"
-    "ldr s20, [x23, %[input_col_stride1]]\n"
-    "fmla v17.4s, v23.4s, v4.4s\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v17.4s, v24.4s, v12.4s\n"
-    "ldr s26, [x21, x24]\n"
-    "fmla v15.4s, v24.4s, v5.4s\n"
-    "ldr s27, [%[inptr0], x27]\n"
-    "fmla v16.4s, v22.4s, v5.4s\n"
-    "ldr s25, [x28, %[input_col_stride1]]\n"
-    "fmla v17.4s, v22.4s, v10.4s\n"
-    "ldr s24, [x23, x24]\n"
-    "fmla v15.4s, v22.4s, v4.4s\n"
-    "ldr s21, [x21, x27]\n"
-    "fmla v14.4s, v22.4s, v13.4s\n"
-    "ldr s7, [x28, x24]\n"
-    "fmla v17.4s, v9.4s, v2.4s\n"
-    "ldr s19, [x23, x27]\n"
-    "fmla v16.4s, v9.4s, v4.4s\n"
-    "ldr s18, [x28, x27]\n"
-    "fmla v15.4s, v8.4s, v12.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v17.4s, v20.4s, v0.4s\n"
-    "ldr s11, [%[wbptr]]\n"
-    "fmla v16.4s, v20.4s, v12.4s\n"
-    "ldr s13, [%[wbptr], #4]\n"
-    "fmla v15.4s, v20.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v14.4s, v20.4s, v5.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v17.4s, v26.4s, v1.4s\n"
-    "ldr s6, [%[inptr0]]\n"
-    "fmla v15.4s, v26.4s, v2.4s\n"
-    "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v16.4s, v26.4s, v10.4s\n"
-    "ldr s5, [%[wbptr], #16]\n"
-    "fmla v14.4s, v26.4s, v4.4s\n"
-    "ldr s9, [%[inptr0], x24]\n"
-    "fmla v15.4s, v25.4s, v0.4s\n"
-    "add x21, x21, #4\n"
-    "fmla v16.4s, v27.4s, v2.4s\n"
-    "ldr s27, [x21]\n"
-    "fmla v14.4s, v25.4s, v12.4s\n"
-    "ldr s4, [%[wbptr], #8]\n"
-    "fmla v17.4s, v24.4s, v3.4s\n"
-    "ldr s22, [x21, %[input_col_stride1]]\n"
-    "fmla v15.4s, v24.4s, v1.4s\n"
-    "add x23, x23, #4\n"
-    "fmla v16.4s, v24.4s, v0.4s\n"
-    "ldr s12, [%[wbptr], #28]\n"
-    "fmla v14.4s, v24.4s, v10.4s\n"
-    "ldr s24, [x23]\n"
-    "fmla v15.4s, v7.4s, v3.4s\n"
-    "ldr s20, [x23, %[input_col_stride1]]\n"
-    "fmla v16.4s, v21.4s, v1.4s\n"
-    "add x28, x28, #4\n"
-    "fmla v14.4s, v21.4s, v2.4s\n"
-    "ldr s10, [%[wbptr], #20]\n"
-    "movi v26.16b, #0\n"
-    "ldr s8, [x28]\n"
-    "fmla v16.4s, v19.4s, v3.4s\n"
-    "subs x25, x25, #1\n"
-    "fmla v14.4s, v7.4s, v0.4s\n"
-    "ldr s2, [%[wbptr], #12]\n"
-    "fmax v17.4s, v17.4s, v26.4s\n"
-    "fmax v15.4s, v15.4s, v26.4s\n"
-    "fmax v16.4s, v16.4s, v26.4s\n"
-    "str s17, [%[outptr0]]\n"
-    "str s16, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v14.4s, v19.4s, v1.4s\n"
-    "str s15, [x22]\n"
-    "mov v17.16b, v11.16b\n"
-    "mov v15.16b, v11.16b\n"
-    "ldr s0, [%[wbptr], #32]\n"
-    "fmla v14.4s, v18.4s, v3.4s\n"
-    "ldr s1, [%[wbptr], #24]\n"
-    "mov v16.16b, v11.16b\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v17.4s, v6.4s, v13.4s\n"
-    "fmla v15.4s, v27.4s, v13.4s\n"
-    "fmax v14.4s, v14.4s, v26.4s\n"
-    "ldr s3, [%[wbptr], #36]\n"
-    "fmla v16.4s, v23.4s, v13.4s\n"
-    "str s14, [x22, %[output_col_stride1]]\n"
-    "mov v14.16b, v11.16b\n"
-    "add x22, x22, #4\n"
-    "fmla v17.4s, v27.4s, v5.4s\n"
-    "fmla v17.4s, v23.4s, v4.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v17.4s, v24.4s, v12.4s\n"
-    "ldr s26, [x21, x24]\n"
-    "fmla v15.4s, v24.4s, v5.4s\n"
-    "ldr s27, [%[inptr0], x27]\n"
-    "fmla v16.4s, v22.4s, v5.4s\n"
-    "ldr s25, [x28, %[input_col_stride1]]\n"
-    "fmla v17.4s, v22.4s, v10.4s\n"
-    "ldr s24, [x23, x24]\n"
-    "fmla v15.4s, v22.4s, v4.4s\n"
-    "ldr s21, [x21, x27]\n"
-    "fmla v14.4s, v22.4s, v13.4s\n"
-    "ldr s7, [x28, x24]\n"
-    "fmla v17.4s, v9.4s, v2.4s\n"
-    "ldr s19, [x23, x27]\n"
-    "fmla v16.4s, v9.4s, v4.4s\n"
-    "ldr s18, [x28, x27]\n"
-    "fmla v15.4s, v8.4s, v12.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v17.4s, v20.4s, v0.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v16.4s, v20.4s, v12.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v15.4s, v20.4s, v10.4s\n"
-    "add x21, x21, #4\n"
-    "fmla v14.4s, v20.4s, v5.4s\n"
-    "add x23, x23, #4\n"
-    "fmla v17.4s, v26.4s, v1.4s\n"
-    "add x28, x28, #4\n"
-    "fmla v15.4s, v26.4s, v2.4s\n"
-    "fmla v16.4s, v26.4s, v10.4s\n"
-    "fmla v14.4s, v26.4s, v4.4s\n"
-    "movi v26.16b, #0\n"
-    "fmla v17.4s, v24.4s, v3.4s\n"
-    "fmla v16.4s, v27.4s, v2.4s\n"
-    "fmla v15.4s, v25.4s, v0.4s\n"
-    "fmla v14.4s, v25.4s, v12.4s\n"
-    "fmax v17.4s, v17.4s, v26.4s\n"
-    "fmla v16.4s, v24.4s, v0.4s\n"
-    "str s17, [%[outptr0]]\n"
-    "fmla v15.4s, v24.4s, v1.4s\n"
-    "fmla v14.4s, v24.4s, v10.4s\n"
-    "fmla v16.4s, v21.4s, v1.4s\n"
-    "fmla v15.4s, v7.4s, v3.4s\n"
-    "fmla v14.4s, v21.4s, v2.4s\n"
-    "fmla v16.4s, v19.4s, v3.4s\n"
-    "fmax v15.4s, v15.4s, v26.4s\n"
-    "fmla v14.4s, v7.4s, v0.4s\n"
-    "str s15, [x22]\n"
-    "fmax v16.4s, v16.4s, v26.4s\n"
-    "fmla v14.4s, v19.4s, v1.4s\n"
-    "str s16, [%[outptr0], %[output_col_stride1]]\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v14.4s, v18.4s, v3.4s\n"
-    "fmax v14.4s, v14.4s, v26.4s\n"
-    "str s14, [x22, %[output_col_stride1]]\n"
-    "add x22, x22, #4\n"
-    "7:\n"
-    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
-    : [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
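
[Editor's note] The ReLU variant differs from the tile above only in its epilogue: a zeroed vector is kept live (movi v26.16b, #0) and each accumulator passes through fmax before being stored. In NEON intrinsics terms this clamp is (a sketch, not code from this patch):

    #include <arm_neon.h>

    // Equivalent of the "movi v26.16b, #0; fmax vN.4s, vN.4s, v26.4s" epilogue.
    static inline float32x4_t relu_f32x4(float32x4_t acc)
    {
      return vmaxq_f32(acc, vdupq_n_f32(0.0f));
    }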
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x21, %[inptr0], %[input_row_stride]\n"
-    "add x23, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x24, %[outptr0], %[output_row_stride]\n"
-    "add x27, x21, %[input_row_stride]\n"
-    "add x22, x23, %[input_col_stride1]\n"
-    "and x25, %[n_channels], #3\n"
-    "add x28, x27, %[input_row_stride]\n"
-    "lsr x26, %[n_channels], #2\n"
-    "cbz x26, 4f\n"
-    "1:\n"
-    "ldr q19, [%[wbptr]]\n"
-    "subs x26, x26, #1\n"
-    "mov v3.16b, v19.16b\n"
-    "ldr q12, [%[wbptr], #16]\n"
-    "mov v1.16b, v19.16b\n"
-    "ldr q11, [%[wbptr], #32]\n"
-    "mov v2.16b, v19.16b\n"
-    "ldr q10, [%[wbptr], #48]\n"
-    "mov v0.16b, v19.16b\n"
-    "ldr q13, [%[wbptr], #64]\n"
-    "ldr q23, [%[wbptr], #80]\n"
-    "ldr q15, [%[wbptr], #96]\n"
-    "ldr q20, [%[wbptr], #112]\n"
-    "ldr q21, [%[wbptr], #128]\n"
-    "ldr q14, [%[wbptr], #144]\n"
-    "ldr q16, [%[inptr0]]\n"
-    "fmla v3.4s, v16.4s, v12.4s\n"
-    "ldr q28, [x21]\n"
-    "fmla v1.4s, v28.4s, v12.4s\n"
-    "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v2.4s, v22.4s, v12.4s\n"
-    "ldr q24, [x27]\n"
-    "fmla v3.4s, v28.4s, v13.4s\n"
-    "ldr q8, [x21, %[input_col_stride1]]\n"
-    "ldr q9, [%[inptr0], x23]\n"
-    "ldr q18, [x28]\n"
-    "ldr q6, [x27, %[input_col_stride1]]\n"
-    "fmla v3.4s, v22.4s, v11.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v3.4s, v24.4s, v20.4s\n"
-    "ldr q25, [x21, x23]\n"
-    "fmla v1.4s, v24.4s, v13.4s\n"
-    "ldr q28, [%[inptr0], x22]\n"
-    "fmla v2.4s, v8.4s, v13.4s\n"
-    "ldr q24, [x28, %[input_col_stride1]]\n"
-    "fmla v3.4s, v8.4s, v23.4s\n"
-    "ldr q27, [x27, x23]\n"
-    "fmla v1.4s, v8.4s, v11.4s\n"
-    "ldr q7, [x21, x22]\n"
-    "fmla v0.4s, v8.4s, v12.4s\n"
-    "ldr q17, [x28, x23]\n"
-    "fmla v3.4s, v9.4s, v10.4s\n"
-    "ldr q5, [x27, x22]\n"
-    "fmla v2.4s, v9.4s, v11.4s\n"
-    "ldr q4, [x28, x22]\n"
-    "fmla v1.4s, v18.4s, v20.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v3.4s, v6.4s, v21.4s\n"
-    "ldr q19, [%[wbptr]]\n"
-    "fmla v2.4s, v6.4s, v20.4s\n"
-    "ldr q12, [%[wbptr], #16]\n"
-    "fmla v1.4s, v6.4s, v23.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v0.4s, v6.4s, v13.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v3.4s, v25.4s, v15.4s\n"
-    "ldr q16, [%[inptr0]]\n"
-    "fmla v1.4s, v25.4s, v10.4s\n"
-    "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v2.4s, v25.4s, v23.4s\n"
-    "ldr q13, [%[wbptr], #64]\n"
-    "fmla v0.4s, v25.4s, v11.4s\n"
-    "ldr q9, [%[inptr0], x23]\n"
-    "fmla v1.4s, v24.4s, v21.4s\n"
-    "add x21, x21, #16\n"
-    "fmla v2.4s, v28.4s, v10.4s\n"
-    "ldr q28, [x21]\n"
-    "fmla v0.4s, v24.4s, v20.4s\n"
-    "ldr q11, [%[wbptr], #32]\n"
-    "fmla v3.4s, v27.4s, v14.4s\n"
-    "ldr q8, [x21, %[input_col_stride1]]\n"
-    "fmla v1.4s, v27.4s, v15.4s\n"
-    "add x27, x27, #16\n"
-    "fmla v2.4s, v27.4s, v21.4s\n"
-    "ldr q20, [%[wbptr], #112]\n"
-    "fmla v0.4s, v27.4s, v23.4s\n"
-    "ldr q24, [x27]\n"
-    "fmla v1.4s, v17.4s, v14.4s\n"
-    "ldr q6, [x27, %[input_col_stride1]]\n"
-    "fmla v2.4s, v7.4s, v15.4s\n"
-    "add x28, x28, #16\n"
-    "fmla v0.4s, v7.4s, v10.4s\n"
-    "ldr q23, [%[wbptr], #80]\n"
-    "movi v25.16b, #0\n"
-    "ldr q18, [x28]\n"
-    "fmla v2.4s, v5.4s, v14.4s\n"
-    "subs x26, x26, #1\n"
-    "fmla v0.4s, v17.4s, v21.4s\n"
-    "ldr q10, [%[wbptr], #48]\n"
-    "fmov v26.4s, #6.0\n"
-    "fmax v3.4s, v3.4s, v25.4s\n"
-    "fmax v2.4s, v2.4s, v25.4s\n"
-    "fmax v1.4s, v1.4s, v25.4s\n"
-    "fmla v0.4s, v5.4s, v15.4s\n"
-    "ldr q21, [%[wbptr], #128]\n"
-    "fmin v3.4s, v3.4s, v26.4s\n"
-    "fmin v2.4s, v2.4s, v26.4s\n"
-    "fmin v1.4s, v1.4s, v26.4s\n"
-    "str q3, [%[outptr0]]\n"
-    "str q2, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v0.4s, v4.4s, v14.4s\n"
-    "str q1, [x24]\n"
-    "mov v3.16b, v19.16b\n"
-    "mov v1.16b, v19.16b\n"
-    "ldr q15, [%[wbptr], #96]\n"
-    "fmax v0.4s, v0.4s, v25.4s\n"
-    "ldr q14, [%[wbptr], #144]\n"
-    "mov v2.16b, v19.16b\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmin v0.4s, v0.4s, v26.4s\n"
-    "fmla v3.4s, v16.4s, v12.4s\n"
-    "fmla v1.4s, v28.4s, v12.4s\n"
-    "fmla v2.4s, v22.4s, v12.4s\n"
-    "str q0, [x24, %[output_col_stride1]]\n"
-    "mov v0.16b, v19.16b\n"
-    "fmla v3.4s, v28.4s, v13.4s\n"
-    "add x24, x24, #16\n"
-    "fmla v3.4s, v22.4s, v11.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v3.4s, v24.4s, v20.4s\n"
-    "ldr q25, [x21, x23]\n"
-    "fmla v1.4s, v24.4s, v13.4s\n"
-    "ldr q28, [%[inptr0], x22]\n"
-    "fmla v2.4s, v8.4s, v13.4s\n"
-    "ldr q24, [x28, %[input_col_stride1]]\n"
-    "fmla v3.4s, v8.4s, v23.4s\n"
-    "ldr q27, [x27, x23]\n"
-    "fmla v1.4s, v8.4s, v11.4s\n"
-    "ldr q7, [x21, x22]\n"
-    "fmla v0.4s, v8.4s, v12.4s\n"
-    "ldr q17, [x28, x23]\n"
-    "fmla v3.4s, v9.4s, v10.4s\n"
-    "ldr q5, [x27, x22]\n"
-    "fmla v2.4s, v9.4s, v11.4s\n"
-    "ldr q4, [x28, x22]\n"
-    "fmla v1.4s, v18.4s, v20.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v3.4s, v6.4s, v21.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v2.4s, v6.4s, v20.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v1.4s, v6.4s, v23.4s\n"
-    "add x21, x21, #16\n"
-    "fmla v0.4s, v6.4s, v13.4s\n"
-    "add x27, x27, #16\n"
-    "fmla v3.4s, v25.4s, v15.4s\n"
-    "add x28, x28, #16\n"
-    "fmla v1.4s, v25.4s, v10.4s\n"
-    "fmla v2.4s, v25.4s, v23.4s\n"
-    "fmla v0.4s, v25.4s, v11.4s\n"
-    "movi v25.16b, #0\n"
-    "fmla v3.4s, v27.4s, v14.4s\n"
-    "fmov v26.4s, #6.0\n"
-    "fmla v2.4s, v28.4s, v10.4s\n"
-    "fmla v1.4s, v24.4s, v21.4s\n"
-    "fmla v0.4s, v24.4s, v20.4s\n"
-    "fmax v3.4s, v3.4s, v25.4s\n"
-    "fmla v1.4s, v27.4s, v15.4s\n"
-    "fmla v2.4s, v27.4s, v21.4s\n"
-    "fmla v0.4s, v27.4s, v23.4s\n"
-    "fmin v3.4s, v3.4s, v26.4s\n"
-    "str q3, [%[outptr0]]\n"
-    "fmla v2.4s, v7.4s, v15.4s\n"
-    "fmla v0.4s, v7.4s, v10.4s\n"
-    "fmla v1.4s, v17.4s, v14.4s\n"
-    "fmla v2.4s, v5.4s, v14.4s\n"
-    "fmla v0.4s, v17.4s, v21.4s\n"
-    "fmax v1.4s, v1.4s, v25.4s\n"
-    "fmax v2.4s, v2.4s, v25.4s\n"
-    "fmla v0.4s, v5.4s, v15.4s\n"
-    "fmin v1.4s, v1.4s, v26.4s\n"
-    "fmin v2.4s, v2.4s, v26.4s\n"
-    "str q1, [x24]\n"
-    "str q2, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v0.4s, v4.4s, v14.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmax v0.4s, v0.4s, v25.4s\n"
-    "fmin v0.4s, v0.4s, v26.4s\n"
-    "str q0, [x24, %[output_col_stride1]]\n"
-    "add x24, x24, #16\n"
-    "4:\n"
-    "cbz x25, 7f\n"
-    "ldr s19, [%[wbptr]]\n"
-    "mov v3.16b, v19.16b\n"
-    "ldr s12, [%[wbptr], #4]\n"
-    "mov v1.16b, v19.16b\n"
-    "ldr s11, [%[wbptr], #8]\n"
-    "mov v2.16b, v19.16b\n"
-    "ldr s10, [%[wbptr], #12]\n"
-    "mov v0.16b, v19.16b\n"
-    "ldr s13, [%[wbptr], #16]\n"
-    "ldr s23, [%[wbptr], #20]\n"
-    "subs x25, x25, #1\n"
-    "ldr s15, [%[wbptr], #24]\n"
-    "ldr s20, [%[wbptr], #28]\n"
-    "ldr s21, [%[wbptr], #32]\n"
-    "ldr s14, [%[wbptr], #36]\n"
-    "ldr s16, [%[inptr0]]\n"
-    "ldr s28, [x21]\n"
-    "fmla v3.4s, v16.4s, v12.4s\n"
-    "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v1.4s, v28.4s, v12.4s\n"
-    "ldr s24, [x27]\n"
-    "fmla v2.4s, v22.4s, v12.4s\n"
-    "ldr s8, [x21, %[input_col_stride1]]\n"
-    "fmla v3.4s, v28.4s, v13.4s\n"
-    "ldr s9, [%[inptr0], x23]\n"
-    "ldr s18, [x28]\n"
-    "ldr s6, [x27, %[input_col_stride1]]\n"
-    "fmla v3.4s, v22.4s, v11.4s\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v3.4s, v24.4s, v20.4s\n"
-    "ldr s25, [x21, x23]\n"
-    "fmla v1.4s, v24.4s, v13.4s\n"
-    "ldr s28, [%[inptr0], x22]\n"
-    "fmla v2.4s, v8.4s, v13.4s\n"
-    "ldr s24, [x28, %[input_col_stride1]]\n"
-    "fmla v3.4s, v8.4s, v23.4s\n"
-    "ldr s27, [x27, x23]\n"
-    "fmla v1.4s, v8.4s, v11.4s\n"
-    "ldr s7, [x21, x22]\n"
-    "fmla v0.4s, v8.4s, v12.4s\n"
-    "ldr s17, [x28, x23]\n"
-    "fmla v3.4s, v9.4s, v10.4s\n"
-    "ldr s5, [x27, x22]\n"
-    "fmla v2.4s, v9.4s, v11.4s\n"
-    "ldr s4, [x28, x22]\n"
-    "fmla v1.4s, v18.4s, v20.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v3.4s, v6.4s, v21.4s\n"
-    "ldr s19, [%[wbptr]]\n"
-    "fmla v2.4s, v6.4s, v20.4s\n"
-    "ldr s12, [%[wbptr], #4]\n"
-    "fmla v1.4s, v6.4s, v23.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v0.4s, v6.4s, v13.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v3.4s, v25.4s, v15.4s\n"
-    "ldr s16, [%[inptr0]]\n"
-    "fmla v1.4s, v25.4s, v10.4s\n"
-    "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v2.4s, v25.4s, v23.4s\n"
-    "ldr s13, [%[wbptr], #16]\n"
-    "fmla v0.4s, v25.4s, v11.4s\n"
-    "ldr s9, [%[inptr0], x23]\n"
-    "fmla v1.4s, v24.4s, v21.4s\n"
-    "add x21, x21, #4\n"
-    "fmla v2.4s, v28.4s, v10.4s\n"
-    "ldr s28, [x21]\n"
-    "fmla v0.4s, v24.4s, v20.4s\n"
-    "ldr s11, [%[wbptr], #8]\n"
-    "fmla v3.4s, v27.4s, v14.4s\n"
-    "ldr s8, [x21, %[input_col_stride1]]\n"
-    "fmla v1.4s, v27.4s, v15.4s\n"
-    "add x27, x27, #4\n"
-    "fmla v2.4s, v27.4s, v21.4s\n"
-    "ldr s20, [%[wbptr], #28]\n"
-    "fmla v0.4s, v27.4s, v23.4s\n"
-    "ldr s24, [x27]\n"
-    "fmla v1.4s, v17.4s, v14.4s\n"
-    "ldr s6, [x27, %[input_col_stride1]]\n"
-    "fmla v2.4s, v7.4s, v15.4s\n"
-    "add x28, x28, #4\n"
-    "fmla v0.4s, v7.4s, v10.4s\n"
-    "ldr s23, [%[wbptr], #20]\n"
-    "movi v25.16b, #0\n"
-    "ldr s18, [x28]\n"
-    "fmla v2.4s, v5.4s, v14.4s\n"
-    "subs x25, x25, #1\n"
-    "fmla v0.4s, v17.4s, v21.4s\n"
-    "ldr s10, [%[wbptr], #12]\n"
-    "fmov v26.4s, #6.0\n"
-    "fmax v3.4s, v3.4s, v25.4s\n"
-    "fmax v2.4s, v2.4s, v25.4s\n"
-    "fmax v1.4s, v1.4s, v25.4s\n"
-    "fmla v0.4s, v5.4s, v15.4s\n"
-    "ldr s21, [%[wbptr], #32]\n"
-    "fmin v3.4s, v3.4s, v26.4s\n"
-    "fmin v2.4s, v2.4s, v26.4s\n"
-    "fmin v1.4s, v1.4s, v26.4s\n"
-    "str s3, [%[outptr0]]\n"
-    "str s2, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v0.4s, v4.4s, v14.4s\n"
-    "str s1, [x24]\n"
-    "mov v3.16b, v19.16b\n"
-    "mov v1.16b, v19.16b\n"
-    "ldr s15, [%[wbptr], #24]\n"
-    "fmax v0.4s, v0.4s, v25.4s\n"
-    "ldr s14, [%[wbptr], #36]\n"
-    "mov v2.16b, v19.16b\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmin v0.4s, v0.4s, v26.4s\n"
-    "fmla v3.4s, v16.4s, v12.4s\n"
-    "fmla v1.4s, v28.4s, v12.4s\n"
-    "fmla v2.4s, v22.4s, v12.4s\n"
-    "str s0, [x24, %[output_col_stride1]]\n"
-    "mov v0.16b, v19.16b\n"
-    "fmla v3.4s, v28.4s, v13.4s\n"
-    "add x24, x24, #4\n"
-    "fmla v3.4s, v22.4s, v11.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v3.4s, v24.4s, v20.4s\n"
-    "ldr s25, [x21, x23]\n"
-    "fmla v1.4s, v24.4s, v13.4s\n"
-    "ldr s28, [%[inptr0], x22]\n"
-    "fmla v2.4s, v8.4s, v13.4s\n"
-    "ldr s24, [x28, %[input_col_stride1]]\n"
-    "fmla v3.4s, v8.4s, v23.4s\n"
-    "ldr s27, [x27, x23]\n"
-    "fmla v1.4s, v8.4s, v11.4s\n"
-    "ldr s7, [x21, x22]\n"
-    "fmla v0.4s, v8.4s, v12.4s\n"
-    "ldr s17, [x28, x23]\n"
-    "fmla v3.4s, v9.4s, v10.4s\n"
-    "ldr s5, [x27, x22]\n"
-    "fmla v2.4s, v9.4s, v11.4s\n"
-    "ldr s4, [x28, x22]\n"
-    "fmla v1.4s, v18.4s, v20.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v3.4s, v6.4s, v21.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v2.4s, v6.4s, v20.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v1.4s, v6.4s, v23.4s\n"
-    "add x21, x21, #4\n"
-    "fmla v0.4s, v6.4s, v13.4s\n"
-    "add x27, x27, #4\n"
-    "fmla v3.4s, v25.4s, v15.4s\n"
-    "add x28, x28, #4\n"
-    "fmla v1.4s, v25.4s, v10.4s\n"
-    "fmla v2.4s, v25.4s, v23.4s\n"
-    "fmla v0.4s, v25.4s, v11.4s\n"
-    "movi v25.16b, #0\n"
-    "fmla v3.4s, v27.4s, v14.4s\n"
-    "fmov v26.4s, #6.0\n"
-    "fmla v2.4s, v28.4s, v10.4s\n"
-    "fmla v1.4s, v24.4s, v21.4s\n"
-    "fmla v0.4s, v24.4s, v20.4s\n"
-    "fmax v3.4s, v3.4s, v25.4s\n"
-    "fmla v1.4s, v27.4s, v15.4s\n"
-    "fmla v2.4s, v27.4s, v21.4s\n"
-    "fmla v0.4s, v27.4s, v23.4s\n"
-    "fmin v3.4s, v3.4s, v26.4s\n"
-    "str s3, [%[outptr0]]\n"
-    "fmla v2.4s, v7.4s, v15.4s\n"
-    "fmla v0.4s, v7.4s, v10.4s\n"
-    "fmla v1.4s, v17.4s, v14.4s\n"
-    "fmla v2.4s, v5.4s, v14.4s\n"
-    "fmla v0.4s, v17.4s, v21.4s\n"
-    "fmax v1.4s, v1.4s, v25.4s\n"
-    "fmax v2.4s, v2.4s, v25.4s\n"
-    "fmla v0.4s, v5.4s, v15.4s\n"
-    "fmin v1.4s, v1.4s, v26.4s\n"
-    "fmin v2.4s, v2.4s, v26.4s\n"
-    "str s1, [x24]\n"
-    "str s2, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v0.4s, v4.4s, v14.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmax v0.4s, v0.4s, v25.4s\n"
-    "fmin v0.4s, v0.4s, v26.4s\n"
-    "str s0, [x24, %[output_col_stride1]]\n"
-    "add x24, x24, #4\n"
-    "7:\n"
-    : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
-    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
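
[Editor's note] Likewise, the ReLU6 variant clamps each accumulator between a zeroed register (movi v25.16b, #0) and a broadcast 6.0 (fmov v26.4s, #6.0) before the store. An intrinsics sketch of that clamp:

    #include <arm_neon.h>

    // Equivalent of the fmax/fmin pair in the epilogue above.
    static inline float32x4_t relu6_f32x4(float32x4_t acc)
    {
      return vminq_f32(vmaxq_f32(acc, vdupq_n_f32(0.0f)), vdupq_n_f32(6.0f));
    }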
-
-#endif  // __aarch64__
-
-template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
-
-}  // namespace depthwise
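
[Editor's note] All three activation variants in this file share the same channel-loop shape: lsr xN, %[n_channels], #2 sets up a main loop over groups of four channels (q-register loads, pointers advanced by #16) and and xM, %[n_channels], #3 a scalar tail (s-register loads, pointers advanced by #4). Schematically, with illustrative names only:

    // Channel-loop skeleton shared by these kernels (sketch only).
    for (unsigned int c = 0; c < n_channels / 4; c++) { /* 4-wide vector body */ }
    for (unsigned int c = 0; c < n_channels % 4; c++) { /* 1-wide scalar tail */ }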
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
deleted file mode 100644
index 2554436..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ /dev/null
@@ -1,2809 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x23, %[inptr0], %[input_row_stride]\n"
-    "add x19, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x22, %[outptr0], %[output_row_stride]\n"
-    "add x24, x23, %[input_row_stride]\n"
-    "add x20, x19, %[input_col_stride1]\n"
-    "and x27, %[n_channels], #3\n"
-    "add x25, x24, %[input_row_stride]\n"
-    "add x21, x20, %[input_col_stride1]\n"
-    "lsr x28, %[n_channels], #2\n"
-    "add x26, x25, %[input_row_stride]\n"
-    "cbz x28, 4f\n"
-    "1:\n"
-    "ldr q14, [%[wbptr]]\n"
-    "subs x28, x28, #1\n"
-    "mov v12.16b, v14.16b\n"
-    "ldr q8, [%[wbptr], #16]\n"
-    "mov v10.16b, v14.16b\n"
-    "ldr q7, [%[wbptr], #32]\n"
-    "mov v11.16b, v14.16b\n"
-    "ldr q6, [%[wbptr], #48]\n"
-    "mov v9.16b, v14.16b\n"
-    "ldr q5, [%[wbptr], #64]\n"
-    "ldr q4, [%[wbptr], #80]\n"
-    "ldr q3, [%[wbptr], #96]\n"
-    "ldr q2, [%[wbptr], #112]\n"
-    "ldr q1, [%[wbptr], #128]\n"
-    "ldr q0, [%[wbptr], #144]\n"
-    "ldr q15, [%[inptr0]]\n"
-    "fmla v12.4s, v15.4s, v8.4s\n"
-    "ldr q20, [x23]\n"
-    "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr q17, [x24]\n"
-    "fmla v10.4s, v17.4s, v8.4s\n"
-    "ldr q16, [x23, %[input_col_stride1]]\n"
-    "fmla v12.4s, v20.4s, v5.4s\n"
-    "ldr q18, [%[inptr0], x19]\n"
-    "ldr q14, [x25]\n"
-    "ldr q15, [x24, %[input_col_stride1]]\n"
-    "fmla v12.4s, v13.4s, v7.4s\n"
-    "fmla v12.4s, v17.4s, v2.4s\n"
-    "fmla v12.4s, v16.4s, v4.4s\n"
-    "fmla v12.4s, v18.4s, v6.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v11.4s, v18.4s, v8.4s\n"
-    "ldr q19, [x23, x19]\n"
-    "fmla v10.4s, v14.4s, v5.4s\n"
-    "ldr q20, [%[inptr0], x20]\n"
-    "fmla v12.4s, v15.4s, v1.4s\n"
-    "ldr q14, [x26]\n"
-    "fmla v11.4s, v19.4s, v5.4s\n"
-    "ldr q13, [x25, %[input_col_stride1]]\n"
-    "fmla v10.4s, v15.4s, v7.4s\n"
-    "ldr q17, [x24, x19]\n"
-    "fmla v12.4s, v19.4s, v3.4s\n"
-    "ldr q19, [x23, x20]\n"
-    "fmla v11.4s, v20.4s, v7.4s\n"
-    "ldr q18, [%[inptr0], x21]\n"
-    "fmla v10.4s, v14.4s, v2.4s\n"
-    "ldr q16, [x26, %[input_col_stride1]]\n"
-    "fmla v12.4s, v17.4s, v0.4s\n"
-    "ldr q14, [x25, x19]\n"
-    "fmla v11.4s, v17.4s, v2.4s\n"
-    "ldr q15, [x24, x20]\n"
-    "fmla v10.4s, v13.4s, v4.4s\n"
-    "ldr q13, [x23, x21]\n"
-    "str q12, [%[outptr0]]\n"
-    "fmla v9.4s, v17.4s, v8.4s\n"
-    "fmla v11.4s, v19.4s, v4.4s\n"
-    "ldr q12, [x26, x19]\n"
-    "fmla v10.4s, v17.4s, v6.4s\n"
-    "ldr q20, [x25, x20]\n"
-    "fmla v9.4s, v14.4s, v5.4s\n"
-    "ldr q17, [x24, x21]\n"
-    "fmla v11.4s, v18.4s, v6.4s\n"
-    "ldr q19, [x26, x20]\n"
-    "fmla v10.4s, v16.4s, v1.4s\n"
-    "ldr q18, [x25, x21]\n"
-    "fmla v9.4s, v15.4s, v7.4s\n"
-    "ldr q16, [x26, x21]\n"
-    "fmla v11.4s, v15.4s, v1.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v10.4s, v14.4s, v3.4s\n"
-    "ldr q14, [%[wbptr]]\n"
-    "fmla v9.4s, v12.4s, v2.4s\n"
-    "ldr q8, [%[wbptr], #16]\n"
-    "fmla v11.4s, v13.4s, v3.4s\n"
-    "ldr q7, [%[wbptr], #32]\n"
-    "fmla v10.4s, v12.4s, v0.4s\n"
-    "ldr q5, [%[wbptr], #64]\n"
-    "fmla v9.4s, v20.4s, v4.4s\n"
-    "ldr q2, [%[wbptr], #112]\n"
-    "fmla v11.4s, v17.4s, v0.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "str q10, [x22]\n"
-    "mov v12.16b, v14.16b\n"
-    "fmla v9.4s, v17.4s, v6.4s\n"
-    "ldr q4, [%[wbptr], #80]\n"
-    "str q11, [%[outptr0], %[output_col_stride1]]\n"
-    "mov v10.16b, v14.16b\n"
-    "mov v11.16b, v14.16b\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v9.4s, v19.4s, v1.4s\n"
-    "ldr q6, [%[wbptr], #48]\n"
-    "ldr q15, [%[inptr0]]\n"
-    "add x23, x23, #16\n"
-    "fmla v12.4s, v15.4s, v8.4s\n"
-    "ldr q20, [x23]\n"
-    "fmla v9.4s, v18.4s, v3.4s\n"
-    "ldr q1, [%[wbptr], #128]\n"
-    "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
-    "add x24, x24, #16\n"
-    "fmla v12.4s, v20.4s, v5.4s\n"
-    "ldr q17, [x24]\n"
-    "fmla v9.4s, v16.4s, v0.4s\n"
-    "ldr q3, [%[wbptr], #96]\n"
-    "fmla v10.4s, v17.4s, v8.4s\n"
-    "ldr q16, [x23, %[input_col_stride1]]\n"
-    "fmla v12.4s, v13.4s, v7.4s\n"
-    "ldr q18, [%[inptr0], x19]\n"
-    "str q9, [x22, %[output_col_stride1]]\n"
-    "add x25, x25, #16\n"
-    "mov v9.16b, v14.16b\n"
-    "ldr q0, [%[wbptr], #144]\n"
-    "fmla v12.4s, v17.4s, v2.4s\n"
-    "ldr q14, [x25]\n"
-    "ldr q15, [x24, %[input_col_stride1]]\n"
-    "add x26, x26, #16\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "add x22, x22, #16\n"
-    "subs x28, x28, #1\n"
-    "fmla v12.4s, v16.4s, v4.4s\n"
-    "fmla v12.4s, v18.4s, v6.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v11.4s, v18.4s, v8.4s\n"
-    "ldr q19, [x23, x19]\n"
-    "fmla v10.4s, v14.4s, v5.4s\n"
-    "ldr q20, [%[inptr0], x20]\n"
-    "fmla v12.4s, v15.4s, v1.4s\n"
-    "ldr q14, [x26]\n"
-    "fmla v11.4s, v19.4s, v5.4s\n"
-    "ldr q13, [x25, %[input_col_stride1]]\n"
-    "fmla v10.4s, v15.4s, v7.4s\n"
-    "ldr q17, [x24, x19]\n"
-    "fmla v12.4s, v19.4s, v3.4s\n"
-    "ldr q19, [x23, x20]\n"
-    "fmla v11.4s, v20.4s, v7.4s\n"
-    "ldr q18, [%[inptr0], x21]\n"
-    "fmla v10.4s, v14.4s, v2.4s\n"
-    "ldr q16, [x26, %[input_col_stride1]]\n"
-    "fmla v12.4s, v17.4s, v0.4s\n"
-    "ldr q14, [x25, x19]\n"
-    "fmla v11.4s, v17.4s, v2.4s\n"
-    "ldr q15, [x24, x20]\n"
-    "fmla v10.4s, v13.4s, v4.4s\n"
-    "ldr q13, [x23, x21]\n"
-    "str q12, [%[outptr0]]\n"
-    "fmla v9.4s, v17.4s, v8.4s\n"
-    "fmla v11.4s, v19.4s, v4.4s\n"
-    "ldr q12, [x26, x19]\n"
-    "fmla v10.4s, v17.4s, v6.4s\n"
-    "ldr q20, [x25, x20]\n"
-    "fmla v9.4s, v14.4s, v5.4s\n"
-    "ldr q17, [x24, x21]\n"
-    "fmla v11.4s, v18.4s, v6.4s\n"
-    "ldr q19, [x26, x20]\n"
-    "fmla v10.4s, v16.4s, v1.4s\n"
-    "ldr q18, [x25, x21]\n"
-    "fmla v9.4s, v15.4s, v7.4s\n"
-    "ldr q16, [x26, x21]\n"
-    "fmla v11.4s, v15.4s, v1.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v10.4s, v14.4s, v3.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v9.4s, v12.4s, v2.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v11.4s, v13.4s, v3.4s\n"
-    "add x23, x23, #16\n"
-    "fmla v10.4s, v12.4s, v0.4s\n"
-    "add x24, x24, #16\n"
-    "fmla v9.4s, v20.4s, v4.4s\n"
-    "add x25, x25, #16\n"
-    "fmla v11.4s, v17.4s, v0.4s\n"
-    "add x26, x26, #16\n"
-    "str q10, [x22]\n"
-    "fmla v9.4s, v17.4s, v6.4s\n"
-    "str q11, [%[outptr0], %[output_col_stride1]]\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v9.4s, v19.4s, v1.4s\n"
-    "fmla v9.4s, v18.4s, v3.4s\n"
-    "fmla v9.4s, v16.4s, v0.4s\n"
-    "str q9, [x22, %[output_col_stride1]]\n"
-    "add x22, x22, #16\n"
-    "4:\n"
-    "cbz x27, 7f\n"
-    "ldr s14, [%[wbptr]]\n"
-    "mov v12.16b, v14.16b\n"
-    "ldr s8, [%[wbptr], #4]\n"
-    "mov v10.16b, v14.16b\n"
-    "ldr s7, [%[wbptr], #8]\n"
-    "mov v11.16b, v14.16b\n"
-    "ldr s6, [%[wbptr], #12]\n"
-    "mov v9.16b, v14.16b\n"
-    "ldr s5, [%[wbptr], #16]\n"
-    "ldr s4, [%[wbptr], #20]\n"
-    "subs x27, x27, #1\n"
-    "ldr s3, [%[wbptr], #24]\n"
-    "ldr s2, [%[wbptr], #28]\n"
-    "ldr s1, [%[wbptr], #32]\n"
-    "ldr s0, [%[wbptr], #36]\n"
-    "ldr s15, [%[inptr0]]\n"
-    "ldr s20, [x23]\n"
-    "fmla v12.4s, v15.4s, v8.4s\n"
-    "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr s17, [x24]\n"
-    "ldr s16, [x23, %[input_col_stride1]]\n"
-    "fmla v10.4s, v17.4s, v8.4s\n"
-    "ldr s18, [%[inptr0], x19]\n"
-    "fmla v12.4s, v20.4s, v5.4s\n"
-    "ldr s14, [x25]\n"
-    "ldr s15, [x24, %[input_col_stride1]]\n"
-    "fmla v12.4s, v13.4s, v7.4s\n"
-    "fmla v12.4s, v17.4s, v2.4s\n"
-    "fmla v12.4s, v16.4s, v4.4s\n"
-    "fmla v12.4s, v18.4s, v6.4s\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v11.4s, v18.4s, v8.4s\n"
-    "ldr s19, [x23, x19]\n"
-    "fmla v10.4s, v14.4s, v5.4s\n"
-    "ldr s20, [%[inptr0], x20]\n"
-    "fmla v12.4s, v15.4s, v1.4s\n"
-    "ldr s14, [x26]\n"
-    "fmla v11.4s, v19.4s, v5.4s\n"
-    "ldr s13, [x25, %[input_col_stride1]]\n"
-    "fmla v10.4s, v15.4s, v7.4s\n"
-    "ldr s17, [x24, x19]\n"
-    "fmla v12.4s, v19.4s, v3.4s\n"
-    "ldr s19, [x23, x20]\n"
-    "fmla v11.4s, v20.4s, v7.4s\n"
-    "ldr s18, [%[inptr0], x21]\n"
-    "fmla v10.4s, v14.4s, v2.4s\n"
-    "ldr s16, [x26, %[input_col_stride1]]\n"
-    "fmla v12.4s, v17.4s, v0.4s\n"
-    "ldr s14, [x25, x19]\n"
-    "fmla v11.4s, v17.4s, v2.4s\n"
-    "ldr s15, [x24, x20]\n"
-    "fmla v10.4s, v13.4s, v4.4s\n"
-    "ldr s13, [x23, x21]\n"
-    "str s12, [%[outptr0]]\n"
-    "fmla v9.4s, v17.4s, v8.4s\n"
-    "fmla v11.4s, v19.4s, v4.4s\n"
-    "ldr s12, [x26, x19]\n"
-    "fmla v10.4s, v17.4s, v6.4s\n"
-    "ldr s20, [x25, x20]\n"
-    "fmla v9.4s, v14.4s, v5.4s\n"
-    "ldr s17, [x24, x21]\n"
-    "fmla v11.4s, v18.4s, v6.4s\n"
-    "ldr s19, [x26, x20]\n"
-    "fmla v10.4s, v16.4s, v1.4s\n"
-    "ldr s18, [x25, x21]\n"
-    "fmla v9.4s, v15.4s, v7.4s\n"
-    "ldr s16, [x26, x21]\n"
-    "fmla v11.4s, v15.4s, v1.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v10.4s, v14.4s, v3.4s\n"
-    "ldr s14, [%[wbptr]]\n"
-    "fmla v9.4s, v12.4s, v2.4s\n"
-    "ldr s8, [%[wbptr], #4]\n"
-    "fmla v11.4s, v13.4s, v3.4s\n"
-    "ldr s7, [%[wbptr], #8]\n"
-    "fmla v10.4s, v12.4s, v0.4s\n"
-    "ldr s5, [%[wbptr], #16]\n"
-    "fmla v9.4s, v20.4s, v4.4s\n"
-    "ldr s2, [%[wbptr], #28]\n"
-    "fmla v11.4s, v17.4s, v0.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "str s10, [x22]\n"
-    "mov v12.16b, v14.16b\n"
-    "fmla v9.4s, v17.4s, v6.4s\n"
-    "ldr s4, [%[wbptr], #20]\n"
-    "str s11, [%[outptr0], %[output_col_stride1]]\n"
-    "mov v10.16b, v14.16b\n"
-    "mov v11.16b, v14.16b\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v9.4s, v19.4s, v1.4s\n"
-    "ldr s6, [%[wbptr], #12]\n"
-    "ldr s15, [%[inptr0]]\n"
-    "add x23, x23, #4\n"
-    "fmla v12.4s, v15.4s, v8.4s\n"
-    "ldr s20, [x23]\n"
-    "fmla v9.4s, v18.4s, v3.4s\n"
-    "ldr s1, [%[wbptr], #32]\n"
-    "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
-    "add x24, x24, #4\n"
-    "fmla v12.4s, v20.4s, v5.4s\n"
-    "ldr s17, [x24]\n"
-    "fmla v9.4s, v16.4s, v0.4s\n"
-    "ldr s3, [%[wbptr], #24]\n"
-    "fmla v10.4s, v17.4s, v8.4s\n"
-    "ldr s16, [x23, %[input_col_stride1]]\n"
-    "fmla v12.4s, v13.4s, v7.4s\n"
-    "ldr s18, [%[inptr0], x19]\n"
-    "str s9, [x22, %[output_col_stride1]]\n"
-    "add x25, x25, #4\n"
-    "mov v9.16b, v14.16b\n"
-    "ldr s0, [%[wbptr], #36]\n"
-    "fmla v12.4s, v17.4s, v2.4s\n"
-    "ldr s14, [x25]\n"
-    "ldr s15, [x24, %[input_col_stride1]]\n"
-    "add x26, x26, #4\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "add x22, x22, #4\n"
-    "subs x27, x27, #1\n"
-    "fmla v12.4s, v16.4s, v4.4s\n"
-    "fmla v12.4s, v18.4s, v6.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v11.4s, v18.4s, v8.4s\n"
-    "ldr s19, [x23, x19]\n"
-    "fmla v10.4s, v14.4s, v5.4s\n"
-    "ldr s20, [%[inptr0], x20]\n"
-    "fmla v12.4s, v15.4s, v1.4s\n"
-    "ldr s14, [x26]\n"
-    "fmla v11.4s, v19.4s, v5.4s\n"
-    "ldr s13, [x25, %[input_col_stride1]]\n"
-    "fmla v10.4s, v15.4s, v7.4s\n"
-    "ldr s17, [x24, x19]\n"
-    "fmla v12.4s, v19.4s, v3.4s\n"
-    "ldr s19, [x23, x20]\n"
-    "fmla v11.4s, v20.4s, v7.4s\n"
-    "ldr s18, [%[inptr0], x21]\n"
-    "fmla v10.4s, v14.4s, v2.4s\n"
-    "ldr s16, [x26, %[input_col_stride1]]\n"
-    "fmla v12.4s, v17.4s, v0.4s\n"
-    "ldr s14, [x25, x19]\n"
-    "fmla v11.4s, v17.4s, v2.4s\n"
-    "ldr s15, [x24, x20]\n"
-    "fmla v10.4s, v13.4s, v4.4s\n"
-    "ldr s13, [x23, x21]\n"
-    "str s12, [%[outptr0]]\n"
-    "fmla v9.4s, v17.4s, v8.4s\n"
-    "fmla v11.4s, v19.4s, v4.4s\n"
-    "ldr s12, [x26, x19]\n"
-    "fmla v10.4s, v17.4s, v6.4s\n"
-    "ldr s20, [x25, x20]\n"
-    "fmla v9.4s, v14.4s, v5.4s\n"
-    "ldr s17, [x24, x21]\n"
-    "fmla v11.4s, v18.4s, v6.4s\n"
-    "ldr s19, [x26, x20]\n"
-    "fmla v10.4s, v16.4s, v1.4s\n"
-    "ldr s18, [x25, x21]\n"
-    "fmla v9.4s, v15.4s, v7.4s\n"
-    "ldr s16, [x26, x21]\n"
-    "fmla v11.4s, v15.4s, v1.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v10.4s, v14.4s, v3.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v9.4s, v12.4s, v2.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v11.4s, v13.4s, v3.4s\n"
-    "add x23, x23, #4\n"
-    "fmla v10.4s, v12.4s, v0.4s\n"
-    "add x24, x24, #4\n"
-    "fmla v9.4s, v20.4s, v4.4s\n"
-    "add x25, x25, #4\n"
-    "fmla v11.4s, v17.4s, v0.4s\n"
-    "add x26, x26, #4\n"
-    "str s10, [x22]\n"
-    "fmla v9.4s, v17.4s, v6.4s\n"
-    "str s11, [%[outptr0], %[output_col_stride1]]\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v9.4s, v19.4s, v1.4s\n"
-    "fmla v9.4s, v18.4s, v3.4s\n"
-    "fmla v9.4s, v16.4s, v0.4s\n"
-    "str s9, [x22, %[output_col_stride1]]\n"
-    "add x22, x22, #4\n"
-    "7:\n"
-    : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
-    : [n_channels] "r" ((long) n_channels), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
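
[Editor's note] This stride-2 tile reads a 5x5 input window per 2x2 output tile, which is why five row pointers (inptr0, x23, x24, x25, x26) and five column offsets (0, %[input_col_stride1], x19, x20, x21) are set up above. Per channel and output element, in the same sketch notation as the stride-1 note earlier (illustrative names, element-unit strides, assumed weight order):

    // Stride-2 indexing: output (oi, oj) consumes input rows/cols 2*oi+ki, 2*oj+kj.
    float acc = wb[0];
    for (int ki = 0; ki < 3; ki++)
      for (int kj = 0; kj < 3; kj++)
        acc += wb[1 + 3*ki + kj]
             * input[(2*oi + ki)*in_row_stride + (2*oj + kj)*in_col_stride];
    output[oi*out_row_stride + oj*out_col_stride] = acc;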
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-  float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
-)
-{
-  __asm __volatile(
-    "mov x23, xzr\n"
-    "mov x24, xzr\n"
-    "and x25, %[n_channels], #3\n"
-    "lsr x26, %[n_channels], #2\n"
-    "cbz x26, 4f\n"
-    "1:\n"
-    "ldr q13, [%[wbptr]]\n"
-    "ldr x19, [%[inptrs], 0]\n"
-    "mov v10.16b, v13.16b\n"
-    "ldr q12, [%[wbptr], #16]\n"
-    "mov v8.16b, v13.16b\n"
-    "ldr q6, [%[wbptr], #32]\n"
-    "mov v9.16b, v13.16b\n"
-    "ldr q5, [%[wbptr], #48]\n"
-    "mov v7.16b, v13.16b\n"
-    "ldr q11, [%[wbptr], #64]\n"
-    "ldr q4, [%[wbptr], #80]\n"
-    "ldr x20, [%[inptrs], 40]\n"
-    "ldr q3, [%[wbptr], #96]\n"
-    "ldr x21, [%[inptrs], 80]\n"
-    "ldr q2, [%[wbptr], #112]\n"
-    "ldr x27, [%[inptrs], 120]\n"
-    "ldr q1, [%[wbptr], #128]\n"
-    "subs x26, x26, #1\n"
-    "ldr q0, [%[wbptr], #144]\n"
-    "ldr q14, [x19, x23]\n"
-    "fmla v10.4s, v14.4s, v12.4s\n"
-    "ldr q18, [x20, x23]\n"
-    "ldr q14, [x21, x23]\n"
-    "ldr x19, [%[inptrs], 8]\n"
-    "ldr q16, [x27, x23]\n"
-    "ldr x20, [%[inptrs], 48]\n"
-    "ldr q19, [x19, x23]\n"
-    "ldr x21, [%[inptrs], 88]\n"
-    "fmla v10.4s, v18.4s, v11.4s\n"
-    "ldr q15, [x20, x23]\n"
-    "ldr q18, [x21, x23]\n"
-    "ldr x19, [%[inptrs], 16]\n"
-    "ldr q13, [x19, x23]\n"
-    "fmla v10.4s, v19.4s, v6.4s\n"
-    "fmla v10.4s, v14.4s, v2.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v8.4s, v14.4s, v12.4s\n"
-    "ldr x20, [%[inptrs], 56]\n"
-    "fmla v10.4s, v15.4s, v4.4s\n"
-    "ldr x19, [%[inptrs], 24]\n"
-    "fmla v9.4s, v13.4s, v12.4s\n"
-    "ldr q14, [x20, x23]\n"
-    "ldr q17, [x19, x23]\n"
-    "ldr x22, [%[inptrs], 160]\n"
-    "fmla v8.4s, v16.4s, v11.4s\n"
-    "ldr x27, [%[inptrs], 128]\n"
-    "fmla v10.4s, v13.4s, v5.4s\n"
-    "ldr q15, [x22, x23]\n"
-    "fmla v9.4s, v14.4s, v11.4s\n"
-    "ldr q19, [x27, x23]\n"
-    "ldr x21, [%[inptrs], 96]\n"
-    "ldr x20, [%[inptrs], 64]\n"
-    "ldr x19, [%[inptrs], 32]\n"
-    "fmla v8.4s, v18.4s, v6.4s\n"
-    "ldr x22, [%[inptrs], 168]\n"
-    "fmla v10.4s, v18.4s, v1.4s\n"
-    "ldr q13, [x21, x23]\n"
-    "fmla v9.4s, v17.4s, v6.4s\n"
-    "ldr q18, [x20, x23]\n"
-    "fmla v7.4s, v13.4s, v12.4s\n"
-    "ldr q17, [x19, x23]\n"
-    "fmla v8.4s, v15.4s, v2.4s\n"
-    "ldr q15, [x22, x23]\n"
-    "fmla v10.4s, v14.4s, v3.4s\n"
-    "ldr x27, [%[inptrs], 136]\n"
-    "fmla v9.4s, v13.4s, v2.4s\n"
-    "ldr x21, [%[inptrs], 104]\n"
-    "ldr q16, [x27, x23]\n"
-    "ldr x20, [%[inptrs], 72]\n"
-    "fmla v8.4s, v19.4s, v4.4s\n"
-    "ldr q19, [x21, x23]\n"
-    "fmla v10.4s, v13.4s, v0.4s\n"
-    "ldr q12, [x20, x23]\n"
-    "fmla v9.4s, v18.4s, v4.4s\n"
-    "ldr x22, [%[inptrs], 176]\n"
-    "fmla v7.4s, v16.4s, v11.4s\n"
-    "ldr x27, [%[inptrs], 144]\n"
-    "fmla v8.4s, v13.4s, v5.4s\n"
-    "ldr q11, [x22, x23]\n"
-    "ldr q13, [x27, x23]\n"
-    "ldr x21, [%[inptrs], 112]\n"
-    "fmla v9.4s, v17.4s, v5.4s\n"
-    "ldr x22, [%[inptrs], 184]\n"
-    "fmla v7.4s, v19.4s, v6.4s\n"
-    "ldr q14, [x21, x23]\n"
-    "fmla v8.4s, v15.4s, v1.4s\n"
-    "ldr q17, [x22, x23]\n"
-    "ldr x27, [%[inptrs], 152]\n"
-    "ldr x22, [%[inptrs], 192]\n"
-    "ldr x21, [%[outptrs], 0]\n"
-    "fmla v9.4s, v19.4s, v1.4s\n"
-    "ldr x28, [%[outptrs], 16]\n"
-    "str q10, [x21, x24]\n"
-    "fmla v7.4s, v11.4s, v2.4s\n"
-    "fmla v8.4s, v16.4s, v3.4s\n"
-    "ldr q16, [x27, x23]\n"
-    "ldr q15, [x22, x23]\n"
-    "ldr x21, [%[outptrs], 8]\n"
-    "fmla v9.4s, v12.4s, v3.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v7.4s, v13.4s, v4.4s\n"
-    "ldr q13, [%[wbptr]]\n"
-    "fmla v8.4s, v11.4s, v0.4s\n"
-    "ldr q12, [%[wbptr], #16]\n"
-    "mov v10.16b, v13.16b\n"
-    "ldr q6, [%[wbptr], #32]\n"
-    "fmla v9.4s, v14.4s, v0.4s\n"
-    "ldr q11, [%[wbptr], #64]\n"
-    "fmla v7.4s, v14.4s, v5.4s\n"
-    "ldr q4, [%[wbptr], #80]\n"
-    "str q8, [x28, x24]\n"
-    "add x23, x23, #16\n"
-    "mov v8.16b, v13.16b\n"
-    "ldr q2, [%[wbptr], #112]\n"
-    "str q9, [x21, x24]\n"
-    "ldr x28, [%[outptrs], 24]\n"
-    "fmla v7.4s, v17.4s, v1.4s\n"
-    "ldr q5, [%[wbptr], #48]\n"
-    "mov v9.16b, v13.16b\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "ldr x19, [%[inptrs], 0]\n"
-    "ldr x20, [%[inptrs], 40]\n"
-    "ldr x21, [%[inptrs], 80]\n"
-    "ldr x27, [%[inptrs], 120]\n"
-    "subs x26, x26, #1\n"
-    "fmla v7.4s, v16.4s, v3.4s\n"
-    "ldr q1, [%[wbptr], #128]\n"
-    "ldr q14, [x19, x23]\n"
-    "fmla v10.4s, v14.4s, v12.4s\n"
-    "ldr q18, [x20, x23]\n"
-    "ldr q14, [x21, x23]\n"
-    "ldr x19, [%[inptrs], 8]\n"
-    "fmla v7.4s, v15.4s, v0.4s\n"
-    "ldr q3, [%[wbptr], #96]\n"
-    "ldr q19, [x19, x23]\n"
-    "ldr x20, [%[inptrs], 48]\n"
-    "fmla v10.4s, v18.4s, v11.4s\n"
-    "ldr q16, [x27, x23]\n"
-    "ldr q15, [x20, x23]\n"
-    "ldr x19, [%[inptrs], 16]\n"
-    "str q7, [x28, x24]\n"
-    "ldr x21, [%[inptrs], 88]\n"
-    "mov v7.16b, v13.16b\n"
-    "ldr q0, [%[wbptr], #144]\n"
-    "fmla v10.4s, v19.4s, v6.4s\n"
-    "ldr q13, [x19, x23]\n"
-    "ldr q18, [x21, x23]\n"
-    "add x24, x24, #16\n"
-    "fmla v10.4s, v14.4s, v2.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v8.4s, v14.4s, v12.4s\n"
-    "ldr x20, [%[inptrs], 56]\n"
-    "fmla v10.4s, v15.4s, v4.4s\n"
-    "ldr x19, [%[inptrs], 24]\n"
-    "fmla v9.4s, v13.4s, v12.4s\n"
-    "ldr q14, [x20, x23]\n"
-    "ldr q17, [x19, x23]\n"
-    "ldr x22, [%[inptrs], 160]\n"
-    "fmla v8.4s, v16.4s, v11.4s\n"
-    "ldr x27, [%[inptrs], 128]\n"
-    "fmla v10.4s, v13.4s, v5.4s\n"
-    "ldr q15, [x22, x23]\n"
-    "fmla v9.4s, v14.4s, v11.4s\n"
-    "ldr q19, [x27, x23]\n"
-    "ldr x21, [%[inptrs], 96]\n"
-    "ldr x20, [%[inptrs], 64]\n"
-    "ldr x19, [%[inptrs], 32]\n"
-    "fmla v8.4s, v18.4s, v6.4s\n"
-    "ldr x22, [%[inptrs], 168]\n"
-    "fmla v10.4s, v18.4s, v1.4s\n"
-    "ldr q13, [x21, x23]\n"
-    "fmla v9.4s, v17.4s, v6.4s\n"
-    "ldr q18, [x20, x23]\n"
-    "fmla v7.4s, v13.4s, v12.4s\n"
-    "ldr q17, [x19, x23]\n"
-    "fmla v8.4s, v15.4s, v2.4s\n"
-    "ldr q15, [x22, x23]\n"
-    "fmla v10.4s, v14.4s, v3.4s\n"
-    "ldr x27, [%[inptrs], 136]\n"
-    "fmla v9.4s, v13.4s, v2.4s\n"
-    "ldr x21, [%[inptrs], 104]\n"
-    "ldr q16, [x27, x23]\n"
-    "ldr x20, [%[inptrs], 72]\n"
-    "fmla v8.4s, v19.4s, v4.4s\n"
-    "ldr q19, [x21, x23]\n"
-    "fmla v10.4s, v13.4s, v0.4s\n"
-    "ldr q12, [x20, x23]\n"
-    "fmla v9.4s, v18.4s, v4.4s\n"
-    "ldr x22, [%[inptrs], 176]\n"
-    "fmla v7.4s, v16.4s, v11.4s\n"
-    "ldr x27, [%[inptrs], 144]\n"
-    "fmla v8.4s, v13.4s, v5.4s\n"
-    "ldr q11, [x22, x23]\n"
-    "ldr q13, [x27, x23]\n"
-    "ldr x21, [%[inptrs], 112]\n"
-    "fmla v9.4s, v17.4s, v5.4s\n"
-    "ldr x22, [%[inptrs], 184]\n"
-    "fmla v7.4s, v19.4s, v6.4s\n"
-    "ldr q14, [x21, x23]\n"
-    "fmla v8.4s, v15.4s, v1.4s\n"
-    "ldr q17, [x22, x23]\n"
-    "ldr x27, [%[inptrs], 152]\n"
-    "ldr x22, [%[inptrs], 192]\n"
-    "ldr x21, [%[outptrs], 0]\n"
-    "fmla v9.4s, v19.4s, v1.4s\n"
-    "ldr x28, [%[outptrs], 16]\n"
-    "str q10, [x21, x24]\n"
-    "fmla v7.4s, v11.4s, v2.4s\n"
-    "fmla v8.4s, v16.4s, v3.4s\n"
-    "ldr q16, [x27, x23]\n"
-    "ldr q15, [x22, x23]\n"
-    "ldr x21, [%[outptrs], 8]\n"
-    "fmla v9.4s, v12.4s, v3.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v7.4s, v13.4s, v4.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v8.4s, v11.4s, v0.4s\n"
-    "add x23, x23, #16\n"
-    "fmla v9.4s, v14.4s, v0.4s\n"
-    "fmla v7.4s, v14.4s, v5.4s\n"
-    "str q8, [x28, x24]\n"
-    "ldr x28, [%[outptrs], 24]\n"
-    "str q9, [x21, x24]\n"
-    "fmla v7.4s, v17.4s, v1.4s\n"
-    "fmla v7.4s, v16.4s, v3.4s\n"
-    "fmla v7.4s, v15.4s, v0.4s\n"
-    "str q7, [x28, x24]\n"
-    "add x24, x24, #16\n"
-    "4:\n"
-    "cbz x25, 7f\n"
-    "ldr s13, [%[wbptr]]\n"
-    "mov v10.16b, v13.16b\n"
-    "ldr s12, [%[wbptr], #4]\n"
-    "mov v8.16b, v13.16b\n"
-    "ldr s6, [%[wbptr], #8]\n"
-    "mov v9.16b, v13.16b\n"
-    "ldr s5, [%[wbptr], #12]\n"
-    "mov v7.16b, v13.16b\n"
-    "ldr s11, [%[wbptr], #16]\n"
-    "ldr s4, [%[wbptr], #20]\n"
-    "ldr x19, [%[inptrs], 0]\n"
-    "ldr s3, [%[wbptr], #24]\n"
-    "ldr x20, [%[inptrs], 40]\n"
-    "ldr s2, [%[wbptr], #28]\n"
-    "ldr x21, [%[inptrs], 80]\n"
-    "ldr s1, [%[wbptr], #32]\n"
-    "ldr x27, [%[inptrs], 120]\n"
-    "ldr s0, [%[wbptr], #36]\n"
-    "subs x25, x25, #1\n"
-    "ldr s14, [x19, x23]\n"
-    "ldr s18, [x20, x23]\n"
-    "fmla v10.4s, v14.4s, v12.4s\n"
-    "ldr s14, [x21, x23]\n"
-    "ldr s16, [x27, x23]\n"
-    "ldr x19, [%[inptrs], 8]\n"
-    "ldr x20, [%[inptrs], 48]\n"
-    "ldr x21, [%[inptrs], 88]\n"
-    "ldr s19, [x19, x23]\n"
-    "fmla v10.4s, v18.4s, v11.4s\n"
-    "ldr s15, [x20, x23]\n"
-    "ldr s18, [x21, x23]\n"
-    "ldr x19, [%[inptrs], 16]\n"
-    "ldr s13, [x19, x23]\n"
-    "fmla v10.4s, v19.4s, v6.4s\n"
-    "fmla v10.4s, v14.4s, v2.4s\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v8.4s, v14.4s, v12.4s\n"
-    "ldr x20, [%[inptrs], 56]\n"
-    "fmla v10.4s, v15.4s, v4.4s\n"
-    "ldr x19, [%[inptrs], 24]\n"
-    "fmla v9.4s, v13.4s, v12.4s\n"
-    "ldr s14, [x20, x23]\n"
-    "ldr s17, [x19, x23]\n"
-    "ldr x22, [%[inptrs], 160]\n"
-    "fmla v8.4s, v16.4s, v11.4s\n"
-    "ldr x27, [%[inptrs], 128]\n"
-    "fmla v10.4s, v13.4s, v5.4s\n"
-    "ldr s15, [x22, x23]\n"
-    "fmla v9.4s, v14.4s, v11.4s\n"
-    "ldr s19, [x27, x23]\n"
-    "ldr x21, [%[inptrs], 96]\n"
-    "ldr x20, [%[inptrs], 64]\n"
-    "ldr x19, [%[inptrs], 32]\n"
-    "fmla v8.4s, v18.4s, v6.4s\n"
-    "ldr x22, [%[inptrs], 168]\n"
-    "fmla v10.4s, v18.4s, v1.4s\n"
-    "ldr s13, [x21, x23]\n"
-    "fmla v9.4s, v17.4s, v6.4s\n"
-    "ldr s18, [x20, x23]\n"
-    "fmla v7.4s, v13.4s, v12.4s\n"
-    "ldr s17, [x19, x23]\n"
-    "fmla v8.4s, v15.4s, v2.4s\n"
-    "ldr s15, [x22, x23]\n"
-    "fmla v10.4s, v14.4s, v3.4s\n"
-    "ldr x27, [%[inptrs], 136]\n"
-    "fmla v9.4s, v13.4s, v2.4s\n"
-    "ldr x21, [%[inptrs], 104]\n"
-    "ldr s16, [x27, x23]\n"
-    "ldr x20, [%[inptrs], 72]\n"
-    "fmla v8.4s, v19.4s, v4.4s\n"
-    "ldr s19, [x21, x23]\n"
-    "fmla v10.4s, v13.4s, v0.4s\n"
-    "ldr s12, [x20, x23]\n"
-    "fmla v9.4s, v18.4s, v4.4s\n"
-    "ldr x22, [%[inptrs], 176]\n"
-    "fmla v7.4s, v16.4s, v11.4s\n"
-    "ldr x27, [%[inptrs], 144]\n"
-    "fmla v8.4s, v13.4s, v5.4s\n"
-    "ldr s11, [x22, x23]\n"
-    "ldr s13, [x27, x23]\n"
-    "ldr x21, [%[inptrs], 112]\n"
-    "fmla v9.4s, v17.4s, v5.4s\n"
-    "ldr x22, [%[inptrs], 184]\n"
-    "fmla v7.4s, v19.4s, v6.4s\n"
-    "ldr s14, [x21, x23]\n"
-    "fmla v8.4s, v15.4s, v1.4s\n"
-    "ldr s17, [x22, x23]\n"
-    "ldr x27, [%[inptrs], 152]\n"
-    "ldr x22, [%[inptrs], 192]\n"
-    "ldr x21, [%[outptrs], 0]\n"
-    "fmla v9.4s, v19.4s, v1.4s\n"
-    "ldr x28, [%[outptrs], 16]\n"
-    "str s10, [x21, x24]\n"
-    "fmla v7.4s, v11.4s, v2.4s\n"
-    "fmla v8.4s, v16.4s, v3.4s\n"
-    "ldr s16, [x27, x23]\n"
-    "ldr s15, [x22, x23]\n"
-    "ldr x21, [%[outptrs], 8]\n"
-    "fmla v9.4s, v12.4s, v3.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v7.4s, v13.4s, v4.4s\n"
-    "ldr s13, [%[wbptr]]\n"
-    "fmla v8.4s, v11.4s, v0.4s\n"
-    "ldr s12, [%[wbptr], #4]\n"
-    "mov v10.16b, v13.16b\n"
-    "ldr s6, [%[wbptr], #8]\n"
-    "fmla v9.4s, v14.4s, v0.4s\n"
-    "ldr s11, [%[wbptr], #16]\n"
-    "fmla v7.4s, v14.4s, v5.4s\n"
-    "ldr s4, [%[wbptr], #20]\n"
-    "str s8, [x28, x24]\n"
-    "add x23, x23, #4\n"
-    "mov v8.16b, v13.16b\n"
-    "ldr s2, [%[wbptr], #28]\n"
-    "str s9, [x21, x24]\n"
-    "ldr x28, [%[outptrs], 24]\n"
-    "fmla v7.4s, v17.4s, v1.4s\n"
-    "ldr s5, [%[wbptr], #12]\n"
-    "mov v9.16b, v13.16b\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "ldr x19, [%[inptrs], 0]\n"
-    "ldr x20, [%[inptrs], 40]\n"
-    "ldr x21, [%[inptrs], 80]\n"
-    "ldr x27, [%[inptrs], 120]\n"
-    "subs x25, x25, #1\n"
-    "fmla v7.4s, v16.4s, v3.4s\n"
-    "ldr s1, [%[wbptr], #32]\n"
-    "ldr s14, [x19, x23]\n"
-    "fmla v10.4s, v14.4s, v12.4s\n"
-    "ldr s18, [x20, x23]\n"
-    "ldr s14, [x21, x23]\n"
-    "ldr x19, [%[inptrs], 8]\n"
-    "fmla v7.4s, v15.4s, v0.4s\n"
-    "ldr s3, [%[wbptr], #24]\n"
-    "ldr s19, [x19, x23]\n"
-    "ldr x20, [%[inptrs], 48]\n"
-    "fmla v10.4s, v18.4s, v11.4s\n"
-    "ldr s16, [x27, x23]\n"
-    "ldr s15, [x20, x23]\n"
-    "ldr x19, [%[inptrs], 16]\n"
-    "str s7, [x28, x24]\n"
-    "ldr x21, [%[inptrs], 88]\n"
-    "mov v7.16b, v13.16b\n"
-    "ldr s0, [%[wbptr], #36]\n"
-    "fmla v10.4s, v19.4s, v6.4s\n"
-    "ldr s13, [x19, x23]\n"
-    "ldr s18, [x21, x23]\n"
-    "add x24, x24, #4\n"
-    "fmla v10.4s, v14.4s, v2.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v8.4s, v14.4s, v12.4s\n"
-    "ldr x20, [%[inptrs], 56]\n"
-    "fmla v10.4s, v15.4s, v4.4s\n"
-    "ldr x19, [%[inptrs], 24]\n"
-    "fmla v9.4s, v13.4s, v12.4s\n"
-    "ldr s14, [x20, x23]\n"
-    "ldr s17, [x19, x23]\n"
-    "ldr x22, [%[inptrs], 160]\n"
-    "fmla v8.4s, v16.4s, v11.4s\n"
-    "ldr x27, [%[inptrs], 128]\n"
-    "fmla v10.4s, v13.4s, v5.4s\n"
-    "ldr s15, [x22, x23]\n"
-    "fmla v9.4s, v14.4s, v11.4s\n"
-    "ldr s19, [x27, x23]\n"
-    "ldr x21, [%[inptrs], 96]\n"
-    "ldr x20, [%[inptrs], 64]\n"
-    "ldr x19, [%[inptrs], 32]\n"
-    "fmla v8.4s, v18.4s, v6.4s\n"
-    "ldr x22, [%[inptrs], 168]\n"
-    "fmla v10.4s, v18.4s, v1.4s\n"
-    "ldr s13, [x21, x23]\n"
-    "fmla v9.4s, v17.4s, v6.4s\n"
-    "ldr s18, [x20, x23]\n"
-    "fmla v7.4s, v13.4s, v12.4s\n"
-    "ldr s17, [x19, x23]\n"
-    "fmla v8.4s, v15.4s, v2.4s\n"
-    "ldr s15, [x22, x23]\n"
-    "fmla v10.4s, v14.4s, v3.4s\n"
-    "ldr x27, [%[inptrs], 136]\n"
-    "fmla v9.4s, v13.4s, v2.4s\n"
-    "ldr x21, [%[inptrs], 104]\n"
-    "ldr s16, [x27, x23]\n"
-    "ldr x20, [%[inptrs], 72]\n"
-    "fmla v8.4s, v19.4s, v4.4s\n"
-    "ldr s19, [x21, x23]\n"
-    "fmla v10.4s, v13.4s, v0.4s\n"
-    "ldr s12, [x20, x23]\n"
-    "fmla v9.4s, v18.4s, v4.4s\n"
-    "ldr x22, [%[inptrs], 176]\n"
-    "fmla v7.4s, v16.4s, v11.4s\n"
-    "ldr x27, [%[inptrs], 144]\n"
-    "fmla v8.4s, v13.4s, v5.4s\n"
-    "ldr s11, [x22, x23]\n"
-    "ldr s13, [x27, x23]\n"
-    "ldr x21, [%[inptrs], 112]\n"
-    "fmla v9.4s, v17.4s, v5.4s\n"
-    "ldr x22, [%[inptrs], 184]\n"
-    "fmla v7.4s, v19.4s, v6.4s\n"
-    "ldr s14, [x21, x23]\n"
-    "fmla v8.4s, v15.4s, v1.4s\n"
-    "ldr s17, [x22, x23]\n"
-    "ldr x27, [%[inptrs], 152]\n"
-    "ldr x22, [%[inptrs], 192]\n"
-    "ldr x21, [%[outptrs], 0]\n"
-    "fmla v9.4s, v19.4s, v1.4s\n"
-    "ldr x28, [%[outptrs], 16]\n"
-    "str s10, [x21, x24]\n"
-    "fmla v7.4s, v11.4s, v2.4s\n"
-    "fmla v8.4s, v16.4s, v3.4s\n"
-    "ldr s16, [x27, x23]\n"
-    "ldr s15, [x22, x23]\n"
-    "ldr x21, [%[outptrs], 8]\n"
-    "fmla v9.4s, v12.4s, v3.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v7.4s, v13.4s, v4.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v8.4s, v11.4s, v0.4s\n"
-    "add x23, x23, #4\n"
-    "fmla v9.4s, v14.4s, v0.4s\n"
-    "fmla v7.4s, v14.4s, v5.4s\n"
-    "str s8, [x28, x24]\n"
-    "ldr x28, [%[outptrs], 24]\n"
-    "str s9, [x21, x24]\n"
-    "fmla v7.4s, v17.4s, v1.4s\n"
-    "fmla v7.4s, v16.4s, v3.4s\n"
-    "fmla v7.4s, v15.4s, v0.4s\n"
-    "str s7, [x28, x24]\n"
-    "add x24, x24, #4\n"
-    "7:\n"
-    : [wbptr] "+r" (weight_bias_ptr)
-    : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x24, %[inptr0], %[input_row_stride]\n"
-    "add x27, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x19, %[outptr0], %[output_row_stride]\n"
-    "add x25, x24, %[input_row_stride]\n"
-    "add x23, x27, %[input_col_stride1]\n"
-    "and x20, %[n_channels], #3\n"
-    "add x28, x25, %[input_row_stride]\n"
-    "add x22, x23, %[input_col_stride1]\n"
-    "lsr x21, %[n_channels], #2\n"
-    "add x26, x28, %[input_row_stride]\n"
-    "cbz x21, 4f\n"
-    "1:\n"
-    "ldr q16, [%[wbptr]]\n"
-    "subs x21, x21, #1\n"
-    "mov v3.16b, v16.16b\n"
-    "ldr q4, [%[wbptr], #16]\n"
-    "mov v1.16b, v16.16b\n"
-    "ldr q5, [%[wbptr], #32]\n"
-    "mov v2.16b, v16.16b\n"
-    "ldr q12, [%[wbptr], #48]\n"
-    "mov v0.16b, v16.16b\n"
-    "ldr q11, [%[wbptr], #64]\n"
-    "ldr q10, [%[wbptr], #80]\n"
-    "ldr q6, [%[wbptr], #96]\n"
-    "ldr q9, [%[wbptr], #112]\n"
-    "ldr q8, [%[wbptr], #128]\n"
-    "ldr q7, [%[wbptr], #144]\n"
-    "ldr q21, [%[inptr0]]\n"
-    "fmla v3.4s, v21.4s, v4.4s\n"
-    "ldr q23, [x24]\n"
-    "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr q14, [x25]\n"
-    "fmla v1.4s, v14.4s, v4.4s\n"
-    "ldr q13, [x24, %[input_col_stride1]]\n"
-    "fmla v3.4s, v23.4s, v11.4s\n"
-    "ldr q18, [%[inptr0], x27]\n"
-    "ldr q15, [x28]\n"
-    "ldr q22, [x25, %[input_col_stride1]]\n"
-    "fmla v3.4s, v19.4s, v5.4s\n"
-    "fmla v3.4s, v14.4s, v9.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v3.4s, v13.4s, v10.4s\n"
-    "ldr q17, [x24, x27]\n"
-    "fmla v2.4s, v18.4s, v4.4s\n"
-    "ldr q20, [%[inptr0], x23]\n"
-    "fmla v1.4s, v15.4s, v11.4s\n"
-    "ldr q19, [x26]\n"
-    "fmla v3.4s, v18.4s, v12.4s\n"
-    "ldr q13, [x28, %[input_col_stride1]]\n"
-    "fmla v2.4s, v17.4s, v11.4s\n"
-    "ldr q14, [x25, x27]\n"
-    "fmla v1.4s, v22.4s, v5.4s\n"
-    "ldr q15, [x24, x23]\n"
-    "fmla v3.4s, v22.4s, v8.4s\n"
-    "ldr q16, [%[inptr0], x22]\n"
-    "fmla v2.4s, v20.4s, v5.4s\n"
-    "ldr q20, [x26, %[input_col_stride1]]\n"
-    "fmla v1.4s, v19.4s, v9.4s\n"
-    "ldr q19, [x28, x27]\n"
-    "fmla v3.4s, v17.4s, v6.4s\n"
-    "ldr q21, [x25, x23]\n"
-    "fmla v2.4s, v14.4s, v9.4s\n"
-    "ldr q22, [x24, x22]\n"
-    "fmla v1.4s, v13.4s, v10.4s\n"
-    "ldr q23, [x26, x27]\n"
-    "fmla v3.4s, v14.4s, v7.4s\n"
-    "ldr q18, [x28, x23]\n"
-    "fmla v0.4s, v14.4s, v4.4s\n"
-    "ldr q13, [x25, x22]\n"
-    "fmla v1.4s, v14.4s, v12.4s\n"
-    "ldr q14, [x26, x23]\n"
-    "fmla v2.4s, v15.4s, v10.4s\n"
-    "ldr q17, [x28, x22]\n"
-    "fmla v0.4s, v19.4s, v11.4s\n"
-    "ldr q15, [x26, x22]\n"
-    "fmla v1.4s, v20.4s, v8.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v2.4s, v16.4s, v12.4s\n"
-    "ldr q16, [%[wbptr]]\n"
-    "fmla v0.4s, v21.4s, v5.4s\n"
-    "ldr q4, [%[wbptr], #16]\n"
-    "fmla v1.4s, v19.4s, v6.4s\n"
-    "ldr q11, [%[wbptr], #64]\n"
-    "fmla v2.4s, v21.4s, v8.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v0.4s, v23.4s, v9.4s\n"
-    "ldr q5, [%[wbptr], #32]\n"
-    "fmla v1.4s, v23.4s, v7.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v2.4s, v22.4s, v6.4s\n"
-    "ldr q21, [%[inptr0]]\n"
-    "fmla v0.4s, v18.4s, v10.4s\n"
-    "ldr q9, [%[wbptr], #112]\n"
-    "movi v20.16b, #0\n"
-    "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v2.4s, v13.4s, v7.4s\n"
-    "ldr q18, [%[inptr0], x27]\n"
-    "fmla v0.4s, v13.4s, v12.4s\n"
-    "ldr q10, [%[wbptr], #80]\n"
-    "fmax v3.4s, v3.4s, v20.4s\n"
-    "add x24, x24, #16\n"
-    "fmax v2.4s, v2.4s, v20.4s\n"
-    "ldr q23, [x24]\n"
-    "str q3, [%[outptr0]]\n"
-    "fmla v0.4s, v14.4s, v8.4s\n"
-    "str q2, [%[outptr0], %[output_col_stride1]]\n"
-    "fmax v1.4s, v1.4s, v20.4s\n"
-    "mov v3.16b, v16.16b\n"
-    "ldr q12, [%[wbptr], #48]\n"
-    "str q1, [x19]\n"
-    "fmla v0.4s, v17.4s, v6.4s\n"
-    "mov v1.16b, v16.16b\n"
-    "ldr q8, [%[wbptr], #128]\n"
-    "mov v2.16b, v16.16b\n"
-    "ldr q13, [x24, %[input_col_stride1]]\n"
-    "fmla v0.4s, v15.4s, v7.4s\n"
-    "ldr q6, [%[wbptr], #96]\n"
-    "fmla v3.4s, v21.4s, v4.4s\n"
-    "add x25, x25, #16\n"
-    "ldr q14, [x25]\n"
-    "add x28, x28, #16\n"
-    "fmax v0.4s, v0.4s, v20.4s\n"
-    "ldr q7, [%[wbptr], #144]\n"
-    "fmla v3.4s, v23.4s, v11.4s\n"
-    "ldr q15, [x28]\n"
-    "str q0, [x19, %[output_col_stride1]]\n"
-    "fmla v1.4s, v14.4s, v4.4s\n"
-    "mov v0.16b, v16.16b\n"
-    "ldr q22, [x25, %[input_col_stride1]]\n"
-    "fmla v3.4s, v19.4s, v5.4s\n"
-    "add x26, x26, #16\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "add x19, x19, #16\n"
-    "subs x21, x21, #1\n"
-    "fmla v3.4s, v14.4s, v9.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v3.4s, v13.4s, v10.4s\n"
-    "ldr q17, [x24, x27]\n"
-    "fmla v2.4s, v18.4s, v4.4s\n"
-    "ldr q20, [%[inptr0], x23]\n"
-    "fmla v1.4s, v15.4s, v11.4s\n"
-    "ldr q19, [x26]\n"
-    "fmla v3.4s, v18.4s, v12.4s\n"
-    "ldr q13, [x28, %[input_col_stride1]]\n"
-    "fmla v2.4s, v17.4s, v11.4s\n"
-    "ldr q14, [x25, x27]\n"
-    "fmla v1.4s, v22.4s, v5.4s\n"
-    "ldr q15, [x24, x23]\n"
-    "fmla v3.4s, v22.4s, v8.4s\n"
-    "ldr q16, [%[inptr0], x22]\n"
-    "fmla v2.4s, v20.4s, v5.4s\n"
-    "ldr q20, [x26, %[input_col_stride1]]\n"
-    "fmla v1.4s, v19.4s, v9.4s\n"
-    "ldr q19, [x28, x27]\n"
-    "fmla v3.4s, v17.4s, v6.4s\n"
-    "ldr q21, [x25, x23]\n"
-    "fmla v2.4s, v14.4s, v9.4s\n"
-    "ldr q22, [x24, x22]\n"
-    "fmla v1.4s, v13.4s, v10.4s\n"
-    "ldr q23, [x26, x27]\n"
-    "fmla v3.4s, v14.4s, v7.4s\n"
-    "ldr q18, [x28, x23]\n"
-    "fmla v0.4s, v14.4s, v4.4s\n"
-    "ldr q13, [x25, x22]\n"
-    "fmla v1.4s, v14.4s, v12.4s\n"
-    "ldr q14, [x26, x23]\n"
-    "fmla v2.4s, v15.4s, v10.4s\n"
-    "ldr q17, [x28, x22]\n"
-    "fmla v0.4s, v19.4s, v11.4s\n"
-    "ldr q15, [x26, x22]\n"
-    "fmla v1.4s, v20.4s, v8.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v2.4s, v16.4s, v12.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v0.4s, v21.4s, v5.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v1.4s, v19.4s, v6.4s\n"
-    "add x24, x24, #16\n"
-    "fmla v2.4s, v21.4s, v8.4s\n"
-    "add x25, x25, #16\n"
-    "fmla v0.4s, v23.4s, v9.4s\n"
-    "add x28, x28, #16\n"
-    "fmla v1.4s, v23.4s, v7.4s\n"
-    "add x26, x26, #16\n"
-    "fmla v2.4s, v22.4s, v6.4s\n"
-    "movi v20.16b, #0\n"
-    "fmla v0.4s, v18.4s, v10.4s\n"
-    "fmax v3.4s, v3.4s, v20.4s\n"
-    "fmla v2.4s, v13.4s, v7.4s\n"
-    "fmax v1.4s, v1.4s, v20.4s\n"
-    "str q3, [%[outptr0]]\n"
-    "fmla v0.4s, v13.4s, v12.4s\n"
-    "str q1, [x19]\n"
-    "fmax v2.4s, v2.4s, v20.4s\n"
-    "fmla v0.4s, v14.4s, v8.4s\n"
-    "str q2, [%[outptr0], %[output_col_stride1]]\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v0.4s, v17.4s, v6.4s\n"
-    "fmla v0.4s, v15.4s, v7.4s\n"
-    "fmax v0.4s, v0.4s, v20.4s\n"
-    "str q0, [x19, %[output_col_stride1]]\n"
-    "add x19, x19, #16\n"
-    "4:\n"
-    "cbz x20, 7f\n"
-    "ldr s16, [%[wbptr]]\n"
-    "mov v3.16b, v16.16b\n"
-    "ldr s4, [%[wbptr], #4]\n"
-    "mov v1.16b, v16.16b\n"
-    "ldr s5, [%[wbptr], #8]\n"
-    "mov v2.16b, v16.16b\n"
-    "ldr s12, [%[wbptr], #12]\n"
-    "mov v0.16b, v16.16b\n"
-    "ldr s11, [%[wbptr], #16]\n"
-    "ldr s10, [%[wbptr], #20]\n"
-    "subs x20, x20, #1\n"
-    "ldr s6, [%[wbptr], #24]\n"
-    "ldr s9, [%[wbptr], #28]\n"
-    "ldr s8, [%[wbptr], #32]\n"
-    "ldr s7, [%[wbptr], #36]\n"
-    "ldr s21, [%[inptr0]]\n"
-    "ldr s23, [x24]\n"
-    "fmla v3.4s, v21.4s, v4.4s\n"
-    "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr s14, [x25]\n"
-    "ldr s13, [x24, %[input_col_stride1]]\n"
-    "fmla v1.4s, v14.4s, v4.4s\n"
-    "ldr s18, [%[inptr0], x27]\n"
-    "fmla v3.4s, v23.4s, v11.4s\n"
-    "ldr s15, [x28]\n"
-    "ldr s22, [x25, %[input_col_stride1]]\n"
-    "fmla v3.4s, v19.4s, v5.4s\n"
-    "fmla v3.4s, v14.4s, v9.4s\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v3.4s, v13.4s, v10.4s\n"
-    "ldr s17, [x24, x27]\n"
-    "fmla v2.4s, v18.4s, v4.4s\n"
-    "ldr s20, [%[inptr0], x23]\n"
-    "fmla v1.4s, v15.4s, v11.4s\n"
-    "ldr s19, [x26]\n"
-    "fmla v3.4s, v18.4s, v12.4s\n"
-    "ldr s13, [x28, %[input_col_stride1]]\n"
-    "fmla v2.4s, v17.4s, v11.4s\n"
-    "ldr s14, [x25, x27]\n"
-    "fmla v1.4s, v22.4s, v5.4s\n"
-    "ldr s15, [x24, x23]\n"
-    "fmla v3.4s, v22.4s, v8.4s\n"
-    "ldr s16, [%[inptr0], x22]\n"
-    "fmla v2.4s, v20.4s, v5.4s\n"
-    "ldr s20, [x26, %[input_col_stride1]]\n"
-    "fmla v1.4s, v19.4s, v9.4s\n"
-    "ldr s19, [x28, x27]\n"
-    "fmla v3.4s, v17.4s, v6.4s\n"
-    "ldr s21, [x25, x23]\n"
-    "fmla v2.4s, v14.4s, v9.4s\n"
-    "ldr s22, [x24, x22]\n"
-    "fmla v1.4s, v13.4s, v10.4s\n"
-    "ldr s23, [x26, x27]\n"
-    "fmla v3.4s, v14.4s, v7.4s\n"
-    "ldr s18, [x28, x23]\n"
-    "fmla v0.4s, v14.4s, v4.4s\n"
-    "ldr s13, [x25, x22]\n"
-    "fmla v1.4s, v14.4s, v12.4s\n"
-    "ldr s14, [x26, x23]\n"
-    "fmla v2.4s, v15.4s, v10.4s\n"
-    "ldr s17, [x28, x22]\n"
-    "fmla v0.4s, v19.4s, v11.4s\n"
-    "ldr s15, [x26, x22]\n"
-    "fmla v1.4s, v20.4s, v8.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v2.4s, v16.4s, v12.4s\n"
-    "ldr s16, [%[wbptr]]\n"
-    "fmla v0.4s, v21.4s, v5.4s\n"
-    "ldr s4, [%[wbptr], #4]\n"
-    "fmla v1.4s, v19.4s, v6.4s\n"
-    "ldr s11, [%[wbptr], #16]\n"
-    "fmla v2.4s, v21.4s, v8.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v0.4s, v23.4s, v9.4s\n"
-    "ldr s5, [%[wbptr], #8]\n"
-    "fmla v1.4s, v23.4s, v7.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v2.4s, v22.4s, v6.4s\n"
-    "ldr s21, [%[inptr0]]\n"
-    "fmla v0.4s, v18.4s, v10.4s\n"
-    "ldr s9, [%[wbptr], #28]\n"
-    "movi v20.16b, #0\n"
-    "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v2.4s, v13.4s, v7.4s\n"
-    "ldr s18, [%[inptr0], x27]\n"
-    "fmla v0.4s, v13.4s, v12.4s\n"
-    "ldr s10, [%[wbptr], #20]\n"
-    "fmax v3.4s, v3.4s, v20.4s\n"
-    "add x24, x24, #4\n"
-    "fmax v2.4s, v2.4s, v20.4s\n"
-    "ldr s23, [x24]\n"
-    "str s3, [%[outptr0]]\n"
-    "fmla v0.4s, v14.4s, v8.4s\n"
-    "str s2, [%[outptr0], %[output_col_stride1]]\n"
-    "fmax v1.4s, v1.4s, v20.4s\n"
-    "mov v3.16b, v16.16b\n"
-    "ldr s12, [%[wbptr], #12]\n"
-    "str s1, [x19]\n"
-    "fmla v0.4s, v17.4s, v6.4s\n"
-    "mov v1.16b, v16.16b\n"
-    "ldr s8, [%[wbptr], #32]\n"
-    "mov v2.16b, v16.16b\n"
-    "ldr s13, [x24, %[input_col_stride1]]\n"
-    "fmla v0.4s, v15.4s, v7.4s\n"
-    "ldr s6, [%[wbptr], #24]\n"
-    "fmla v3.4s, v21.4s, v4.4s\n"
-    "add x25, x25, #4\n"
-    "ldr s14, [x25]\n"
-    "add x28, x28, #4\n"
-    "fmax v0.4s, v0.4s, v20.4s\n"
-    "ldr s7, [%[wbptr], #36]\n"
-    "fmla v3.4s, v23.4s, v11.4s\n"
-    "ldr s15, [x28]\n"
-    "str s0, [x19, %[output_col_stride1]]\n"
-    "fmla v1.4s, v14.4s, v4.4s\n"
-    "mov v0.16b, v16.16b\n"
-    "ldr s22, [x25, %[input_col_stride1]]\n"
-    "fmla v3.4s, v19.4s, v5.4s\n"
-    "add x26, x26, #4\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "add x19, x19, #4\n"
-    "subs x20, x20, #1\n"
-    "fmla v3.4s, v14.4s, v9.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v3.4s, v13.4s, v10.4s\n"
-    "ldr s17, [x24, x27]\n"
-    "fmla v2.4s, v18.4s, v4.4s\n"
-    "ldr s20, [%[inptr0], x23]\n"
-    "fmla v1.4s, v15.4s, v11.4s\n"
-    "ldr s19, [x26]\n"
-    "fmla v3.4s, v18.4s, v12.4s\n"
-    "ldr s13, [x28, %[input_col_stride1]]\n"
-    "fmla v2.4s, v17.4s, v11.4s\n"
-    "ldr s14, [x25, x27]\n"
-    "fmla v1.4s, v22.4s, v5.4s\n"
-    "ldr s15, [x24, x23]\n"
-    "fmla v3.4s, v22.4s, v8.4s\n"
-    "ldr s16, [%[inptr0], x22]\n"
-    "fmla v2.4s, v20.4s, v5.4s\n"
-    "ldr s20, [x26, %[input_col_stride1]]\n"
-    "fmla v1.4s, v19.4s, v9.4s\n"
-    "ldr s19, [x28, x27]\n"
-    "fmla v3.4s, v17.4s, v6.4s\n"
-    "ldr s21, [x25, x23]\n"
-    "fmla v2.4s, v14.4s, v9.4s\n"
-    "ldr s22, [x24, x22]\n"
-    "fmla v1.4s, v13.4s, v10.4s\n"
-    "ldr s23, [x26, x27]\n"
-    "fmla v3.4s, v14.4s, v7.4s\n"
-    "ldr s18, [x28, x23]\n"
-    "fmla v0.4s, v14.4s, v4.4s\n"
-    "ldr s13, [x25, x22]\n"
-    "fmla v1.4s, v14.4s, v12.4s\n"
-    "ldr s14, [x26, x23]\n"
-    "fmla v2.4s, v15.4s, v10.4s\n"
-    "ldr s17, [x28, x22]\n"
-    "fmla v0.4s, v19.4s, v11.4s\n"
-    "ldr s15, [x26, x22]\n"
-    "fmla v1.4s, v20.4s, v8.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v2.4s, v16.4s, v12.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v0.4s, v21.4s, v5.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v1.4s, v19.4s, v6.4s\n"
-    "add x24, x24, #4\n"
-    "fmla v2.4s, v21.4s, v8.4s\n"
-    "add x25, x25, #4\n"
-    "fmla v0.4s, v23.4s, v9.4s\n"
-    "add x28, x28, #4\n"
-    "fmla v1.4s, v23.4s, v7.4s\n"
-    "add x26, x26, #4\n"
-    "fmla v2.4s, v22.4s, v6.4s\n"
-    "movi v20.16b, #0\n"
-    "fmla v0.4s, v18.4s, v10.4s\n"
-    "fmax v3.4s, v3.4s, v20.4s\n"
-    "fmla v2.4s, v13.4s, v7.4s\n"
-    "fmax v1.4s, v1.4s, v20.4s\n"
-    "str s3, [%[outptr0]]\n"
-    "fmla v0.4s, v13.4s, v12.4s\n"
-    "str s1, [x19]\n"
-    "fmax v2.4s, v2.4s, v20.4s\n"
-    "fmla v0.4s, v14.4s, v8.4s\n"
-    "str s2, [%[outptr0], %[output_col_stride1]]\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v0.4s, v17.4s, v6.4s\n"
-    "fmla v0.4s, v15.4s, v7.4s\n"
-    "fmax v0.4s, v0.4s, v20.4s\n"
-    "str s0, [x19, %[output_col_stride1]]\n"
-    "add x19, x19, #4\n"
-    "7:\n"
-    : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
-    : [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-  float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
-)
-{
-  __asm __volatile(
-    "mov x22, xzr\n"
-    "mov x26, xzr\n"
-    "and x23, %[n_channels], #3\n"
-    "lsr x24, %[n_channels], #2\n"
-    "cbz x24, 4f\n"
-    "1:\n"
-    "ldr q14, [%[wbptr]]\n"
-    "ldr x19, [%[inptrs], 0]\n"
-    "mov v3.16b, v14.16b\n"
-    "ldr q13, [%[wbptr], #16]\n"
-    "mov v1.16b, v14.16b\n"
-    "ldr q11, [%[wbptr], #32]\n"
-    "mov v2.16b, v14.16b\n"
-    "ldr q4, [%[wbptr], #48]\n"
-    "mov v0.16b, v14.16b\n"
-    "ldr q12, [%[wbptr], #64]\n"
-    "ldr q9, [%[wbptr], #80]\n"
-    "ldr x20, [%[inptrs], 40]\n"
-    "ldr q8, [%[wbptr], #96]\n"
-    "ldr x21, [%[inptrs], 80]\n"
-    "ldr q7, [%[wbptr], #112]\n"
-    "ldr x25, [%[inptrs], 120]\n"
-    "ldr q6, [%[wbptr], #128]\n"
-    "subs x24, x24, #1\n"
-    "ldr q5, [%[wbptr], #144]\n"
-    "ldr q15, [x19, x22]\n"
-    "fmla v3.4s, v15.4s, v13.4s\n"
-    "ldr q17, [x20, x22]\n"
-    "ldr q16, [x21, x22]\n"
-    "ldr x19, [%[inptrs], 8]\n"
-    "ldr q15, [x25, x22]\n"
-    "ldr x20, [%[inptrs], 48]\n"
-    "ldr q10, [x19, x22]\n"
-    "ldr x21, [%[inptrs], 88]\n"
-    "fmla v3.4s, v17.4s, v12.4s\n"
-    "ldr q17, [x20, x22]\n"
-    "ldr q14, [x21, x22]\n"
-    "ldr x19, [%[inptrs], 16]\n"
-    "ldr q18, [x19, x22]\n"
-    "fmla v3.4s, v10.4s, v11.4s\n"
-    "fmla v3.4s, v16.4s, v7.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v1.4s, v16.4s, v13.4s\n"
-    "ldr x20, [%[inptrs], 56]\n"
-    "fmla v3.4s, v17.4s, v9.4s\n"
-    "ldr x19, [%[inptrs], 24]\n"
-    "fmla v2.4s, v18.4s, v13.4s\n"
-    "ldr q16, [x20, x22]\n"
-    "movi v10.16b, #0\n"
-    "ldr q17, [x19, x22]\n"
-    "fmla v1.4s, v15.4s, v12.4s\n"
-    "ldr x27, [%[inptrs], 160]\n"
-    "fmla v3.4s, v18.4s, v4.4s\n"
-    "ldr x25, [%[inptrs], 128]\n"
-    "fmla v2.4s, v16.4s, v12.4s\n"
-    "ldr q18, [x27, x22]\n"
-    "ldr q15, [x25, x22]\n"
-    "ldr x21, [%[inptrs], 96]\n"
-    "fmla v1.4s, v14.4s, v11.4s\n"
-    "ldr x20, [%[inptrs], 64]\n"
-    "fmla v3.4s, v14.4s, v6.4s\n"
-    "ldr q14, [x21, x22]\n"
-    "fmla v2.4s, v17.4s, v11.4s\n"
-    "ldr q17, [x20, x22]\n"
-    "fmla v0.4s, v14.4s, v13.4s\n"
-    "ldr x19, [%[inptrs], 32]\n"
-    "fmla v1.4s, v18.4s, v7.4s\n"
-    "ldr x27, [%[inptrs], 168]\n"
-    "fmla v3.4s, v16.4s, v8.4s\n"
-    "ldr q18, [x19, x22]\n"
-    "fmla v2.4s, v14.4s, v7.4s\n"
-    "ldr q13, [x27, x22]\n"
-    "ldr x25, [%[inptrs], 136]\n"
-    "ldr x21, [%[inptrs], 104]\n"
-    "ldr x20, [%[inptrs], 72]\n"
-    "fmla v1.4s, v15.4s, v9.4s\n"
-    "ldr x27, [%[inptrs], 176]\n"
-    "fmla v3.4s, v14.4s, v5.4s\n"
-    "ldr q16, [x25, x22]\n"
-    "fmla v2.4s, v17.4s, v9.4s\n"
-    "ldr q17, [x21, x22]\n"
-    "fmla v0.4s, v16.4s, v12.4s\n"
-    "ldr q12, [x20, x22]\n"
-    "fmla v1.4s, v14.4s, v4.4s\n"
-    "ldr q15, [x27, x22]\n"
-    "fmax v3.4s, v3.4s, v10.4s\n"
-    "ldr x25, [%[inptrs], 144]\n"
-    "fmla v2.4s, v18.4s, v4.4s\n"
-    "ldr x21, [%[inptrs], 112]\n"
-    "fmla v0.4s, v17.4s, v11.4s\n"
-    "ldr q14, [x25, x22]\n"
-    "fmla v1.4s, v13.4s, v6.4s\n"
-    "ldr q11, [x21, x22]\n"
-    "ldr x27, [%[inptrs], 184]\n"
-    "ldr x25, [%[inptrs], 152]\n"
-    "ldr x21, [%[outptrs], 0]\n"
-    "fmla v2.4s, v17.4s, v6.4s\n"
-    "ldr x28, [%[outptrs], 16]\n"
-    "str q3, [x21, x26]\n"
-    "fmla v0.4s, v15.4s, v7.4s\n"
-    "fmla v1.4s, v16.4s, v8.4s\n"
-    "ldr q18, [x27, x22]\n"
-    "ldr q17, [x25, x22]\n"
-    "ldr x27, [%[inptrs], 192]\n"
-    "fmla v2.4s, v12.4s, v8.4s\n"
-    "ldr x21, [%[outptrs], 8]\n"
-    "fmla v0.4s, v14.4s, v9.4s\n"
-    "ldr q16, [x27, x22]\n"
-    "fmla v1.4s, v15.4s, v5.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "ldr q14, [%[wbptr]]\n"
-    "add x22, x22, #16\n"
-    "fmla v2.4s, v11.4s, v5.4s\n"
-    "ldr q13, [%[wbptr], #16]\n"
-    "fmla v0.4s, v11.4s, v4.4s\n"
-    "ldr q11, [%[wbptr], #32]\n"
-    "fmax v1.4s, v1.4s, v10.4s\n"
-    "ldr q12, [%[wbptr], #64]\n"
-    "mov v3.16b, v14.16b\n"
-    "ldr q9, [%[wbptr], #80]\n"
-    "fmax v2.4s, v2.4s, v10.4s\n"
-    "ldr q7, [%[wbptr], #112]\n"
-    "str q1, [x28, x26]\n"
-    "fmla v0.4s, v18.4s, v6.4s\n"
-    "mov v1.16b, v14.16b\n"
-    "ldr q4, [%[wbptr], #48]\n"
-    "str q2, [x21, x26]\n"
-    "ldr x28, [%[outptrs], 24]\n"
-    "mov v2.16b, v14.16b\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v0.4s, v17.4s, v8.4s\n"
-    "ldr q6, [%[wbptr], #128]\n"
-    "ldr x19, [%[inptrs], 0]\n"
-    "ldr x20, [%[inptrs], 40]\n"
-    "ldr x21, [%[inptrs], 80]\n"
-    "ldr x25, [%[inptrs], 120]\n"
-    "subs x24, x24, #1\n"
-    "ldr q15, [x19, x22]\n"
-    "fmla v0.4s, v16.4s, v5.4s\n"
-    "ldr q8, [%[wbptr], #96]\n"
-    "fmla v3.4s, v15.4s, v13.4s\n"
-    "ldr q17, [x20, x22]\n"
-    "ldr q16, [x21, x22]\n"
-    "ldr x19, [%[inptrs], 8]\n"
-    "ldr q15, [x25, x22]\n"
-    "ldr x20, [%[inptrs], 48]\n"
-    "fmax v0.4s, v0.4s, v10.4s\n"
-    "ldr q5, [%[wbptr], #144]\n"
-    "fmla v3.4s, v17.4s, v12.4s\n"
-    "ldr q10, [x19, x22]\n"
-    "ldr q17, [x20, x22]\n"
-    "ldr x19, [%[inptrs], 16]\n"
-    "str q0, [x28, x26]\n"
-    "ldr x21, [%[inptrs], 88]\n"
-    "mov v0.16b, v14.16b\n"
-    "ldr q18, [x19, x22]\n"
-    "fmla v3.4s, v10.4s, v11.4s\n"
-    "ldr q14, [x21, x22]\n"
-    "add x26, x26, #16\n"
-    "fmla v3.4s, v16.4s, v7.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v1.4s, v16.4s, v13.4s\n"
-    "ldr x20, [%[inptrs], 56]\n"
-    "fmla v3.4s, v17.4s, v9.4s\n"
-    "ldr x19, [%[inptrs], 24]\n"
-    "fmla v2.4s, v18.4s, v13.4s\n"
-    "ldr q16, [x20, x22]\n"
-    "movi v10.16b, #0\n"
-    "ldr q17, [x19, x22]\n"
-    "fmla v1.4s, v15.4s, v12.4s\n"
-    "ldr x27, [%[inptrs], 160]\n"
-    "fmla v3.4s, v18.4s, v4.4s\n"
-    "ldr x25, [%[inptrs], 128]\n"
-    "fmla v2.4s, v16.4s, v12.4s\n"
-    "ldr q18, [x27, x22]\n"
-    "ldr q15, [x25, x22]\n"
-    "ldr x21, [%[inptrs], 96]\n"
-    "fmla v1.4s, v14.4s, v11.4s\n"
-    "ldr x20, [%[inptrs], 64]\n"
-    "fmla v3.4s, v14.4s, v6.4s\n"
-    "ldr q14, [x21, x22]\n"
-    "fmla v2.4s, v17.4s, v11.4s\n"
-    "ldr q17, [x20, x22]\n"
-    "fmla v0.4s, v14.4s, v13.4s\n"
-    "ldr x19, [%[inptrs], 32]\n"
-    "fmla v1.4s, v18.4s, v7.4s\n"
-    "ldr x27, [%[inptrs], 168]\n"
-    "fmla v3.4s, v16.4s, v8.4s\n"
-    "ldr q18, [x19, x22]\n"
-    "fmla v2.4s, v14.4s, v7.4s\n"
-    "ldr q13, [x27, x22]\n"
-    "ldr x25, [%[inptrs], 136]\n"
-    "ldr x21, [%[inptrs], 104]\n"
-    "ldr x20, [%[inptrs], 72]\n"
-    "fmla v1.4s, v15.4s, v9.4s\n"
-    "ldr x27, [%[inptrs], 176]\n"
-    "fmla v3.4s, v14.4s, v5.4s\n"
-    "ldr q16, [x25, x22]\n"
-    "fmla v2.4s, v17.4s, v9.4s\n"
-    "ldr q17, [x21, x22]\n"
-    "fmla v0.4s, v16.4s, v12.4s\n"
-    "ldr q12, [x20, x22]\n"
-    "fmla v1.4s, v14.4s, v4.4s\n"
-    "ldr q15, [x27, x22]\n"
-    "fmax v3.4s, v3.4s, v10.4s\n"
-    "ldr x25, [%[inptrs], 144]\n"
-    "fmla v2.4s, v18.4s, v4.4s\n"
-    "ldr x21, [%[inptrs], 112]\n"
-    "fmla v0.4s, v17.4s, v11.4s\n"
-    "ldr q14, [x25, x22]\n"
-    "fmla v1.4s, v13.4s, v6.4s\n"
-    "ldr q11, [x21, x22]\n"
-    "ldr x27, [%[inptrs], 184]\n"
-    "ldr x25, [%[inptrs], 152]\n"
-    "ldr x21, [%[outptrs], 0]\n"
-    "fmla v2.4s, v17.4s, v6.4s\n"
-    "ldr x28, [%[outptrs], 16]\n"
-    "str q3, [x21, x26]\n"
-    "fmla v0.4s, v15.4s, v7.4s\n"
-    "fmla v1.4s, v16.4s, v8.4s\n"
-    "ldr q18, [x27, x22]\n"
-    "ldr q17, [x25, x22]\n"
-    "ldr x27, [%[inptrs], 192]\n"
-    "fmla v2.4s, v12.4s, v8.4s\n"
-    "ldr x21, [%[outptrs], 8]\n"
-    "fmla v0.4s, v14.4s, v9.4s\n"
-    "ldr q16, [x27, x22]\n"
-    "fmla v1.4s, v15.4s, v5.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "add x22, x22, #16\n"
-    "fmla v2.4s, v11.4s, v5.4s\n"
-    "fmla v0.4s, v11.4s, v4.4s\n"
-    "fmax v1.4s, v1.4s, v10.4s\n"
-    "fmax v2.4s, v2.4s, v10.4s\n"
-    "str q1, [x28, x26]\n"
-    "fmla v0.4s, v18.4s, v6.4s\n"
-    "ldr x28, [%[outptrs], 24]\n"
-    "str q2, [x21, x26]\n"
-    "fmla v0.4s, v17.4s, v8.4s\n"
-    "fmla v0.4s, v16.4s, v5.4s\n"
-    "fmax v0.4s, v0.4s, v10.4s\n"
-    "str q0, [x28, x26]\n"
-    "add x26, x26, #16\n"
-    "4:\n"
-    "cbz x23, 7f\n"
-    "ldr s14, [%[wbptr]]\n"
-    "mov v3.16b, v14.16b\n"
-    "ldr s13, [%[wbptr], #4]\n"
-    "mov v1.16b, v14.16b\n"
-    "ldr s11, [%[wbptr], #8]\n"
-    "mov v2.16b, v14.16b\n"
-    "ldr s4, [%[wbptr], #12]\n"
-    "mov v0.16b, v14.16b\n"
-    "ldr s12, [%[wbptr], #16]\n"
-    "ldr s9, [%[wbptr], #20]\n"
-    "ldr x19, [%[inptrs], 0]\n"
-    "ldr s8, [%[wbptr], #24]\n"
-    "ldr x20, [%[inptrs], 40]\n"
-    "ldr s7, [%[wbptr], #28]\n"
-    "ldr x21, [%[inptrs], 80]\n"
-    "ldr s6, [%[wbptr], #32]\n"
-    "ldr x25, [%[inptrs], 120]\n"
-    "ldr s5, [%[wbptr], #36]\n"
-    "subs x23, x23, #1\n"
-    "ldr s15, [x19, x22]\n"
-    "ldr s17, [x20, x22]\n"
-    "fmla v3.4s, v15.4s, v13.4s\n"
-    "ldr s16, [x21, x22]\n"
-    "ldr s15, [x25, x22]\n"
-    "ldr x19, [%[inptrs], 8]\n"
-    "ldr x20, [%[inptrs], 48]\n"
-    "ldr x21, [%[inptrs], 88]\n"
-    "ldr s10, [x19, x22]\n"
-    "fmla v3.4s, v17.4s, v12.4s\n"
-    "ldr s17, [x20, x22]\n"
-    "ldr s14, [x21, x22]\n"
-    "ldr x19, [%[inptrs], 16]\n"
-    "ldr s18, [x19, x22]\n"
-    "fmla v3.4s, v10.4s, v11.4s\n"
-    "fmla v3.4s, v16.4s, v7.4s\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v1.4s, v16.4s, v13.4s\n"
-    "ldr x20, [%[inptrs], 56]\n"
-    "fmla v3.4s, v17.4s, v9.4s\n"
-    "ldr x19, [%[inptrs], 24]\n"
-    "fmla v2.4s, v18.4s, v13.4s\n"
-    "ldr s16, [x20, x22]\n"
-    "movi v10.16b, #0\n"
-    "ldr s17, [x19, x22]\n"
-    "fmla v1.4s, v15.4s, v12.4s\n"
-    "ldr x27, [%[inptrs], 160]\n"
-    "fmla v3.4s, v18.4s, v4.4s\n"
-    "ldr x25, [%[inptrs], 128]\n"
-    "fmla v2.4s, v16.4s, v12.4s\n"
-    "ldr s18, [x27, x22]\n"
-    "ldr s15, [x25, x22]\n"
-    "ldr x21, [%[inptrs], 96]\n"
-    "fmla v1.4s, v14.4s, v11.4s\n"
-    "ldr x20, [%[inptrs], 64]\n"
-    "fmla v3.4s, v14.4s, v6.4s\n"
-    "ldr s14, [x21, x22]\n"
-    "fmla v2.4s, v17.4s, v11.4s\n"
-    "ldr s17, [x20, x22]\n"
-    "fmla v0.4s, v14.4s, v13.4s\n"
-    "ldr x19, [%[inptrs], 32]\n"
-    "fmla v1.4s, v18.4s, v7.4s\n"
-    "ldr x27, [%[inptrs], 168]\n"
-    "fmla v3.4s, v16.4s, v8.4s\n"
-    "ldr s18, [x19, x22]\n"
-    "fmla v2.4s, v14.4s, v7.4s\n"
-    "ldr s13, [x27, x22]\n"
-    "ldr x25, [%[inptrs], 136]\n"
-    "ldr x21, [%[inptrs], 104]\n"
-    "ldr x20, [%[inptrs], 72]\n"
-    "fmla v1.4s, v15.4s, v9.4s\n"
-    "ldr x27, [%[inptrs], 176]\n"
-    "fmla v3.4s, v14.4s, v5.4s\n"
-    "ldr s16, [x25, x22]\n"
-    "fmla v2.4s, v17.4s, v9.4s\n"
-    "ldr s17, [x21, x22]\n"
-    "fmla v0.4s, v16.4s, v12.4s\n"
-    "ldr s12, [x20, x22]\n"
-    "fmla v1.4s, v14.4s, v4.4s\n"
-    "ldr s15, [x27, x22]\n"
-    "fmax v3.4s, v3.4s, v10.4s\n"
-    "ldr x25, [%[inptrs], 144]\n"
-    "fmla v2.4s, v18.4s, v4.4s\n"
-    "ldr x21, [%[inptrs], 112]\n"
-    "fmla v0.4s, v17.4s, v11.4s\n"
-    "ldr s14, [x25, x22]\n"
-    "fmla v1.4s, v13.4s, v6.4s\n"
-    "ldr s11, [x21, x22]\n"
-    "ldr x27, [%[inptrs], 184]\n"
-    "ldr x25, [%[inptrs], 152]\n"
-    "ldr x21, [%[outptrs], 0]\n"
-    "fmla v2.4s, v17.4s, v6.4s\n"
-    "ldr x28, [%[outptrs], 16]\n"
-    "str s3, [x21, x26]\n"
-    "fmla v0.4s, v15.4s, v7.4s\n"
-    "fmla v1.4s, v16.4s, v8.4s\n"
-    "ldr s18, [x27, x22]\n"
-    "ldr s17, [x25, x22]\n"
-    "ldr x27, [%[inptrs], 192]\n"
-    "fmla v2.4s, v12.4s, v8.4s\n"
-    "ldr x21, [%[outptrs], 8]\n"
-    "fmla v0.4s, v14.4s, v9.4s\n"
-    "ldr s16, [x27, x22]\n"
-    "fmla v1.4s, v15.4s, v5.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "ldr s14, [%[wbptr]]\n"
-    "add x22, x22, #4\n"
-    "fmla v2.4s, v11.4s, v5.4s\n"
-    "ldr s13, [%[wbptr], #4]\n"
-    "fmla v0.4s, v11.4s, v4.4s\n"
-    "ldr s11, [%[wbptr], #8]\n"
-    "fmax v1.4s, v1.4s, v10.4s\n"
-    "ldr s12, [%[wbptr], #16]\n"
-    "mov v3.16b, v14.16b\n"
-    "ldr s9, [%[wbptr], #20]\n"
-    "fmax v2.4s, v2.4s, v10.4s\n"
-    "ldr s7, [%[wbptr], #28]\n"
-    "str s1, [x28, x26]\n"
-    "fmla v0.4s, v18.4s, v6.4s\n"
-    "mov v1.16b, v14.16b\n"
-    "ldr s4, [%[wbptr], #12]\n"
-    "str s2, [x21, x26]\n"
-    "ldr x28, [%[outptrs], 24]\n"
-    "mov v2.16b, v14.16b\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v0.4s, v17.4s, v8.4s\n"
-    "ldr s6, [%[wbptr], #32]\n"
-    "ldr x19, [%[inptrs], 0]\n"
-    "ldr x20, [%[inptrs], 40]\n"
-    "ldr x21, [%[inptrs], 80]\n"
-    "ldr x25, [%[inptrs], 120]\n"
-    "subs x23, x23, #1\n"
-    "ldr s15, [x19, x22]\n"
-    "fmla v0.4s, v16.4s, v5.4s\n"
-    "ldr s8, [%[wbptr], #24]\n"
-    "fmla v3.4s, v15.4s, v13.4s\n"
-    "ldr s17, [x20, x22]\n"
-    "ldr s16, [x21, x22]\n"
-    "ldr x19, [%[inptrs], 8]\n"
-    "ldr s15, [x25, x22]\n"
-    "ldr x20, [%[inptrs], 48]\n"
-    "fmax v0.4s, v0.4s, v10.4s\n"
-    "ldr s5, [%[wbptr], #36]\n"
-    "fmla v3.4s, v17.4s, v12.4s\n"
-    "ldr s10, [x19, x22]\n"
-    "ldr s17, [x20, x22]\n"
-    "ldr x19, [%[inptrs], 16]\n"
-    "str s0, [x28, x26]\n"
-    "ldr x21, [%[inptrs], 88]\n"
-    "mov v0.16b, v14.16b\n"
-    "ldr s18, [x19, x22]\n"
-    "fmla v3.4s, v10.4s, v11.4s\n"
-    "ldr s14, [x21, x22]\n"
-    "add x26, x26, #4\n"
-    "fmla v3.4s, v16.4s, v7.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v1.4s, v16.4s, v13.4s\n"
-    "ldr x20, [%[inptrs], 56]\n"
-    "fmla v3.4s, v17.4s, v9.4s\n"
-    "ldr x19, [%[inptrs], 24]\n"
-    "fmla v2.4s, v18.4s, v13.4s\n"
-    "ldr s16, [x20, x22]\n"
-    "movi v10.16b, #0\n"
-    "ldr s17, [x19, x22]\n"
-    "fmla v1.4s, v15.4s, v12.4s\n"
-    "ldr x27, [%[inptrs], 160]\n"
-    "fmla v3.4s, v18.4s, v4.4s\n"
-    "ldr x25, [%[inptrs], 128]\n"
-    "fmla v2.4s, v16.4s, v12.4s\n"
-    "ldr s18, [x27, x22]\n"
-    "ldr s15, [x25, x22]\n"
-    "ldr x21, [%[inptrs], 96]\n"
-    "fmla v1.4s, v14.4s, v11.4s\n"
-    "ldr x20, [%[inptrs], 64]\n"
-    "fmla v3.4s, v14.4s, v6.4s\n"
-    "ldr s14, [x21, x22]\n"
-    "fmla v2.4s, v17.4s, v11.4s\n"
-    "ldr s17, [x20, x22]\n"
-    "fmla v0.4s, v14.4s, v13.4s\n"
-    "ldr x19, [%[inptrs], 32]\n"
-    "fmla v1.4s, v18.4s, v7.4s\n"
-    "ldr x27, [%[inptrs], 168]\n"
-    "fmla v3.4s, v16.4s, v8.4s\n"
-    "ldr s18, [x19, x22]\n"
-    "fmla v2.4s, v14.4s, v7.4s\n"
-    "ldr s13, [x27, x22]\n"
-    "ldr x25, [%[inptrs], 136]\n"
-    "ldr x21, [%[inptrs], 104]\n"
-    "ldr x20, [%[inptrs], 72]\n"
-    "fmla v1.4s, v15.4s, v9.4s\n"
-    "ldr x27, [%[inptrs], 176]\n"
-    "fmla v3.4s, v14.4s, v5.4s\n"
-    "ldr s16, [x25, x22]\n"
-    "fmla v2.4s, v17.4s, v9.4s\n"
-    "ldr s17, [x21, x22]\n"
-    "fmla v0.4s, v16.4s, v12.4s\n"
-    "ldr s12, [x20, x22]\n"
-    "fmla v1.4s, v14.4s, v4.4s\n"
-    "ldr s15, [x27, x22]\n"
-    "fmax v3.4s, v3.4s, v10.4s\n"
-    "ldr x25, [%[inptrs], 144]\n"
-    "fmla v2.4s, v18.4s, v4.4s\n"
-    "ldr x21, [%[inptrs], 112]\n"
-    "fmla v0.4s, v17.4s, v11.4s\n"
-    "ldr s14, [x25, x22]\n"
-    "fmla v1.4s, v13.4s, v6.4s\n"
-    "ldr s11, [x21, x22]\n"
-    "ldr x27, [%[inptrs], 184]\n"
-    "ldr x25, [%[inptrs], 152]\n"
-    "ldr x21, [%[outptrs], 0]\n"
-    "fmla v2.4s, v17.4s, v6.4s\n"
-    "ldr x28, [%[outptrs], 16]\n"
-    "str s3, [x21, x26]\n"
-    "fmla v0.4s, v15.4s, v7.4s\n"
-    "fmla v1.4s, v16.4s, v8.4s\n"
-    "ldr s18, [x27, x22]\n"
-    "ldr s17, [x25, x22]\n"
-    "ldr x27, [%[inptrs], 192]\n"
-    "fmla v2.4s, v12.4s, v8.4s\n"
-    "ldr x21, [%[outptrs], 8]\n"
-    "fmla v0.4s, v14.4s, v9.4s\n"
-    "ldr s16, [x27, x22]\n"
-    "fmla v1.4s, v15.4s, v5.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "add x22, x22, #4\n"
-    "fmla v2.4s, v11.4s, v5.4s\n"
-    "fmla v0.4s, v11.4s, v4.4s\n"
-    "fmax v1.4s, v1.4s, v10.4s\n"
-    "fmax v2.4s, v2.4s, v10.4s\n"
-    "str s1, [x28, x26]\n"
-    "fmla v0.4s, v18.4s, v6.4s\n"
-    "ldr x28, [%[outptrs], 24]\n"
-    "str s2, [x21, x26]\n"
-    "fmla v0.4s, v17.4s, v8.4s\n"
-    "fmla v0.4s, v16.4s, v5.4s\n"
-    "fmax v0.4s, v0.4s, v10.4s\n"
-    "str s0, [x28, x26]\n"
-    "add x26, x26, #4\n"
-    "7:\n"
-    : [wbptr] "+r" (weight_bias_ptr)
-    : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x21, %[inptr0], %[input_row_stride]\n"
-    "add x23, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x24, %[outptr0], %[output_row_stride]\n"
-    "add x28, x21, %[input_row_stride]\n"
-    "add x26, x23, %[input_col_stride1]\n"
-    "and x19, %[n_channels], #3\n"
-    "add x27, x28, %[input_row_stride]\n"
-    "add x25, x26, %[input_col_stride1]\n"
-    "lsr x20, %[n_channels], #2\n"
-    "add x22, x27, %[input_row_stride]\n"
-    "cbz x20, 4f\n"
-    "1:\n"
-    "ldr q14, [%[wbptr]]\n"
-    "subs x20, x20, #1\n"
-    "mov v5.16b, v14.16b\n"
-    "ldr q0, [%[wbptr], #16]\n"
-    "mov v11.16b, v14.16b\n"
-    "ldr q1, [%[wbptr], #32]\n"
-    "mov v12.16b, v14.16b\n"
-    "ldr q2, [%[wbptr], #48]\n"
-    "mov v10.16b, v14.16b\n"
-    "ldr q6, [%[wbptr], #64]\n"
-    "ldr q3, [%[wbptr], #80]\n"
-    "ldr q7, [%[wbptr], #96]\n"
-    "ldr q4, [%[wbptr], #112]\n"
-    "ldr q8, [%[wbptr], #128]\n"
-    "ldr q9, [%[wbptr], #144]\n"
-    "ldr q19, [%[inptr0]]\n"
-    "fmla v5.4s, v19.4s, v0.4s\n"
-    "ldr q15, [x21]\n"
-    "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr q16, [x28]\n"
-    "fmla v11.4s, v16.4s, v0.4s\n"
-    "ldr q23, [x21, %[input_col_stride1]]\n"
-    "fmla v5.4s, v15.4s, v6.4s\n"
-    "ldr q18, [%[inptr0], x23]\n"
-    "ldr q17, [x27]\n"
-    "ldr q13, [x28, %[input_col_stride1]]\n"
-    "fmla v5.4s, v21.4s, v1.4s\n"
-    "fmla v5.4s, v16.4s, v4.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v5.4s, v23.4s, v3.4s\n"
-    "ldr q21, [x21, x23]\n"
-    "fmla v12.4s, v18.4s, v0.4s\n"
-    "ldr q20, [%[inptr0], x26]\n"
-    "fmla v11.4s, v17.4s, v6.4s\n"
-    "ldr q19, [x22]\n"
-    "fmla v5.4s, v18.4s, v2.4s\n"
-    "ldr q15, [x27, %[input_col_stride1]]\n"
-    "fmla v12.4s, v21.4s, v6.4s\n"
-    "ldr q16, [x28, x23]\n"
-    "fmla v11.4s, v13.4s, v1.4s\n"
-    "ldr q17, [x21, x26]\n"
-    "fmla v5.4s, v13.4s, v8.4s\n"
-    "ldr q14, [%[inptr0], x25]\n"
-    "fmla v12.4s, v20.4s, v1.4s\n"
-    "ldr q20, [x22, %[input_col_stride1]]\n"
-    "fmla v11.4s, v19.4s, v4.4s\n"
-    "ldr q19, [x27, x23]\n"
-    "fmla v5.4s, v21.4s, v7.4s\n"
-    "ldr q22, [x28, x26]\n"
-    "fmla v12.4s, v16.4s, v4.4s\n"
-    "ldr q21, [x21, x25]\n"
-    "fmla v11.4s, v15.4s, v3.4s\n"
-    "ldr q23, [x22, x23]\n"
-    "fmla v5.4s, v16.4s, v9.4s\n"
-    "ldr q18, [x27, x26]\n"
-    "fmla v10.4s, v16.4s, v0.4s\n"
-    "ldr q15, [x28, x25]\n"
-    "fmla v11.4s, v16.4s, v2.4s\n"
-    "ldr q16, [x22, x26]\n"
-    "fmla v12.4s, v17.4s, v3.4s\n"
-    "ldr q17, [x27, x25]\n"
-    "fmla v10.4s, v19.4s, v6.4s\n"
-    "ldr q13, [x22, x25]\n"
-    "fmla v11.4s, v20.4s, v8.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v12.4s, v14.4s, v2.4s\n"
-    "ldr q14, [%[wbptr]]\n"
-    "fmla v10.4s, v22.4s, v1.4s\n"
-    "ldr q0, [%[wbptr], #16]\n"
-    "fmla v11.4s, v19.4s, v7.4s\n"
-    "ldr q6, [%[wbptr], #64]\n"
-    "fmla v12.4s, v22.4s, v8.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v10.4s, v23.4s, v4.4s\n"
-    "ldr q1, [%[wbptr], #32]\n"
-    "fmla v11.4s, v23.4s, v9.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v12.4s, v21.4s, v7.4s\n"
-    "ldr q19, [%[inptr0]]\n"
-    "fmla v10.4s, v18.4s, v3.4s\n"
-    "ldr q4, [%[wbptr], #112]\n"
-    "movi v20.16b, #0\n"
-    "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v12.4s, v15.4s, v9.4s\n"
-    "ldr q18, [%[inptr0], x23]\n"
-    "fmla v10.4s, v15.4s, v2.4s\n"
-    "ldr q3, [%[wbptr], #80]\n"
-    "fmov v22.4s, #6.0\n"
-    "add x21, x21, #16\n"
-    "fmax v5.4s, v5.4s, v20.4s\n"
-    "ldr q15, [x21]\n"
-    "fmla v10.4s, v16.4s, v8.4s\n"
-    "ldr q2, [%[wbptr], #48]\n"
-    "fmin v5.4s, v5.4s, v22.4s\n"
-    "ldr q23, [x21, %[input_col_stride1]]\n"
-    "fmax v12.4s, v12.4s, v20.4s\n"
-    "add x28, x28, #16\n"
-    "str q5, [%[outptr0]]\n"
-    "fmla v10.4s, v17.4s, v7.4s\n"
-    "fmin v12.4s, v12.4s, v22.4s\n"
-    "ldr q8, [%[wbptr], #128]\n"
-    "fmax v11.4s, v11.4s, v20.4s\n"
-    "ldr q16, [x28]\n"
-    "str q12, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v10.4s, v13.4s, v9.4s\n"
-    "fmin v11.4s, v11.4s, v22.4s\n"
-    "ldr q7, [%[wbptr], #96]\n"
-    "mov v5.16b, v14.16b\n"
-    "ldr q13, [x28, %[input_col_stride1]]\n"
-    "str q11, [x24]\n"
-    "fmax v10.4s, v10.4s, v20.4s\n"
-    "mov v11.16b, v14.16b\n"
-    "ldr q9, [%[wbptr], #144]\n"
-    "fmin v10.4s, v10.4s, v22.4s\n"
-    "add x27, x27, #16\n"
-    "mov v12.16b, v14.16b\n"
-    "ldr q17, [x27]\n"
-    "str q10, [x24, %[output_col_stride1]]\n"
-    "fmla v5.4s, v19.4s, v0.4s\n"
-    "mov v10.16b, v14.16b\n"
-    "add x22, x22, #16\n"
-    "fmla v11.4s, v16.4s, v0.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v5.4s, v15.4s, v6.4s\n"
-    "add x24, x24, #16\n"
-    "subs x20, x20, #1\n"
-    "fmla v5.4s, v21.4s, v1.4s\n"
-    "fmla v5.4s, v16.4s, v4.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v5.4s, v23.4s, v3.4s\n"
-    "ldr q21, [x21, x23]\n"
-    "fmla v12.4s, v18.4s, v0.4s\n"
-    "ldr q20, [%[inptr0], x26]\n"
-    "fmla v11.4s, v17.4s, v6.4s\n"
-    "ldr q19, [x22]\n"
-    "fmla v5.4s, v18.4s, v2.4s\n"
-    "ldr q15, [x27, %[input_col_stride1]]\n"
-    "fmla v12.4s, v21.4s, v6.4s\n"
-    "ldr q16, [x28, x23]\n"
-    "fmla v11.4s, v13.4s, v1.4s\n"
-    "ldr q17, [x21, x26]\n"
-    "fmla v5.4s, v13.4s, v8.4s\n"
-    "ldr q14, [%[inptr0], x25]\n"
-    "fmla v12.4s, v20.4s, v1.4s\n"
-    "ldr q20, [x22, %[input_col_stride1]]\n"
-    "fmla v11.4s, v19.4s, v4.4s\n"
-    "ldr q19, [x27, x23]\n"
-    "fmla v5.4s, v21.4s, v7.4s\n"
-    "ldr q22, [x28, x26]\n"
-    "fmla v12.4s, v16.4s, v4.4s\n"
-    "ldr q21, [x21, x25]\n"
-    "fmla v11.4s, v15.4s, v3.4s\n"
-    "ldr q23, [x22, x23]\n"
-    "fmla v5.4s, v16.4s, v9.4s\n"
-    "ldr q18, [x27, x26]\n"
-    "fmla v10.4s, v16.4s, v0.4s\n"
-    "ldr q15, [x28, x25]\n"
-    "fmla v11.4s, v16.4s, v2.4s\n"
-    "ldr q16, [x22, x26]\n"
-    "fmla v12.4s, v17.4s, v3.4s\n"
-    "ldr q17, [x27, x25]\n"
-    "fmla v10.4s, v19.4s, v6.4s\n"
-    "ldr q13, [x22, x25]\n"
-    "fmla v11.4s, v20.4s, v8.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v12.4s, v14.4s, v2.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v10.4s, v22.4s, v1.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v11.4s, v19.4s, v7.4s\n"
-    "add x21, x21, #16\n"
-    "fmla v12.4s, v22.4s, v8.4s\n"
-    "add x28, x28, #16\n"
-    "fmla v10.4s, v23.4s, v4.4s\n"
-    "add x27, x27, #16\n"
-    "fmla v11.4s, v23.4s, v9.4s\n"
-    "add x22, x22, #16\n"
-    "fmla v12.4s, v21.4s, v7.4s\n"
-    "movi v20.16b, #0\n"
-    "fmla v10.4s, v18.4s, v3.4s\n"
-    "fmov v22.4s, #6.0\n"
-    "fmax v5.4s, v5.4s, v20.4s\n"
-    "fmax v11.4s, v11.4s, v20.4s\n"
-    "fmla v12.4s, v15.4s, v9.4s\n"
-    "fmla v10.4s, v15.4s, v2.4s\n"
-    "fmin v5.4s, v5.4s, v22.4s\n"
-    "fmin v11.4s, v11.4s, v22.4s\n"
-    "fmax v12.4s, v12.4s, v20.4s\n"
-    "str q5, [%[outptr0]]\n"
-    "str q11, [x24]\n"
-    "fmla v10.4s, v16.4s, v8.4s\n"
-    "fmin v12.4s, v12.4s, v22.4s\n"
-    "str q12, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v10.4s, v17.4s, v7.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v10.4s, v13.4s, v9.4s\n"
-    "fmax v10.4s, v10.4s, v20.4s\n"
-    "fmin v10.4s, v10.4s, v22.4s\n"
-    "str q10, [x24, %[output_col_stride1]]\n"
-    "add x24, x24, #16\n"
-    "4:\n"
-    "cbz x19, 7f\n"
-    "ldr s14, [%[wbptr]]\n"
-    "mov v5.16b, v14.16b\n"
-    "ldr s0, [%[wbptr], #4]\n"
-    "mov v11.16b, v14.16b\n"
-    "ldr s1, [%[wbptr], #8]\n"
-    "mov v12.16b, v14.16b\n"
-    "ldr s2, [%[wbptr], #12]\n"
-    "mov v10.16b, v14.16b\n"
-    "ldr s6, [%[wbptr], #16]\n"
-    "ldr s3, [%[wbptr], #20]\n"
-    "subs x19, x19, #1\n"
-    "ldr s7, [%[wbptr], #24]\n"
-    "ldr s4, [%[wbptr], #28]\n"
-    "ldr s8, [%[wbptr], #32]\n"
-    "ldr s9, [%[wbptr], #36]\n"
-    "ldr s19, [%[inptr0]]\n"
-    "ldr s15, [x21]\n"
-    "fmla v5.4s, v19.4s, v0.4s\n"
-    "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr s16, [x28]\n"
-    "ldr s23, [x21, %[input_col_stride1]]\n"
-    "fmla v11.4s, v16.4s, v0.4s\n"
-    "ldr s18, [%[inptr0], x23]\n"
-    "fmla v5.4s, v15.4s, v6.4s\n"
-    "ldr s17, [x27]\n"
-    "ldr s13, [x28, %[input_col_stride1]]\n"
-    "fmla v5.4s, v21.4s, v1.4s\n"
-    "fmla v5.4s, v16.4s, v4.4s\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v5.4s, v23.4s, v3.4s\n"
-    "ldr s21, [x21, x23]\n"
-    "fmla v12.4s, v18.4s, v0.4s\n"
-    "ldr s20, [%[inptr0], x26]\n"
-    "fmla v11.4s, v17.4s, v6.4s\n"
-    "ldr s19, [x22]\n"
-    "fmla v5.4s, v18.4s, v2.4s\n"
-    "ldr s15, [x27, %[input_col_stride1]]\n"
-    "fmla v12.4s, v21.4s, v6.4s\n"
-    "ldr s16, [x28, x23]\n"
-    "fmla v11.4s, v13.4s, v1.4s\n"
-    "ldr s17, [x21, x26]\n"
-    "fmla v5.4s, v13.4s, v8.4s\n"
-    "ldr s14, [%[inptr0], x25]\n"
-    "fmla v12.4s, v20.4s, v1.4s\n"
-    "ldr s20, [x22, %[input_col_stride1]]\n"
-    "fmla v11.4s, v19.4s, v4.4s\n"
-    "ldr s19, [x27, x23]\n"
-    "fmla v5.4s, v21.4s, v7.4s\n"
-    "ldr s22, [x28, x26]\n"
-    "fmla v12.4s, v16.4s, v4.4s\n"
-    "ldr s21, [x21, x25]\n"
-    "fmla v11.4s, v15.4s, v3.4s\n"
-    "ldr s23, [x22, x23]\n"
-    "fmla v5.4s, v16.4s, v9.4s\n"
-    "ldr s18, [x27, x26]\n"
-    "fmla v10.4s, v16.4s, v0.4s\n"
-    "ldr s15, [x28, x25]\n"
-    "fmla v11.4s, v16.4s, v2.4s\n"
-    "ldr s16, [x22, x26]\n"
-    "fmla v12.4s, v17.4s, v3.4s\n"
-    "ldr s17, [x27, x25]\n"
-    "fmla v10.4s, v19.4s, v6.4s\n"
-    "ldr s13, [x22, x25]\n"
-    "fmla v11.4s, v20.4s, v8.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v12.4s, v14.4s, v2.4s\n"
-    "ldr s14, [%[wbptr]]\n"
-    "fmla v10.4s, v22.4s, v1.4s\n"
-    "ldr s0, [%[wbptr], #4]\n"
-    "fmla v11.4s, v19.4s, v7.4s\n"
-    "ldr s6, [%[wbptr], #16]\n"
-    "fmla v12.4s, v22.4s, v8.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v10.4s, v23.4s, v4.4s\n"
-    "ldr s1, [%[wbptr], #8]\n"
-    "fmla v11.4s, v23.4s, v9.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v12.4s, v21.4s, v7.4s\n"
-    "ldr s19, [%[inptr0]]\n"
-    "fmla v10.4s, v18.4s, v3.4s\n"
-    "ldr s4, [%[wbptr], #28]\n"
-    "movi v20.16b, #0\n"
-    "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v12.4s, v15.4s, v9.4s\n"
-    "ldr s18, [%[inptr0], x23]\n"
-    "fmla v10.4s, v15.4s, v2.4s\n"
-    "ldr s3, [%[wbptr], #20]\n"
-    "fmov v22.4s, #6.0\n"
-    "add x21, x21, #4\n"
-    "fmax v5.4s, v5.4s, v20.4s\n"
-    "ldr s15, [x21]\n"
-    "fmla v10.4s, v16.4s, v8.4s\n"
-    "ldr s2, [%[wbptr], #12]\n"
-    "fmin v5.4s, v5.4s, v22.4s\n"
-    "ldr s23, [x21, %[input_col_stride1]]\n"
-    "fmax v12.4s, v12.4s, v20.4s\n"
-    "add x28, x28, #4\n"
-    "str s5, [%[outptr0]]\n"
-    "fmla v10.4s, v17.4s, v7.4s\n"
-    "fmin v12.4s, v12.4s, v22.4s\n"
-    "ldr s8, [%[wbptr], #32]\n"
-    "fmax v11.4s, v11.4s, v20.4s\n"
-    "ldr s16, [x28]\n"
-    "str s12, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v10.4s, v13.4s, v9.4s\n"
-    "fmin v11.4s, v11.4s, v22.4s\n"
-    "ldr s7, [%[wbptr], #24]\n"
-    "mov v5.16b, v14.16b\n"
-    "ldr s13, [x28, %[input_col_stride1]]\n"
-    "str s11, [x24]\n"
-    "fmax v10.4s, v10.4s, v20.4s\n"
-    "mov v11.16b, v14.16b\n"
-    "ldr s9, [%[wbptr], #36]\n"
-    "fmin v10.4s, v10.4s, v22.4s\n"
-    "add x27, x27, #4\n"
-    "mov v12.16b, v14.16b\n"
-    "ldr s17, [x27]\n"
-    "str s10, [x24, %[output_col_stride1]]\n"
-    "fmla v5.4s, v19.4s, v0.4s\n"
-    "mov v10.16b, v14.16b\n"
-    "add x22, x22, #4\n"
-    "fmla v11.4s, v16.4s, v0.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v5.4s, v15.4s, v6.4s\n"
-    "add x24, x24, #4\n"
-    "subs x19, x19, #1\n"
-    "fmla v5.4s, v21.4s, v1.4s\n"
-    "fmla v5.4s, v16.4s, v4.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v5.4s, v23.4s, v3.4s\n"
-    "ldr s21, [x21, x23]\n"
-    "fmla v12.4s, v18.4s, v0.4s\n"
-    "ldr s20, [%[inptr0], x26]\n"
-    "fmla v11.4s, v17.4s, v6.4s\n"
-    "ldr s19, [x22]\n"
-    "fmla v5.4s, v18.4s, v2.4s\n"
-    "ldr s15, [x27, %[input_col_stride1]]\n"
-    "fmla v12.4s, v21.4s, v6.4s\n"
-    "ldr s16, [x28, x23]\n"
-    "fmla v11.4s, v13.4s, v1.4s\n"
-    "ldr s17, [x21, x26]\n"
-    "fmla v5.4s, v13.4s, v8.4s\n"
-    "ldr s14, [%[inptr0], x25]\n"
-    "fmla v12.4s, v20.4s, v1.4s\n"
-    "ldr s20, [x22, %[input_col_stride1]]\n"
-    "fmla v11.4s, v19.4s, v4.4s\n"
-    "ldr s19, [x27, x23]\n"
-    "fmla v5.4s, v21.4s, v7.4s\n"
-    "ldr s22, [x28, x26]\n"
-    "fmla v12.4s, v16.4s, v4.4s\n"
-    "ldr s21, [x21, x25]\n"
-    "fmla v11.4s, v15.4s, v3.4s\n"
-    "ldr s23, [x22, x23]\n"
-    "fmla v5.4s, v16.4s, v9.4s\n"
-    "ldr s18, [x27, x26]\n"
-    "fmla v10.4s, v16.4s, v0.4s\n"
-    "ldr s15, [x28, x25]\n"
-    "fmla v11.4s, v16.4s, v2.4s\n"
-    "ldr s16, [x22, x26]\n"
-    "fmla v12.4s, v17.4s, v3.4s\n"
-    "ldr s17, [x27, x25]\n"
-    "fmla v10.4s, v19.4s, v6.4s\n"
-    "ldr s13, [x22, x25]\n"
-    "fmla v11.4s, v20.4s, v8.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v12.4s, v14.4s, v2.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v10.4s, v22.4s, v1.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v11.4s, v19.4s, v7.4s\n"
-    "add x21, x21, #4\n"
-    "fmla v12.4s, v22.4s, v8.4s\n"
-    "add x28, x28, #4\n"
-    "fmla v10.4s, v23.4s, v4.4s\n"
-    "add x27, x27, #4\n"
-    "fmla v11.4s, v23.4s, v9.4s\n"
-    "add x22, x22, #4\n"
-    "fmla v12.4s, v21.4s, v7.4s\n"
-    "movi v20.16b, #0\n"
-    "fmla v10.4s, v18.4s, v3.4s\n"
-    "fmov v22.4s, #6.0\n"
-    "fmax v5.4s, v5.4s, v20.4s\n"
-    "fmax v11.4s, v11.4s, v20.4s\n"
-    "fmla v12.4s, v15.4s, v9.4s\n"
-    "fmla v10.4s, v15.4s, v2.4s\n"
-    "fmin v5.4s, v5.4s, v22.4s\n"
-    "fmin v11.4s, v11.4s, v22.4s\n"
-    "fmax v12.4s, v12.4s, v20.4s\n"
-    "str s5, [%[outptr0]]\n"
-    "str s11, [x24]\n"
-    "fmla v10.4s, v16.4s, v8.4s\n"
-    "fmin v12.4s, v12.4s, v22.4s\n"
-    "str s12, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v10.4s, v17.4s, v7.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v10.4s, v13.4s, v9.4s\n"
-    "fmax v10.4s, v10.4s, v20.4s\n"
-    "fmin v10.4s, v10.4s, v22.4s\n"
-    "str s10, [x24, %[output_col_stride1]]\n"
-    "add x24, x24, #4\n"
-    "7:\n"
-    : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
-    : [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-  float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
-)
-{
-  __asm __volatile(
-    "mov x27, xzr\n"
-    "mov x28, xzr\n"
-    "and x26, %[n_channels], #3\n"
-    "lsr x25, %[n_channels], #2\n"
-    "cbz x25, 4f\n"
-    "1:\n"
-    "ldr q15, [%[wbptr]]\n"
-    "ldr x21, [%[inptrs], 0]\n"
-    "mov v8.16b, v15.16b\n"
-    "ldr q14, [%[wbptr], #16]\n"
-    "mov v3.16b, v15.16b\n"
-    "ldr q10, [%[wbptr], #32]\n"
-    "mov v2.16b, v15.16b\n"
-    "ldr q7, [%[wbptr], #48]\n"
-    "mov v4.16b, v15.16b\n"
-    "ldr q13, [%[wbptr], #64]\n"
-    "ldr q5, [%[wbptr], #80]\n"
-    "ldr x22, [%[inptrs], 40]\n"
-    "ldr q0, [%[wbptr], #96]\n"
-    "ldr x20, [%[inptrs], 80]\n"
-    "ldr q9, [%[wbptr], #112]\n"
-    "ldr x23, [%[inptrs], 120]\n"
-    "ldr q6, [%[wbptr], #128]\n"
-    "subs x25, x25, #1\n"
-    "ldr q1, [%[wbptr], #144]\n"
-    "ldr q17, [x21, x27]\n"
-    "fmla v8.4s, v17.4s, v14.4s\n"
-    "ldr q18, [x22, x27]\n"
-    "ldr q16, [x20, x27]\n"
-    "ldr x21, [%[inptrs], 8]\n"
-    "ldr q17, [x23, x27]\n"
-    "ldr x22, [%[inptrs], 48]\n"
-    "ldr q11, [x21, x27]\n"
-    "ldr x20, [%[inptrs], 88]\n"
-    "fmla v8.4s, v18.4s, v13.4s\n"
-    "ldr q19, [x22, x27]\n"
-    "ldr q15, [x20, x27]\n"
-    "ldr x21, [%[inptrs], 16]\n"
-    "ldr q12, [x21, x27]\n"
-    "fmla v8.4s, v11.4s, v10.4s\n"
-    "fmla v8.4s, v16.4s, v9.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v3.4s, v16.4s, v14.4s\n"
-    "ldr x22, [%[inptrs], 56]\n"
-    "fmla v8.4s, v19.4s, v5.4s\n"
-    "ldr x21, [%[inptrs], 24]\n"
-    "fmla v2.4s, v12.4s, v14.4s\n"
-    "ldr q16, [x22, x27]\n"
-    "movi v11.16b, #0\n"
-    "ldr q18, [x21, x27]\n"
-    "fmla v3.4s, v17.4s, v13.4s\n"
-    "ldr x20, [%[inptrs], 160]\n"
-    "fmla v8.4s, v12.4s, v7.4s\n"
-    "ldr x23, [%[inptrs], 128]\n"
-    "fmla v2.4s, v16.4s, v13.4s\n"
-    "ldr q19, [x20, x27]\n"
-    "fmov v12.4s, #6.0\n"
-    "ldr q17, [x23, x27]\n"
-    "fmla v3.4s, v15.4s, v10.4s\n"
-    "ldr x20, [%[inptrs], 96]\n"
-    "fmla v8.4s, v15.4s, v6.4s\n"
-    "ldr x22, [%[inptrs], 64]\n"
-    "fmla v2.4s, v18.4s, v10.4s\n"
-    "ldr q15, [x20, x27]\n"
-    "fmla v4.4s, v15.4s, v14.4s\n"
-    "ldr q18, [x22, x27]\n"
-    "fmla v3.4s, v19.4s, v9.4s\n"
-    "ldr x21, [%[inptrs], 32]\n"
-    "fmla v8.4s, v16.4s, v0.4s\n"
-    "ldr x20, [%[inptrs], 168]\n"
-    "fmla v2.4s, v15.4s, v9.4s\n"
-    "ldr q19, [x21, x27]\n"
-    "ldr q16, [x20, x27]\n"
-    "ldr x23, [%[inptrs], 136]\n"
-    "fmla v3.4s, v17.4s, v5.4s\n"
-    "ldr x20, [%[inptrs], 104]\n"
-    "fmla v8.4s, v15.4s, v1.4s\n"
-    "ldr q14, [x23, x27]\n"
-    "fmla v2.4s, v18.4s, v5.4s\n"
-    "ldr q17, [x20, x27]\n"
-    "fmla v4.4s, v14.4s, v13.4s\n"
-    "ldr x22, [%[inptrs], 72]\n"
-    "fmla v3.4s, v15.4s, v7.4s\n"
-    "ldr x20, [%[inptrs], 176]\n"
-    "fmax v8.4s, v8.4s, v11.4s\n"
-    "ldr q18, [x22, x27]\n"
-    "fmla v2.4s, v19.4s, v7.4s\n"
-    "ldr q13, [x20, x27]\n"
-    "fmla v4.4s, v17.4s, v10.4s\n"
-    "ldr x23, [%[inptrs], 144]\n"
-    "fmla v3.4s, v16.4s, v6.4s\n"
-    "ldr x20, [%[inptrs], 112]\n"
-    "fmin v8.4s, v8.4s, v12.4s\n"
-    "ldr q10, [x23, x27]\n"
-    "fmla v2.4s, v17.4s, v6.4s\n"
-    "ldr q15, [x20, x27]\n"
-    "fmla v4.4s, v13.4s, v9.4s\n"
-    "ldr x20, [%[inptrs], 184]\n"
-    "fmla v3.4s, v14.4s, v0.4s\n"
-    "ldr x23, [%[inptrs], 152]\n"
-    "ldr q9, [x20, x27]\n"
-    "ldr x22, [%[outptrs], 0]\n"
-    "fmla v2.4s, v18.4s, v0.4s\n"
-    "ldr q19, [x23, x27]\n"
-    "str q8, [x22, x28]\n"
-    "fmla v4.4s, v10.4s, v5.4s\n"
-    "fmla v3.4s, v13.4s, v1.4s\n"
-    "ldr x20, [%[inptrs], 192]\n"
-    "ldr x22, [%[outptrs], 8]\n"
-    "ldr x24, [%[outptrs], 16]\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v2.4s, v15.4s, v1.4s\n"
-    "ldr q16, [x20, x27]\n"
-    "fmla v4.4s, v15.4s, v7.4s\n"
-    "ldr q15, [%[wbptr]]\n"
-    "fmax v3.4s, v3.4s, v11.4s\n"
-    "ldr q14, [%[wbptr], #16]\n"
-    "mov v8.16b, v15.16b\n"
-    "ldr q10, [%[wbptr], #32]\n"
-    "fmax v2.4s, v2.4s, v11.4s\n"
-    "ldr q13, [%[wbptr], #64]\n"
-    "fmla v4.4s, v9.4s, v6.4s\n"
-    "ldr q7, [%[wbptr], #48]\n"
-    "fmin v3.4s, v3.4s, v12.4s\n"
-    "ldr q5, [%[wbptr], #80]\n"
-    "fmin v2.4s, v2.4s, v12.4s\n"
-    "ldr q9, [%[wbptr], #112]\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "add x27, x27, #16\n"
-    "str q3, [x24, x28]\n"
-    "fmla v4.4s, v19.4s, v0.4s\n"
-    "str q2, [x22, x28]\n"
-    "mov v3.16b, v15.16b\n"
-    "mov v2.16b, v15.16b\n"
-    "ldr q6, [%[wbptr], #128]\n"
-    "ldr x24, [%[outptrs], 24]\n"
-    "ldr x21, [%[inptrs], 0]\n"
-    "ldr x22, [%[inptrs], 40]\n"
-    "fmla v4.4s, v16.4s, v1.4s\n"
-    "ldr q0, [%[wbptr], #96]\n"
-    "ldr q17, [x21, x27]\n"
-    "ldr x20, [%[inptrs], 80]\n"
-    "fmla v8.4s, v17.4s, v14.4s\n"
-    "ldr q18, [x22, x27]\n"
-    "ldr q16, [x20, x27]\n"
-    "ldr x21, [%[inptrs], 8]\n"
-    "fmax v4.4s, v4.4s, v11.4s\n"
-    "ldr q1, [%[wbptr], #144]\n"
-    "ldr q11, [x21, x27]\n"
-    "ldr x22, [%[inptrs], 48]\n"
-    "fmla v8.4s, v18.4s, v13.4s\n"
-    "ldr x21, [%[inptrs], 16]\n"
-    "fmin v4.4s, v4.4s, v12.4s\n"
-    "ldr q19, [x22, x27]\n"
-    "ldr q12, [x21, x27]\n"
-    "ldr x23, [%[inptrs], 120]\n"
-    "ldr x20, [%[inptrs], 88]\n"
-    "subs x25, x25, #1\n"
-    "str q4, [x24, x28]\n"
-    "mov v4.16b, v15.16b\n"
-    "ldr q17, [x23, x27]\n"
-    "fmla v8.4s, v11.4s, v10.4s\n"
-    "ldr q15, [x20, x27]\n"
-    "add x28, x28, #16\n"
-    "fmla v8.4s, v16.4s, v9.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v3.4s, v16.4s, v14.4s\n"
-    "ldr x22, [%[inptrs], 56]\n"
-    "fmla v8.4s, v19.4s, v5.4s\n"
-    "ldr x21, [%[inptrs], 24]\n"
-    "fmla v2.4s, v12.4s, v14.4s\n"
-    "ldr q16, [x22, x27]\n"
-    "movi v11.16b, #0\n"
-    "ldr q18, [x21, x27]\n"
-    "fmla v3.4s, v17.4s, v13.4s\n"
-    "ldr x20, [%[inptrs], 160]\n"
-    "fmla v8.4s, v12.4s, v7.4s\n"
-    "ldr x23, [%[inptrs], 128]\n"
-    "fmla v2.4s, v16.4s, v13.4s\n"
-    "ldr q19, [x20, x27]\n"
-    "fmov v12.4s, #6.0\n"
-    "ldr q17, [x23, x27]\n"
-    "fmla v3.4s, v15.4s, v10.4s\n"
-    "ldr x20, [%[inptrs], 96]\n"
-    "fmla v8.4s, v15.4s, v6.4s\n"
-    "ldr x22, [%[inptrs], 64]\n"
-    "fmla v2.4s, v18.4s, v10.4s\n"
-    "ldr q15, [x20, x27]\n"
-    "fmla v4.4s, v15.4s, v14.4s\n"
-    "ldr q18, [x22, x27]\n"
-    "fmla v3.4s, v19.4s, v9.4s\n"
-    "ldr x21, [%[inptrs], 32]\n"
-    "fmla v8.4s, v16.4s, v0.4s\n"
-    "ldr x20, [%[inptrs], 168]\n"
-    "fmla v2.4s, v15.4s, v9.4s\n"
-    "ldr q19, [x21, x27]\n"
-    "ldr q16, [x20, x27]\n"
-    "ldr x23, [%[inptrs], 136]\n"
-    "fmla v3.4s, v17.4s, v5.4s\n"
-    "ldr x20, [%[inptrs], 104]\n"
-    "fmla v8.4s, v15.4s, v1.4s\n"
-    "ldr q14, [x23, x27]\n"
-    "fmla v2.4s, v18.4s, v5.4s\n"
-    "ldr q17, [x20, x27]\n"
-    "fmla v4.4s, v14.4s, v13.4s\n"
-    "ldr x22, [%[inptrs], 72]\n"
-    "fmla v3.4s, v15.4s, v7.4s\n"
-    "ldr x20, [%[inptrs], 176]\n"
-    "fmax v8.4s, v8.4s, v11.4s\n"
-    "ldr q18, [x22, x27]\n"
-    "fmla v2.4s, v19.4s, v7.4s\n"
-    "ldr q13, [x20, x27]\n"
-    "fmla v4.4s, v17.4s, v10.4s\n"
-    "ldr x23, [%[inptrs], 144]\n"
-    "fmla v3.4s, v16.4s, v6.4s\n"
-    "ldr x20, [%[inptrs], 112]\n"
-    "fmin v8.4s, v8.4s, v12.4s\n"
-    "ldr q10, [x23, x27]\n"
-    "fmla v2.4s, v17.4s, v6.4s\n"
-    "ldr q15, [x20, x27]\n"
-    "fmla v4.4s, v13.4s, v9.4s\n"
-    "ldr x20, [%[inptrs], 184]\n"
-    "fmla v3.4s, v14.4s, v0.4s\n"
-    "ldr x23, [%[inptrs], 152]\n"
-    "ldr q9, [x20, x27]\n"
-    "ldr x22, [%[outptrs], 0]\n"
-    "fmla v2.4s, v18.4s, v0.4s\n"
-    "ldr q19, [x23, x27]\n"
-    "str q8, [x22, x28]\n"
-    "fmla v4.4s, v10.4s, v5.4s\n"
-    "fmla v3.4s, v13.4s, v1.4s\n"
-    "ldr x20, [%[inptrs], 192]\n"
-    "ldr x22, [%[outptrs], 8]\n"
-    "ldr x24, [%[outptrs], 16]\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v2.4s, v15.4s, v1.4s\n"
-    "ldr q16, [x20, x27]\n"
-    "fmla v4.4s, v15.4s, v7.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmax v3.4s, v3.4s, v11.4s\n"
-    "add x27, x27, #16\n"
-    "fmax v2.4s, v2.4s, v11.4s\n"
-    "fmla v4.4s, v9.4s, v6.4s\n"
-    "fmin v3.4s, v3.4s, v12.4s\n"
-    "fmin v2.4s, v2.4s, v12.4s\n"
-    "str q3, [x24, x28]\n"
-    "fmla v4.4s, v19.4s, v0.4s\n"
-    "str q2, [x22, x28]\n"
-    "ldr x24, [%[outptrs], 24]\n"
-    "fmla v4.4s, v16.4s, v1.4s\n"
-    "fmax v4.4s, v4.4s, v11.4s\n"
-    "fmin v4.4s, v4.4s, v12.4s\n"
-    "str q4, [x24, x28]\n"
-    "add x28, x28, #16\n"
-    "4:\n"
-    "cbz x26, 7f\n"
-    "ldr s15, [%[wbptr]]\n"
-    "mov v8.16b, v15.16b\n"
-    "ldr s14, [%[wbptr], #4]\n"
-    "mov v3.16b, v15.16b\n"
-    "ldr s10, [%[wbptr], #8]\n"
-    "mov v2.16b, v15.16b\n"
-    "ldr s7, [%[wbptr], #12]\n"
-    "mov v4.16b, v15.16b\n"
-    "ldr s13, [%[wbptr], #16]\n"
-    "ldr s5, [%[wbptr], #20]\n"
-    "ldr x21, [%[inptrs], 0]\n"
-    "ldr s0, [%[wbptr], #24]\n"
-    "ldr x22, [%[inptrs], 40]\n"
-    "ldr s9, [%[wbptr], #28]\n"
-    "ldr x20, [%[inptrs], 80]\n"
-    "ldr s6, [%[wbptr], #32]\n"
-    "ldr x23, [%[inptrs], 120]\n"
-    "ldr s1, [%[wbptr], #36]\n"
-    "subs x26, x26, #1\n"
-    "ldr s17, [x21, x27]\n"
-    "ldr s18, [x22, x27]\n"
-    "fmla v8.4s, v17.4s, v14.4s\n"
-    "ldr s16, [x20, x27]\n"
-    "ldr s17, [x23, x27]\n"
-    "ldr x21, [%[inptrs], 8]\n"
-    "ldr x22, [%[inptrs], 48]\n"
-    "ldr x20, [%[inptrs], 88]\n"
-    "ldr s11, [x21, x27]\n"
-    "fmla v8.4s, v18.4s, v13.4s\n"
-    "ldr s19, [x22, x27]\n"
-    "ldr s15, [x20, x27]\n"
-    "ldr x21, [%[inptrs], 16]\n"
-    "ldr s12, [x21, x27]\n"
-    "fmla v8.4s, v11.4s, v10.4s\n"
-    "fmla v8.4s, v16.4s, v9.4s\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v3.4s, v16.4s, v14.4s\n"
-    "ldr x22, [%[inptrs], 56]\n"
-    "fmla v8.4s, v19.4s, v5.4s\n"
-    "ldr x21, [%[inptrs], 24]\n"
-    "fmla v2.4s, v12.4s, v14.4s\n"
-    "ldr s16, [x22, x27]\n"
-    "movi v11.16b, #0\n"
-    "ldr s18, [x21, x27]\n"
-    "fmla v3.4s, v17.4s, v13.4s\n"
-    "ldr x20, [%[inptrs], 160]\n"
-    "fmla v8.4s, v12.4s, v7.4s\n"
-    "ldr x23, [%[inptrs], 128]\n"
-    "fmla v2.4s, v16.4s, v13.4s\n"
-    "ldr s19, [x20, x27]\n"
-    "fmov v12.4s, #6.0\n"
-    "ldr s17, [x23, x27]\n"
-    "fmla v3.4s, v15.4s, v10.4s\n"
-    "ldr x20, [%[inptrs], 96]\n"
-    "fmla v8.4s, v15.4s, v6.4s\n"
-    "ldr x22, [%[inptrs], 64]\n"
-    "fmla v2.4s, v18.4s, v10.4s\n"
-    "ldr s15, [x20, x27]\n"
-    "fmla v4.4s, v15.4s, v14.4s\n"
-    "ldr s18, [x22, x27]\n"
-    "fmla v3.4s, v19.4s, v9.4s\n"
-    "ldr x21, [%[inptrs], 32]\n"
-    "fmla v8.4s, v16.4s, v0.4s\n"
-    "ldr x20, [%[inptrs], 168]\n"
-    "fmla v2.4s, v15.4s, v9.4s\n"
-    "ldr s19, [x21, x27]\n"
-    "ldr s16, [x20, x27]\n"
-    "ldr x23, [%[inptrs], 136]\n"
-    "fmla v3.4s, v17.4s, v5.4s\n"
-    "ldr x20, [%[inptrs], 104]\n"
-    "fmla v8.4s, v15.4s, v1.4s\n"
-    "ldr s14, [x23, x27]\n"
-    "fmla v2.4s, v18.4s, v5.4s\n"
-    "ldr s17, [x20, x27]\n"
-    "fmla v4.4s, v14.4s, v13.4s\n"
-    "ldr x22, [%[inptrs], 72]\n"
-    "fmla v3.4s, v15.4s, v7.4s\n"
-    "ldr x20, [%[inptrs], 176]\n"
-    "fmax v8.4s, v8.4s, v11.4s\n"
-    "ldr s18, [x22, x27]\n"
-    "fmla v2.4s, v19.4s, v7.4s\n"
-    "ldr s13, [x20, x27]\n"
-    "fmla v4.4s, v17.4s, v10.4s\n"
-    "ldr x23, [%[inptrs], 144]\n"
-    "fmla v3.4s, v16.4s, v6.4s\n"
-    "ldr x20, [%[inptrs], 112]\n"
-    "fmin v8.4s, v8.4s, v12.4s\n"
-    "ldr s10, [x23, x27]\n"
-    "fmla v2.4s, v17.4s, v6.4s\n"
-    "ldr s15, [x20, x27]\n"
-    "fmla v4.4s, v13.4s, v9.4s\n"
-    "ldr x20, [%[inptrs], 184]\n"
-    "fmla v3.4s, v14.4s, v0.4s\n"
-    "ldr x23, [%[inptrs], 152]\n"
-    "ldr s9, [x20, x27]\n"
-    "ldr x22, [%[outptrs], 0]\n"
-    "fmla v2.4s, v18.4s, v0.4s\n"
-    "ldr s19, [x23, x27]\n"
-    "str s8, [x22, x28]\n"
-    "fmla v4.4s, v10.4s, v5.4s\n"
-    "fmla v3.4s, v13.4s, v1.4s\n"
-    "ldr x20, [%[inptrs], 192]\n"
-    "ldr x22, [%[outptrs], 8]\n"
-    "ldr x24, [%[outptrs], 16]\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v2.4s, v15.4s, v1.4s\n"
-    "ldr s16, [x20, x27]\n"
-    "fmla v4.4s, v15.4s, v7.4s\n"
-    "ldr s15, [%[wbptr]]\n"
-    "fmax v3.4s, v3.4s, v11.4s\n"
-    "ldr s14, [%[wbptr], #4]\n"
-    "mov v8.16b, v15.16b\n"
-    "ldr s10, [%[wbptr], #8]\n"
-    "fmax v2.4s, v2.4s, v11.4s\n"
-    "ldr s13, [%[wbptr], #16]\n"
-    "fmla v4.4s, v9.4s, v6.4s\n"
-    "ldr s7, [%[wbptr], #12]\n"
-    "fmin v3.4s, v3.4s, v12.4s\n"
-    "ldr s5, [%[wbptr], #20]\n"
-    "fmin v2.4s, v2.4s, v12.4s\n"
-    "ldr s9, [%[wbptr], #28]\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "add x27, x27, #4\n"
-    "str s3, [x24, x28]\n"
-    "fmla v4.4s, v19.4s, v0.4s\n"
-    "str s2, [x22, x28]\n"
-    "mov v3.16b, v15.16b\n"
-    "mov v2.16b, v15.16b\n"
-    "ldr s6, [%[wbptr], #32]\n"
-    "ldr x24, [%[outptrs], 24]\n"
-    "ldr x21, [%[inptrs], 0]\n"
-    "ldr x22, [%[inptrs], 40]\n"
-    "fmla v4.4s, v16.4s, v1.4s\n"
-    "ldr s0, [%[wbptr], #24]\n"
-    "ldr s17, [x21, x27]\n"
-    "ldr x20, [%[inptrs], 80]\n"
-    "fmla v8.4s, v17.4s, v14.4s\n"
-    "ldr s18, [x22, x27]\n"
-    "ldr s16, [x20, x27]\n"
-    "ldr x21, [%[inptrs], 8]\n"
-    "fmax v4.4s, v4.4s, v11.4s\n"
-    "ldr s1, [%[wbptr], #36]\n"
-    "ldr s11, [x21, x27]\n"
-    "ldr x22, [%[inptrs], 48]\n"
-    "fmla v8.4s, v18.4s, v13.4s\n"
-    "ldr x21, [%[inptrs], 16]\n"
-    "fmin v4.4s, v4.4s, v12.4s\n"
-    "ldr s19, [x22, x27]\n"
-    "ldr s12, [x21, x27]\n"
-    "ldr x23, [%[inptrs], 120]\n"
-    "ldr x20, [%[inptrs], 88]\n"
-    "subs x26, x26, #1\n"
-    "str s4, [x24, x28]\n"
-    "mov v4.16b, v15.16b\n"
-    "ldr s17, [x23, x27]\n"
-    "fmla v8.4s, v11.4s, v10.4s\n"
-    "ldr s15, [x20, x27]\n"
-    "add x28, x28, #4\n"
-    "fmla v8.4s, v16.4s, v9.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v3.4s, v16.4s, v14.4s\n"
-    "ldr x22, [%[inptrs], 56]\n"
-    "fmla v8.4s, v19.4s, v5.4s\n"
-    "ldr x21, [%[inptrs], 24]\n"
-    "fmla v2.4s, v12.4s, v14.4s\n"
-    "ldr s16, [x22, x27]\n"
-    "movi v11.16b, #0\n"
-    "ldr s18, [x21, x27]\n"
-    "fmla v3.4s, v17.4s, v13.4s\n"
-    "ldr x20, [%[inptrs], 160]\n"
-    "fmla v8.4s, v12.4s, v7.4s\n"
-    "ldr x23, [%[inptrs], 128]\n"
-    "fmla v2.4s, v16.4s, v13.4s\n"
-    "ldr s19, [x20, x27]\n"
-    "fmov v12.4s, #6.0\n"
-    "ldr s17, [x23, x27]\n"
-    "fmla v3.4s, v15.4s, v10.4s\n"
-    "ldr x20, [%[inptrs], 96]\n"
-    "fmla v8.4s, v15.4s, v6.4s\n"
-    "ldr x22, [%[inptrs], 64]\n"
-    "fmla v2.4s, v18.4s, v10.4s\n"
-    "ldr s15, [x20, x27]\n"
-    "fmla v4.4s, v15.4s, v14.4s\n"
-    "ldr s18, [x22, x27]\n"
-    "fmla v3.4s, v19.4s, v9.4s\n"
-    "ldr x21, [%[inptrs], 32]\n"
-    "fmla v8.4s, v16.4s, v0.4s\n"
-    "ldr x20, [%[inptrs], 168]\n"
-    "fmla v2.4s, v15.4s, v9.4s\n"
-    "ldr s19, [x21, x27]\n"
-    "ldr s16, [x20, x27]\n"
-    "ldr x23, [%[inptrs], 136]\n"
-    "fmla v3.4s, v17.4s, v5.4s\n"
-    "ldr x20, [%[inptrs], 104]\n"
-    "fmla v8.4s, v15.4s, v1.4s\n"
-    "ldr s14, [x23, x27]\n"
-    "fmla v2.4s, v18.4s, v5.4s\n"
-    "ldr s17, [x20, x27]\n"
-    "fmla v4.4s, v14.4s, v13.4s\n"
-    "ldr x22, [%[inptrs], 72]\n"
-    "fmla v3.4s, v15.4s, v7.4s\n"
-    "ldr x20, [%[inptrs], 176]\n"
-    "fmax v8.4s, v8.4s, v11.4s\n"
-    "ldr s18, [x22, x27]\n"
-    "fmla v2.4s, v19.4s, v7.4s\n"
-    "ldr s13, [x20, x27]\n"
-    "fmla v4.4s, v17.4s, v10.4s\n"
-    "ldr x23, [%[inptrs], 144]\n"
-    "fmla v3.4s, v16.4s, v6.4s\n"
-    "ldr x20, [%[inptrs], 112]\n"
-    "fmin v8.4s, v8.4s, v12.4s\n"
-    "ldr s10, [x23, x27]\n"
-    "fmla v2.4s, v17.4s, v6.4s\n"
-    "ldr s15, [x20, x27]\n"
-    "fmla v4.4s, v13.4s, v9.4s\n"
-    "ldr x20, [%[inptrs], 184]\n"
-    "fmla v3.4s, v14.4s, v0.4s\n"
-    "ldr x23, [%[inptrs], 152]\n"
-    "ldr s9, [x20, x27]\n"
-    "ldr x22, [%[outptrs], 0]\n"
-    "fmla v2.4s, v18.4s, v0.4s\n"
-    "ldr s19, [x23, x27]\n"
-    "str s8, [x22, x28]\n"
-    "fmla v4.4s, v10.4s, v5.4s\n"
-    "fmla v3.4s, v13.4s, v1.4s\n"
-    "ldr x20, [%[inptrs], 192]\n"
-    "ldr x22, [%[outptrs], 8]\n"
-    "ldr x24, [%[outptrs], 16]\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v2.4s, v15.4s, v1.4s\n"
-    "ldr s16, [x20, x27]\n"
-    "fmla v4.4s, v15.4s, v7.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmax v3.4s, v3.4s, v11.4s\n"
-    "add x27, x27, #4\n"
-    "fmax v2.4s, v2.4s, v11.4s\n"
-    "fmla v4.4s, v9.4s, v6.4s\n"
-    "fmin v3.4s, v3.4s, v12.4s\n"
-    "fmin v2.4s, v2.4s, v12.4s\n"
-    "str s3, [x24, x28]\n"
-    "fmla v4.4s, v19.4s, v0.4s\n"
-    "str s2, [x22, x28]\n"
-    "ldr x24, [%[outptrs], 24]\n"
-    "fmla v4.4s, v16.4s, v1.4s\n"
-    "fmax v4.4s, v4.4s, v11.4s\n"
-    "fmin v4.4s, v4.4s, v12.4s\n"
-    "str s4, [x24, x28]\n"
-    "add x28, x28, #4\n"
-    "7:\n"
-    : [wbptr] "+r" (weight_bias_ptr)
-    : [inptrs] "r" (inptrs), [outptrs] "r" (outptrs), [n_channels] "r" ((long) n_channels)
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
-#endif  // __aarch64__
-
-template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
-
-}  // namespace depthwise
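For orientation between the two deleted files: the kernels above instantiate DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float> (reading the template parameters as output tile, kernel size and stride: a 2x2 output tile, 3x3 kernel, stride 2), while the file removed below provides the <3, 3, 3, 3, 1, 1> variants. Each tile kernel computes the arithmetic that the scalar reference below sketches, with the ReLU6 activation expressed in the listings as fmax against zero followed by fmin against 6.0. This is illustrative code only, not part of the library; names and the element-stride convention are assumptions:

#include <algorithm>

// Scalar reference for one channel of a 3x3 depthwise convolution with ReLU6.
// Strides are in elements; weights hold the 9 taps row-major.
static void depthwise3x3_relu6_ref(
    const float *input, int in_row_stride, int in_col_stride,
    const float *weights, float bias,
    float *output, int out_row_stride, int out_col_stride,
    int out_rows, int out_cols)
{
  for (int r = 0; r < out_rows; r++)
  {
    for (int c = 0; c < out_cols; c++)
    {
      float acc = bias;  // the "mov vX.16b, vBias.16b" at the top of each tile
      for (int kr = 0; kr < 3; kr++)
      {
        for (int kc = 0; kc < 3; kc++)
        {
          acc += input[(r + kr) * in_row_stride + (c + kc) * in_col_stride]
               * weights[kr * 3 + kc];   // one fmla per tap
        }
      }
      acc = std::max(acc, 0.0f);         // fmax vX.4s, vX.4s, vZero.4s
      acc = std::min(acc, 6.0f);         // fmin vX.4s, vX.4s, vSix.4s
      output[r * out_row_stride + c * out_col_stride] = acc;
    }
  }
}

The assembly versions differ only in scheduling: each fmla in the listings corresponds to one tap of this loop nest, vectorised over four channels at a time.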
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
deleted file mode 100644
index 2142c43..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ /dev/null
@@ -1,2341 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x20, %[inptr0], %[input_row_stride]\n"
-    "add x13, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x24, %[outptr0], %[output_row_stride]\n"
-    "add x21, x20, %[input_row_stride]\n"
-    "add x14, x13, #64\n"
-    "add x15, x13, %[input_col_stride1]\n"
-    "add x22, x21, %[input_row_stride]\n"
-    "add x16, x15, #64\n"
-    "add x17, x15, %[input_col_stride1]\n"
-    "add x23, x22, %[input_row_stride]\n"
-    "add x9, x17, #64\n"
-    "add x25, x24, %[output_row_stride]\n"
-    "add x26, %[output_col_stride1], %[output_col_stride1]\n"
-    "and x27, %[n_channels], #3\n"
-    "lsr x28, %[n_channels], #2\n"
-    "cbz x28, 4f\n"
-    "1:\n"
-    "ldr q25, [%[wbptr]]\n"
-    "subs x28, x28, #1\n"
-    "mov v17.16b, v25.16b\n"
-    "ldr q16, [%[wbptr], #16]\n"
-    "mov v13.16b, v25.16b\n"
-    "ldr q7, [%[wbptr], #32]\n"
-    "mov v15.16b, v25.16b\n"
-    "ldr q6, [%[wbptr], #48]\n"
-    "mov v10.16b, v25.16b\n"
-    "ldr q5, [%[wbptr], #64]\n"
-    "mov v12.16b, v25.16b\n"
-    "ldr q4, [%[wbptr], #80]\n"
-    "mov v14.16b, v25.16b\n"
-    "ldr q3, [%[wbptr], #96]\n"
-    "mov v9.16b, v25.16b\n"
-    "ldr q2, [%[wbptr], #112]\n"
-    "mov v11.16b, v25.16b\n"
-    "ldr q1, [%[wbptr], #128]\n"
-    "mov v8.16b, v25.16b\n"
-    "ldr q0, [%[wbptr], #144]\n"
-    "ldr q26, [%[inptr0]]\n"
-    "ldr q28, [x20]\n"
-    "fmla v17.4s, v26.4s, v16.4s\n"
-    "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v13.4s, v28.4s, v16.4s\n"
-    "ldr q27, [x21]\n"
-    "fmla v15.4s, v29.4s, v16.4s\n"
-    "ldr q21, [x20, %[input_col_stride1]]\n"
-    "fmla v17.4s, v28.4s, v5.4s\n"
-    "ldr q20, [%[inptr0], x13]\n"
-    "ldr q23, [x22]\n"
-    "ldr q19, [x21, %[input_col_stride1]]\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "prfm pldl1keep, [x20, #64]\n"
-    "fmla v17.4s, v29.4s, v7.4s\n"
-    "prfm pldl1keep, [%[inptr0], x19]\n"
-    "prfm pldl1keep, [x21, #64]\n"
-    "prfm pldl1keep, [x20, x19]\n"
-    "prfm pldl1keep, [%[inptr0], x14]\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "prfm pldl1keep, [x21, x19]\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v17.4s, v27.4s, v2.4s\n"
-    "ldr q30, [x20, x13]\n"
-    "fmla v13.4s, v27.4s, v5.4s\n"
-    "ldr q29, [%[inptr0], x15]\n"
-    "fmla v10.4s, v27.4s, v16.4s\n"
-    "ldr q28, [x23]\n"
-    "fmla v17.4s, v21.4s, v4.4s\n"
-    "ldr q24, [x22, %[input_col_stride1]]\n"
-    "fmla v13.4s, v21.4s, v7.4s\n"
-    "ldr q18, [x21, x13]\n"
-    "fmla v15.4s, v21.4s, v5.4s\n"
-    "prfm pldl1keep, [x20, x14]\n"
-    "fmla v12.4s, v21.4s, v16.4s\n"
-    "ldr q22, [x20, x15]\n"
-    "fmla v17.4s, v20.4s, v6.4s\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v15.4s, v20.4s, v7.4s\n"
-    "prfm pldl1keep, [x23, #64]\n"
-    "fmla v14.4s, v20.4s, v16.4s\n"
-    "ldr q25, [%[inptr0], x17]\n"
-    "fmla v13.4s, v23.4s, v2.4s\n"
-    "prfm pldl1keep, [x22, x19]\n"
-    "fmla v10.4s, v23.4s, v5.4s\n"
-    "ldr q26, [x23, %[input_col_stride1]]\n"
-    "fmla v17.4s, v19.4s, v1.4s\n"
-    "prfm pldl1keep, [x21, x14]\n"
-    "fmla v13.4s, v19.4s, v4.4s\n"
-    "prfm pldl1keep, [x20, x16]\n"
-    "fmla v15.4s, v19.4s, v2.4s\n"
-    "prfm pldl1keep, [%[inptr0], x9]\n"
-    "fmla v10.4s, v19.4s, v7.4s\n"
-    "prfm pldl1keep, [x23, x19]\n"
-    "fmla v12.4s, v19.4s, v5.4s\n"
-    "prfm pldl1keep, [x22, x14]\n"
-    "fmla v9.4s, v19.4s, v16.4s\n"
-    "ldr q27, [x22, x13]\n"
-    "fmla v17.4s, v30.4s, v3.4s\n"
-    "prfm pldl1keep, [x21, x16]\n"
-    "fmla v13.4s, v30.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, x9]\n"
-    "fmla v15.4s, v30.4s, v4.4s\n"
-    "prfm pldl1keep, [x23, x14]\n"
-    "fmla v12.4s, v30.4s, v7.4s\n"
-    "prfm pldl1keep, [x22, x16]\n"
-    "fmla v14.4s, v30.4s, v5.4s\n"
-    "prfm pldl1keep, [x21, x9]\n"
-    "fmla v11.4s, v30.4s, v16.4s\n"
-    "ldr q21, [x21, x15]\n"
-    "fmla v15.4s, v29.4s, v6.4s\n"
-    "prfm pldl1keep, [x23, x16]\n"
-    "fmla v14.4s, v29.4s, v7.4s\n"
-    "ldr q20, [x20, x17]\n"
-    "fmla v10.4s, v28.4s, v2.4s\n"
-    "ldr q19, [x23, x13]\n"
-    "fmla v13.4s, v24.4s, v1.4s\n"
-    "prfm pldl1keep, [x22, x9]\n"
-    "fmla v12.4s, v24.4s, v2.4s\n"
-    "prfm pldl1keep, [x23, x9]\n"
-    "fmla v10.4s, v24.4s, v4.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v9.4s, v24.4s, v5.4s\n"
-    "ldr q23, [x22, x15]\n"
-    "fmla v17.4s, v18.4s, v0.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v13.4s, v18.4s, v3.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v15.4s, v18.4s, v1.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "str q17, [%[outptr0]]\n"
-    "fmla v10.4s, v18.4s, v6.4s\n"
-    "fmla v12.4s, v18.4s, v4.4s\n"
-    "ldr q17, [x21, x17]\n"
-    "fmla v14.4s, v18.4s, v2.4s\n"
-    "prfm pldl1keep, [%[inptr0], x19]\n"
-    "fmla v9.4s, v18.4s, v7.4s\n"
-    "prfm pldl1keep, [%[inptr0], x14]\n"
-    "fmla v11.4s, v18.4s, v5.4s\n"
-    "add x20, x20, #16\n"
-    "fmla v8.4s, v18.4s, v16.4s\n"
-    "ldr q24, [x23, x15]\n"
-    "fmla v15.4s, v22.4s, v3.4s\n"
-    "ldr q18, [x22, x17]\n"
-    "fmla v12.4s, v22.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, #64]\n"
-    "fmla v14.4s, v22.4s, v4.4s\n"
-    "prfm pldl1keep, [x20, x19]\n"
-    "fmla v11.4s, v22.4s, v7.4s\n"
-    "ldr q22, [x23, x17]\n"
-    "fmla v10.4s, v26.4s, v1.4s\n"
-    "add x21, x21, #16\n"
-    "fmla v14.4s, v25.4s, v6.4s\n"
-    "ldr q25, [%[wbptr]]\n"
-    "fmla v9.4s, v26.4s, v2.4s\n"
-    "ldr q16, [%[wbptr], #16]\n"
-    "fmla v13.4s, v27.4s, v0.4s\n"
-    "prfm pldl1keep, [x21, #64]\n"
-    "fmla v10.4s, v27.4s, v3.4s\n"
-    "prfm pldl1keep, [x21, x19]\n"
-    "fmla v12.4s, v27.4s, v1.4s\n"
-    "add x22, x22, #16\n"
-    "str q13, [x24]\n"
-    "fmla v9.4s, v27.4s, v4.4s\n"
-    "fmla v11.4s, v27.4s, v2.4s\n"
-    "ldr q26, [%[inptr0]]\n"
-    "fmla v8.4s, v27.4s, v5.4s\n"
-    "ldr q28, [x20]\n"
-    "fmla v15.4s, v21.4s, v0.4s\n"
-    "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v12.4s, v21.4s, v3.4s\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "fmla v14.4s, v21.4s, v1.4s\n"
-    "add x23, x23, #16\n"
-    "str q15, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v9.4s, v21.4s, v6.4s\n"
-    "fmla v11.4s, v21.4s, v4.4s\n"
-    "ldr q5, [%[wbptr], #64]\n"
-    "fmla v8.4s, v21.4s, v7.4s\n"
-    "ldr q27, [x21]\n"
-    "fmla v14.4s, v20.4s, v3.4s\n"
-    "ldr q21, [x20, %[input_col_stride1]]\n"
-    "fmla v11.4s, v20.4s, v6.4s\n"
-    "ldr q20, [%[inptr0], x13]\n"
-    "fmla v10.4s, v19.4s, v0.4s\n"
-    "subs x28, x28, #1\n"
-    "fmla v9.4s, v19.4s, v1.4s\n"
-    "fmla v8.4s, v19.4s, v2.4s\n"
-    "fmla v12.4s, v23.4s, v0.4s\n"
-    "ldr q7, [%[wbptr], #32]\n"
-    "str q10, [x25]\n"
-    "fmla v11.4s, v23.4s, v1.4s\n"
-    "fmla v9.4s, v23.4s, v3.4s\n"
-    "ldr q2, [%[wbptr], #112]\n"
-    "str q12, [x24, %[output_col_stride1]]\n"
-    "fmla v8.4s, v23.4s, v4.4s\n"
-    "fmla v14.4s, v17.4s, v0.4s\n"
-    "ldr q23, [x22]\n"
-    "fmla v11.4s, v17.4s, v3.4s\n"
-    "ldr q19, [x21, %[input_col_stride1]]\n"
-    "fmla v8.4s, v17.4s, v6.4s\n"
-    "ldr q4, [%[wbptr], #80]\n"
-    "str q14, [%[outptr0], x26]\n"
-    "fmla v9.4s, v24.4s, v0.4s\n"
-    "fmla v11.4s, v18.4s, v0.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v8.4s, v24.4s, v1.4s\n"
-    "ldr q6, [%[wbptr], #48]\n"
-    "str q9, [x25, %[output_col_stride1]]\n"
-    "mov v17.16b, v25.16b\n"
-    "str q11, [x24, x26]\n"
-    "mov v13.16b, v25.16b\n"
-    "fmla v8.4s, v18.4s, v3.4s\n"
-    "ldr q1, [%[wbptr], #128]\n"
-    "mov v15.16b, v25.16b\n"
-    "add x24, x24, #16\n"
-    "mov v10.16b, v25.16b\n"
-    "mov v12.16b, v25.16b\n"
-    "fmla v8.4s, v22.4s, v0.4s\n"
-    "ldr q3, [%[wbptr], #96]\n"
-    "mov v14.16b, v25.16b\n"
-    "mov v9.16b, v25.16b\n"
-    "mov v11.16b, v25.16b\n"
-    "fmla v17.4s, v26.4s, v16.4s\n"
-    "str q8, [x25, x26]\n"
-    "fmla v13.4s, v28.4s, v16.4s\n"
-    "mov v8.16b, v25.16b\n"
-    "ldr q0, [%[wbptr], #144]\n"
-    "fmla v17.4s, v28.4s, v5.4s\n"
-    "fmla v15.4s, v29.4s, v16.4s\n"
-    "add x25, x25, #16\n"
-    "fmla v17.4s, v29.4s, v7.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v17.4s, v27.4s, v2.4s\n"
-    "ldr q30, [x20, x13]\n"
-    "fmla v13.4s, v27.4s, v5.4s\n"
-    "ldr q29, [%[inptr0], x15]\n"
-    "fmla v10.4s, v27.4s, v16.4s\n"
-    "ldr q28, [x23]\n"
-    "fmla v17.4s, v21.4s, v4.4s\n"
-    "ldr q24, [x22, %[input_col_stride1]]\n"
-    "fmla v13.4s, v21.4s, v7.4s\n"
-    "ldr q18, [x21, x13]\n"
-    "fmla v15.4s, v21.4s, v5.4s\n"
-    "prfm pldl1keep, [x20, x14]\n"
-    "fmla v12.4s, v21.4s, v16.4s\n"
-    "ldr q22, [x20, x15]\n"
-    "fmla v17.4s, v20.4s, v6.4s\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v15.4s, v20.4s, v7.4s\n"
-    "prfm pldl1keep, [x23, #64]\n"
-    "fmla v14.4s, v20.4s, v16.4s\n"
-    "ldr q25, [%[inptr0], x17]\n"
-    "fmla v13.4s, v23.4s, v2.4s\n"
-    "prfm pldl1keep, [x22, x19]\n"
-    "fmla v10.4s, v23.4s, v5.4s\n"
-    "ldr q26, [x23, %[input_col_stride1]]\n"
-    "fmla v17.4s, v19.4s, v1.4s\n"
-    "prfm pldl1keep, [x21, x14]\n"
-    "fmla v13.4s, v19.4s, v4.4s\n"
-    "prfm pldl1keep, [x20, x16]\n"
-    "fmla v15.4s, v19.4s, v2.4s\n"
-    "prfm pldl1keep, [%[inptr0], x9]\n"
-    "fmla v10.4s, v19.4s, v7.4s\n"
-    "prfm pldl1keep, [x23, x19]\n"
-    "fmla v12.4s, v19.4s, v5.4s\n"
-    "prfm pldl1keep, [x22, x14]\n"
-    "fmla v9.4s, v19.4s, v16.4s\n"
-    "ldr q27, [x22, x13]\n"
-    "fmla v17.4s, v30.4s, v3.4s\n"
-    "prfm pldl1keep, [x21, x16]\n"
-    "fmla v13.4s, v30.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, x9]\n"
-    "fmla v15.4s, v30.4s, v4.4s\n"
-    "prfm pldl1keep, [x23, x14]\n"
-    "fmla v12.4s, v30.4s, v7.4s\n"
-    "prfm pldl1keep, [x22, x16]\n"
-    "fmla v14.4s, v30.4s, v5.4s\n"
-    "prfm pldl1keep, [x21, x9]\n"
-    "fmla v11.4s, v30.4s, v16.4s\n"
-    "ldr q21, [x21, x15]\n"
-    "fmla v15.4s, v29.4s, v6.4s\n"
-    "prfm pldl1keep, [x23, x16]\n"
-    "fmla v14.4s, v29.4s, v7.4s\n"
-    "ldr q20, [x20, x17]\n"
-    "fmla v10.4s, v28.4s, v2.4s\n"
-    "ldr q19, [x23, x13]\n"
-    "fmla v13.4s, v24.4s, v1.4s\n"
-    "prfm pldl1keep, [x22, x9]\n"
-    "fmla v12.4s, v24.4s, v2.4s\n"
-    "prfm pldl1keep, [x23, x9]\n"
-    "fmla v10.4s, v24.4s, v4.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v9.4s, v24.4s, v5.4s\n"
-    "ldr q23, [x22, x15]\n"
-    "fmla v17.4s, v18.4s, v0.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v13.4s, v18.4s, v3.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v15.4s, v18.4s, v1.4s\n"
-    "add x20, x20, #16\n"
-    "str q17, [%[outptr0]]\n"
-    "fmla v10.4s, v18.4s, v6.4s\n"
-    "fmla v12.4s, v18.4s, v4.4s\n"
-    "ldr q17, [x21, x17]\n"
-    "fmla v14.4s, v18.4s, v2.4s\n"
-    "add x21, x21, #16\n"
-    "fmla v9.4s, v18.4s, v7.4s\n"
-    "fmla v11.4s, v18.4s, v5.4s\n"
-    "fmla v8.4s, v18.4s, v16.4s\n"
-    "ldr q24, [x23, x15]\n"
-    "fmla v15.4s, v22.4s, v3.4s\n"
-    "ldr q18, [x22, x17]\n"
-    "fmla v12.4s, v22.4s, v6.4s\n"
-    "add x22, x22, #16\n"
-    "fmla v14.4s, v22.4s, v4.4s\n"
-    "fmla v11.4s, v22.4s, v7.4s\n"
-    "fmla v10.4s, v26.4s, v1.4s\n"
-    "ldr q22, [x23, x17]\n"
-    "fmla v9.4s, v26.4s, v2.4s\n"
-    "add x23, x23, #16\n"
-    "fmla v14.4s, v25.4s, v6.4s\n"
-    "fmla v13.4s, v27.4s, v0.4s\n"
-    "fmla v10.4s, v27.4s, v3.4s\n"
-    "fmla v12.4s, v27.4s, v1.4s\n"
-    "fmla v9.4s, v27.4s, v4.4s\n"
-    "fmla v11.4s, v27.4s, v2.4s\n"
-    "str q13, [x24]\n"
-    "fmla v8.4s, v27.4s, v5.4s\n"
-    "fmla v15.4s, v21.4s, v0.4s\n"
-    "fmla v12.4s, v21.4s, v3.4s\n"
-    "fmla v14.4s, v21.4s, v1.4s\n"
-    "fmla v9.4s, v21.4s, v6.4s\n"
-    "fmla v11.4s, v21.4s, v4.4s\n"
-    "fmla v8.4s, v21.4s, v7.4s\n"
-    "str q15, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v10.4s, v19.4s, v0.4s\n"
-    "fmla v14.4s, v20.4s, v3.4s\n"
-    "fmla v9.4s, v19.4s, v1.4s\n"
-    "fmla v11.4s, v20.4s, v6.4s\n"
-    "fmla v8.4s, v19.4s, v2.4s\n"
-    "str q10, [x25]\n"
-    "fmla v12.4s, v23.4s, v0.4s\n"
-    "fmla v9.4s, v23.4s, v3.4s\n"
-    "fmla v14.4s, v17.4s, v0.4s\n"
-    "fmla v11.4s, v23.4s, v1.4s\n"
-    "fmla v8.4s, v23.4s, v4.4s\n"
-    "str q12, [x24, %[output_col_stride1]]\n"
-    "fmla v9.4s, v24.4s, v0.4s\n"
-    "str q14, [%[outptr0], x26]\n"
-    "fmla v11.4s, v17.4s, v3.4s\n"
-    "fmla v8.4s, v17.4s, v6.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "str q9, [x25, %[output_col_stride1]]\n"
-    "fmla v11.4s, v18.4s, v0.4s\n"
-    "fmla v8.4s, v24.4s, v1.4s\n"
-    "str q11, [x24, x26]\n"
-    "fmla v8.4s, v18.4s, v3.4s\n"
-    "add x24, x24, #16\n"
-    "fmla v8.4s, v22.4s, v0.4s\n"
-    "str q8, [x25, x26]\n"
-    "add x25, x25, #16\n"
-    "4:\n"
-    "cbz x27, 7f\n"
-    "ldr s25, [%[wbptr]]\n"
-    "mov v17.16b, v25.16b\n"
-    "ldr s16, [%[wbptr], #4]\n"
-    "mov v13.16b, v25.16b\n"
-    "ldr s7, [%[wbptr], #8]\n"
-    "mov v15.16b, v25.16b\n"
-    "ldr s6, [%[wbptr], #12]\n"
-    "mov v10.16b, v25.16b\n"
-    "ldr s5, [%[wbptr], #16]\n"
-    "mov v12.16b, v25.16b\n"
-    "ldr s4, [%[wbptr], #20]\n"
-    "mov v14.16b, v25.16b\n"
-    "ldr s3, [%[wbptr], #24]\n"
-    "mov v9.16b, v25.16b\n"
-    "ldr s2, [%[wbptr], #28]\n"
-    "mov v11.16b, v25.16b\n"
-    "ldr s1, [%[wbptr], #32]\n"
-    "mov v8.16b, v25.16b\n"
-    "ldr s0, [%[wbptr], #36]\n"
-    "ldr s26, [%[inptr0]]\n"
-    "subs x27, x27, #1\n"
-    "fmla v17.4s, v26.4s, v16.4s\n"
-    "ldr s28, [x20]\n"
-    "fmla v13.4s, v28.4s, v16.4s\n"
-    "ldr s29, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v15.4s, v29.4s, v16.4s\n"
-    "ldr s27, [x21]\n"
-    "fmla v17.4s, v28.4s, v5.4s\n"
-    "ldr s21, [x20, %[input_col_stride1]]\n"
-    "ldr s20, [%[inptr0], x13]\n"
-    "ldr s23, [x22]\n"
-    "ldr s19, [x21, %[input_col_stride1]]\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "fmla v17.4s, v29.4s, v7.4s\n"
-    "prfm pldl1keep, [x20, #64]\n"
-    "prfm pldl1keep, [%[inptr0], x19]\n"
-    "prfm pldl1keep, [x21, #64]\n"
-    "prfm pldl1keep, [x20, x19]\n"
-    "prfm pldl1keep, [%[inptr0], x14]\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "prfm pldl1keep, [x21, x19]\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v17.4s, v27.4s, v2.4s\n"
-    "ldr s30, [x20, x13]\n"
-    "fmla v13.4s, v27.4s, v5.4s\n"
-    "ldr s29, [%[inptr0], x15]\n"
-    "fmla v10.4s, v27.4s, v16.4s\n"
-    "ldr s28, [x23]\n"
-    "fmla v17.4s, v21.4s, v4.4s\n"
-    "ldr s24, [x22, %[input_col_stride1]]\n"
-    "fmla v13.4s, v21.4s, v7.4s\n"
-    "ldr s18, [x21, x13]\n"
-    "fmla v15.4s, v21.4s, v5.4s\n"
-    "prfm pldl1keep, [x20, x14]\n"
-    "fmla v12.4s, v21.4s, v16.4s\n"
-    "ldr s22, [x20, x15]\n"
-    "fmla v17.4s, v20.4s, v6.4s\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v15.4s, v20.4s, v7.4s\n"
-    "prfm pldl1keep, [x23, #64]\n"
-    "fmla v14.4s, v20.4s, v16.4s\n"
-    "ldr s25, [%[inptr0], x17]\n"
-    "fmla v13.4s, v23.4s, v2.4s\n"
-    "prfm pldl1keep, [x22, x19]\n"
-    "fmla v10.4s, v23.4s, v5.4s\n"
-    "ldr s26, [x23, %[input_col_stride1]]\n"
-    "fmla v17.4s, v19.4s, v1.4s\n"
-    "prfm pldl1keep, [x21, x14]\n"
-    "fmla v13.4s, v19.4s, v4.4s\n"
-    "prfm pldl1keep, [x20, x16]\n"
-    "fmla v15.4s, v19.4s, v2.4s\n"
-    "prfm pldl1keep, [%[inptr0], x9]\n"
-    "fmla v10.4s, v19.4s, v7.4s\n"
-    "prfm pldl1keep, [x23, x19]\n"
-    "fmla v12.4s, v19.4s, v5.4s\n"
-    "prfm pldl1keep, [x22, x14]\n"
-    "fmla v9.4s, v19.4s, v16.4s\n"
-    "ldr s27, [x22, x13]\n"
-    "fmla v17.4s, v30.4s, v3.4s\n"
-    "prfm pldl1keep, [x21, x16]\n"
-    "fmla v13.4s, v30.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, x9]\n"
-    "fmla v15.4s, v30.4s, v4.4s\n"
-    "prfm pldl1keep, [x23, x14]\n"
-    "fmla v12.4s, v30.4s, v7.4s\n"
-    "prfm pldl1keep, [x22, x16]\n"
-    "fmla v14.4s, v30.4s, v5.4s\n"
-    "prfm pldl1keep, [x21, x9]\n"
-    "fmla v11.4s, v30.4s, v16.4s\n"
-    "ldr s21, [x21, x15]\n"
-    "fmla v15.4s, v29.4s, v6.4s\n"
-    "prfm pldl1keep, [x23, x16]\n"
-    "fmla v14.4s, v29.4s, v7.4s\n"
-    "ldr s20, [x20, x17]\n"
-    "fmla v10.4s, v28.4s, v2.4s\n"
-    "ldr s19, [x23, x13]\n"
-    "fmla v13.4s, v24.4s, v1.4s\n"
-    "prfm pldl1keep, [x22, x9]\n"
-    "fmla v12.4s, v24.4s, v2.4s\n"
-    "prfm pldl1keep, [x23, x9]\n"
-    "fmla v10.4s, v24.4s, v4.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v9.4s, v24.4s, v5.4s\n"
-    "ldr s23, [x22, x15]\n"
-    "fmla v17.4s, v18.4s, v0.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v13.4s, v18.4s, v3.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v15.4s, v18.4s, v1.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "str s17, [%[outptr0]]\n"
-    "fmla v10.4s, v18.4s, v6.4s\n"
-    "fmla v12.4s, v18.4s, v4.4s\n"
-    "ldr s17, [x21, x17]\n"
-    "fmla v14.4s, v18.4s, v2.4s\n"
-    "prfm pldl1keep, [%[inptr0], x19]\n"
-    "fmla v9.4s, v18.4s, v7.4s\n"
-    "prfm pldl1keep, [%[inptr0], x14]\n"
-    "fmla v11.4s, v18.4s, v5.4s\n"
-    "add x20, x20, #4\n"
-    "fmla v8.4s, v18.4s, v16.4s\n"
-    "ldr s24, [x23, x15]\n"
-    "fmla v15.4s, v22.4s, v3.4s\n"
-    "ldr s18, [x22, x17]\n"
-    "fmla v12.4s, v22.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, #64]\n"
-    "fmla v14.4s, v22.4s, v4.4s\n"
-    "prfm pldl1keep, [x20, x19]\n"
-    "fmla v11.4s, v22.4s, v7.4s\n"
-    "ldr s22, [x23, x17]\n"
-    "fmla v10.4s, v26.4s, v1.4s\n"
-    "add x21, x21, #4\n"
-    "fmla v14.4s, v25.4s, v6.4s\n"
-    "ldr s25, [%[wbptr]]\n"
-    "fmla v9.4s, v26.4s, v2.4s\n"
-    "ldr s16, [%[wbptr], #4]\n"
-    "fmla v13.4s, v27.4s, v0.4s\n"
-    "prfm pldl1keep, [x21, #64]\n"
-    "fmla v10.4s, v27.4s, v3.4s\n"
-    "prfm pldl1keep, [x21, x19]\n"
-    "fmla v12.4s, v27.4s, v1.4s\n"
-    "add x22, x22, #4\n"
-    "str s13, [x24]\n"
-    "fmla v9.4s, v27.4s, v4.4s\n"
-    "fmla v11.4s, v27.4s, v2.4s\n"
-    "ldr s26, [%[inptr0]]\n"
-    "fmla v8.4s, v27.4s, v5.4s\n"
-    "ldr s28, [x20]\n"
-    "fmla v15.4s, v21.4s, v0.4s\n"
-    "ldr s29, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v12.4s, v21.4s, v3.4s\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "fmla v14.4s, v21.4s, v1.4s\n"
-    "add x23, x23, #4\n"
-    "str s15, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v9.4s, v21.4s, v6.4s\n"
-    "fmla v11.4s, v21.4s, v4.4s\n"
-    "ldr s5, [%[wbptr], #16]\n"
-    "fmla v8.4s, v21.4s, v7.4s\n"
-    "ldr s27, [x21]\n"
-    "fmla v14.4s, v20.4s, v3.4s\n"
-    "ldr s21, [x20, %[input_col_stride1]]\n"
-    "fmla v11.4s, v20.4s, v6.4s\n"
-    "ldr s20, [%[inptr0], x13]\n"
-    "fmla v10.4s, v19.4s, v0.4s\n"
-    "subs x27, x27, #1\n"
-    "fmla v9.4s, v19.4s, v1.4s\n"
-    "fmla v8.4s, v19.4s, v2.4s\n"
-    "fmla v12.4s, v23.4s, v0.4s\n"
-    "ldr s7, [%[wbptr], #8]\n"
-    "str s10, [x25]\n"
-    "fmla v11.4s, v23.4s, v1.4s\n"
-    "fmla v9.4s, v23.4s, v3.4s\n"
-    "ldr s2, [%[wbptr], #28]\n"
-    "str s12, [x24, %[output_col_stride1]]\n"
-    "fmla v8.4s, v23.4s, v4.4s\n"
-    "fmla v14.4s, v17.4s, v0.4s\n"
-    "ldr s23, [x22]\n"
-    "fmla v11.4s, v17.4s, v3.4s\n"
-    "ldr s19, [x21, %[input_col_stride1]]\n"
-    "fmla v8.4s, v17.4s, v6.4s\n"
-    "ldr s4, [%[wbptr], #20]\n"
-    "str s14, [%[outptr0], x26]\n"
-    "fmla v9.4s, v24.4s, v0.4s\n"
-    "fmla v11.4s, v18.4s, v0.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v8.4s, v24.4s, v1.4s\n"
-    "ldr s6, [%[wbptr], #12]\n"
-    "str s9, [x25, %[output_col_stride1]]\n"
-    "mov v17.16b, v25.16b\n"
-    "str s11, [x24, x26]\n"
-    "mov v13.16b, v25.16b\n"
-    "fmla v8.4s, v18.4s, v3.4s\n"
-    "ldr s1, [%[wbptr], #32]\n"
-    "mov v15.16b, v25.16b\n"
-    "add x24, x24, #4\n"
-    "mov v10.16b, v25.16b\n"
-    "mov v12.16b, v25.16b\n"
-    "fmla v8.4s, v22.4s, v0.4s\n"
-    "ldr s3, [%[wbptr], #24]\n"
-    "mov v14.16b, v25.16b\n"
-    "mov v9.16b, v25.16b\n"
-    "mov v11.16b, v25.16b\n"
-    "fmla v17.4s, v26.4s, v16.4s\n"
-    "str s8, [x25, x26]\n"
-    "fmla v13.4s, v28.4s, v16.4s\n"
-    "mov v8.16b, v25.16b\n"
-    "ldr s0, [%[wbptr], #36]\n"
-    "fmla v17.4s, v28.4s, v5.4s\n"
-    "fmla v15.4s, v29.4s, v16.4s\n"
-    "add x25, x25, #4\n"
-    "fmla v17.4s, v29.4s, v7.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v17.4s, v27.4s, v2.4s\n"
-    "ldr s30, [x20, x13]\n"
-    "fmla v13.4s, v27.4s, v5.4s\n"
-    "ldr s29, [%[inptr0], x15]\n"
-    "fmla v10.4s, v27.4s, v16.4s\n"
-    "ldr s28, [x23]\n"
-    "fmla v17.4s, v21.4s, v4.4s\n"
-    "ldr s24, [x22, %[input_col_stride1]]\n"
-    "fmla v13.4s, v21.4s, v7.4s\n"
-    "ldr s18, [x21, x13]\n"
-    "fmla v15.4s, v21.4s, v5.4s\n"
-    "prfm pldl1keep, [x20, x14]\n"
-    "fmla v12.4s, v21.4s, v16.4s\n"
-    "ldr s22, [x20, x15]\n"
-    "fmla v17.4s, v20.4s, v6.4s\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v15.4s, v20.4s, v7.4s\n"
-    "prfm pldl1keep, [x23, #64]\n"
-    "fmla v14.4s, v20.4s, v16.4s\n"
-    "ldr s25, [%[inptr0], x17]\n"
-    "fmla v13.4s, v23.4s, v2.4s\n"
-    "prfm pldl1keep, [x22, x19]\n"
-    "fmla v10.4s, v23.4s, v5.4s\n"
-    "ldr s26, [x23, %[input_col_stride1]]\n"
-    "fmla v17.4s, v19.4s, v1.4s\n"
-    "prfm pldl1keep, [x21, x14]\n"
-    "fmla v13.4s, v19.4s, v4.4s\n"
-    "prfm pldl1keep, [x20, x16]\n"
-    "fmla v15.4s, v19.4s, v2.4s\n"
-    "prfm pldl1keep, [%[inptr0], x9]\n"
-    "fmla v10.4s, v19.4s, v7.4s\n"
-    "prfm pldl1keep, [x23, x19]\n"
-    "fmla v12.4s, v19.4s, v5.4s\n"
-    "prfm pldl1keep, [x22, x14]\n"
-    "fmla v9.4s, v19.4s, v16.4s\n"
-    "ldr s27, [x22, x13]\n"
-    "fmla v17.4s, v30.4s, v3.4s\n"
-    "prfm pldl1keep, [x21, x16]\n"
-    "fmla v13.4s, v30.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, x9]\n"
-    "fmla v15.4s, v30.4s, v4.4s\n"
-    "prfm pldl1keep, [x23, x14]\n"
-    "fmla v12.4s, v30.4s, v7.4s\n"
-    "prfm pldl1keep, [x22, x16]\n"
-    "fmla v14.4s, v30.4s, v5.4s\n"
-    "prfm pldl1keep, [x21, x9]\n"
-    "fmla v11.4s, v30.4s, v16.4s\n"
-    "ldr s21, [x21, x15]\n"
-    "fmla v15.4s, v29.4s, v6.4s\n"
-    "prfm pldl1keep, [x23, x16]\n"
-    "fmla v14.4s, v29.4s, v7.4s\n"
-    "ldr s20, [x20, x17]\n"
-    "fmla v10.4s, v28.4s, v2.4s\n"
-    "ldr s19, [x23, x13]\n"
-    "fmla v13.4s, v24.4s, v1.4s\n"
-    "prfm pldl1keep, [x22, x9]\n"
-    "fmla v12.4s, v24.4s, v2.4s\n"
-    "prfm pldl1keep, [x23, x9]\n"
-    "fmla v10.4s, v24.4s, v4.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v9.4s, v24.4s, v5.4s\n"
-    "ldr s23, [x22, x15]\n"
-    "fmla v17.4s, v18.4s, v0.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v13.4s, v18.4s, v3.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v15.4s, v18.4s, v1.4s\n"
-    "add x20, x20, #4\n"
-    "str s17, [%[outptr0]]\n"
-    "fmla v10.4s, v18.4s, v6.4s\n"
-    "fmla v12.4s, v18.4s, v4.4s\n"
-    "ldr s17, [x21, x17]\n"
-    "fmla v14.4s, v18.4s, v2.4s\n"
-    "add x21, x21, #4\n"
-    "fmla v9.4s, v18.4s, v7.4s\n"
-    "fmla v11.4s, v18.4s, v5.4s\n"
-    "fmla v8.4s, v18.4s, v16.4s\n"
-    "ldr s24, [x23, x15]\n"
-    "fmla v15.4s, v22.4s, v3.4s\n"
-    "ldr s18, [x22, x17]\n"
-    "fmla v12.4s, v22.4s, v6.4s\n"
-    "add x22, x22, #4\n"
-    "fmla v14.4s, v22.4s, v4.4s\n"
-    "fmla v11.4s, v22.4s, v7.4s\n"
-    "fmla v10.4s, v26.4s, v1.4s\n"
-    "ldr s22, [x23, x17]\n"
-    "fmla v9.4s, v26.4s, v2.4s\n"
-    "add x23, x23, #4\n"
-    "fmla v14.4s, v25.4s, v6.4s\n"
-    "fmla v13.4s, v27.4s, v0.4s\n"
-    "fmla v10.4s, v27.4s, v3.4s\n"
-    "fmla v12.4s, v27.4s, v1.4s\n"
-    "fmla v9.4s, v27.4s, v4.4s\n"
-    "fmla v11.4s, v27.4s, v2.4s\n"
-    "str s13, [x24]\n"
-    "fmla v8.4s, v27.4s, v5.4s\n"
-    "fmla v15.4s, v21.4s, v0.4s\n"
-    "fmla v12.4s, v21.4s, v3.4s\n"
-    "fmla v14.4s, v21.4s, v1.4s\n"
-    "fmla v9.4s, v21.4s, v6.4s\n"
-    "fmla v11.4s, v21.4s, v4.4s\n"
-    "fmla v8.4s, v21.4s, v7.4s\n"
-    "str s15, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v10.4s, v19.4s, v0.4s\n"
-    "fmla v14.4s, v20.4s, v3.4s\n"
-    "fmla v9.4s, v19.4s, v1.4s\n"
-    "fmla v11.4s, v20.4s, v6.4s\n"
-    "fmla v8.4s, v19.4s, v2.4s\n"
-    "str s10, [x25]\n"
-    "fmla v12.4s, v23.4s, v0.4s\n"
-    "fmla v9.4s, v23.4s, v3.4s\n"
-    "fmla v14.4s, v17.4s, v0.4s\n"
-    "fmla v11.4s, v23.4s, v1.4s\n"
-    "fmla v8.4s, v23.4s, v4.4s\n"
-    "str s12, [x24, %[output_col_stride1]]\n"
-    "fmla v9.4s, v24.4s, v0.4s\n"
-    "str s14, [%[outptr0], x26]\n"
-    "fmla v11.4s, v17.4s, v3.4s\n"
-    "fmla v8.4s, v17.4s, v6.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "str s9, [x25, %[output_col_stride1]]\n"
-    "fmla v11.4s, v18.4s, v0.4s\n"
-    "fmla v8.4s, v24.4s, v1.4s\n"
-    "str s11, [x24, x26]\n"
-    "fmla v8.4s, v18.4s, v3.4s\n"
-    "add x24, x24, #4\n"
-    "fmla v8.4s, v22.4s, v0.4s\n"
-    "str s8, [x25, x26]\n"
-    "add x25, x25, #4\n"
-    "7:\n"
-    : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
-    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
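Like the kernels above, both execute_tile variants in this file share a channel-loop skeleton: n_channels is split with lsr/and into four-wide iterations that use 128-bit q-register loads and a single-channel s-register tail, each guarded by cbz and counted down with subs/bne. A minimal sketch of that split, assuming NEON intrinsics; add_bias is a hypothetical helper, not library code:

#include <arm_neon.h>

// Adds a per-channel bias to one pixel's channels, four at a time plus a tail.
static void add_bias(const float *in, const float *bias, float *out, int n_channels)
{
  int quads = n_channels >> 2;   // "lsr x28, %[n_channels], #2"
  int rest  = n_channels & 3;    // "and x27, %[n_channels], #3"

  for (; quads != 0; quads--)    // vector body: "subs x28, x28, #1; bne 1b"
  {
    float32x4_t v = vaddq_f32(vld1q_f32(in), vld1q_f32(bias));
    vst1q_f32(out, v);
    in += 4; bias += 4; out += 4;  // the "add xN, xN, #16" pointer bumps
  }
  for (; rest != 0; rest--)      // scalar remainder, one channel at a time
  {
    *out++ = *in++ + *bias++;      // the "add xN, xN, #4" bumps
  }
}

The #16 pointer increments in the vector blocks of the listings and the #4 increments in their remainder blocks correspond to the four-element and one-element advances here.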
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x25, %[inptr0], %[input_row_stride]\n"
-    "add x16, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x21, %[outptr0], %[output_row_stride]\n"
-    "add x22, x25, %[input_row_stride]\n"
-    "add x23, x16, #64\n"
-    "add x26, x16, %[input_col_stride1]\n"
-    "add x13, x22, %[input_row_stride]\n"
-    "add x20, x26, #64\n"
-    "add x9, x26, %[input_col_stride1]\n"
-    "add x24, x13, %[input_row_stride]\n"
-    "add x15, x9, #64\n"
-    "add x14, x21, %[output_row_stride]\n"
-    "add x19, %[output_col_stride1], %[output_col_stride1]\n"
-    "and x27, %[n_channels], #3\n"
-    "lsr x28, %[n_channels], #2\n"
-    "cbz x28, 4f\n"
-    "1:\n"
-    "ldr q20, [%[wbptr]]\n"
-    "subs x28, x28, #1\n"
-    "mov v4.16b, v20.16b\n"
-    "ldr q15, [%[wbptr], #16]\n"
-    "mov v1.16b, v20.16b\n"
-    "ldr q0, [%[wbptr], #32]\n"
-    "mov v3.16b, v20.16b\n"
-    "ldr q13, [%[wbptr], #48]\n"
-    "mov v7.16b, v20.16b\n"
-    "ldr q16, [%[wbptr], #64]\n"
-    "mov v9.16b, v20.16b\n"
-    "ldr q12, [%[wbptr], #80]\n"
-    "mov v2.16b, v20.16b\n"
-    "ldr q17, [%[wbptr], #96]\n"
-    "mov v6.16b, v20.16b\n"
-    "ldr q11, [%[wbptr], #112]\n"
-    "mov v8.16b, v20.16b\n"
-    "ldr q10, [%[wbptr], #128]\n"
-    "mov v5.16b, v20.16b\n"
-    "ldr q14, [%[wbptr], #144]\n"
-    "ldr q27, [%[inptr0]]\n"
-    "ldr q24, [x25]\n"
-    "fmla v4.4s, v27.4s, v15.4s\n"
-    "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr q21, [x22]\n"
-    "ldr q19, [x25, %[input_col_stride1]]\n"
-    "ldr q31, [%[inptr0], x16]\n"
-    "ldr q28, [x13]\n"
-    "fmla v4.4s, v24.4s, v16.4s\n"
-    "ldr q18, [x22, %[input_col_stride1]]\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "prfm pldl1keep, [x25, #64]\n"
-    "prfm pldl1keep, [%[inptr0], x17]\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "prfm pldl1keep, [x25, x17]\n"
-    "prfm pldl1keep, [%[inptr0], x23]\n"
-    "prfm pldl1keep, [x13, #64]\n"
-    "prfm pldl1keep, [x22, x17]\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v1.4s, v24.4s, v15.4s\n"
-    "ldr q24, [x25, x16]\n"
-    "fmla v4.4s, v22.4s, v0.4s\n"
-    "ldr q29, [%[inptr0], x26]\n"
-    "fmla v3.4s, v22.4s, v15.4s\n"
-    "ldr q30, [x24]\n"
-    "fmla v1.4s, v21.4s, v16.4s\n"
-    "ldr q25, [x13, %[input_col_stride1]]\n"
-    "fmla v4.4s, v21.4s, v11.4s\n"
-    "prfm pldl1keep, [x25, x23]\n"
-    "fmla v7.4s, v21.4s, v15.4s\n"
-    "ldr q26, [x22, x16]\n"
-    "fmla v1.4s, v19.4s, v0.4s\n"
-    "prfm pldl1keep, [%[inptr0], x20]\n"
-    "fmla v4.4s, v19.4s, v12.4s\n"
-    "prfm pldl1keep, [x24, #64]\n"
-    "fmla v3.4s, v19.4s, v16.4s\n"
-    "prfm pldl1keep, [x13, x17]\n"
-    "fmla v9.4s, v19.4s, v15.4s\n"
-    "ldr q23, [x25, x26]\n"
-    "fmla v4.4s, v31.4s, v13.4s\n"
-    "prfm pldl1keep, [x22, x23]\n"
-    "fmla v3.4s, v31.4s, v0.4s\n"
-    "prfm pldl1keep, [x25, x20]\n"
-    "fmla v2.4s, v31.4s, v15.4s\n"
-    "ldr q20, [%[inptr0], x9]\n"
-    "fmla v1.4s, v28.4s, v11.4s\n"
-    "prfm pldl1keep, [%[inptr0], x15]\n"
-    "fmla v7.4s, v28.4s, v16.4s\n"
-    "ldr q28, [x24, %[input_col_stride1]]\n"
-    "fmla v4.4s, v18.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, x17]\n"
-    "fmla v1.4s, v18.4s, v12.4s\n"
-    "prfm pldl1keep, [x13, x23]\n"
-    "fmla v3.4s, v18.4s, v11.4s\n"
-    "prfm pldl1keep, [x22, x20]\n"
-    "fmla v7.4s, v18.4s, v0.4s\n"
-    "prfm pldl1keep, [x25, x15]\n"
-    "fmla v9.4s, v18.4s, v16.4s\n"
-    "prfm pldl1keep, [x24, x23]\n"
-    "fmla v6.4s, v18.4s, v15.4s\n"
-    "ldr q27, [x13, x16]\n"
-    "fmla v4.4s, v24.4s, v17.4s\n"
-    "prfm pldl1keep, [x13, x20]\n"
-    "fmla v1.4s, v24.4s, v13.4s\n"
-    "prfm pldl1keep, [x22, x15]\n"
-    "fmla v3.4s, v24.4s, v12.4s\n"
-    "prfm pldl1keep, [x24, x20]\n"
-    "fmla v9.4s, v24.4s, v0.4s\n"
-    "prfm pldl1keep, [x13, x15]\n"
-    "fmla v2.4s, v24.4s, v16.4s\n"
-    "prfm pldl1keep, [x24, x15]\n"
-    "fmla v8.4s, v24.4s, v15.4s\n"
-    "ldr q24, [x22, x26]\n"
-    "fmla v3.4s, v29.4s, v13.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v2.4s, v29.4s, v0.4s\n"
-    "ldr q22, [x25, x9]\n"
-    "fmla v7.4s, v30.4s, v11.4s\n"
-    "ldr q21, [x24, x16]\n"
-    "fmla v1.4s, v25.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v9.4s, v25.4s, v11.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v7.4s, v25.4s, v12.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "fmla v6.4s, v25.4s, v16.4s\n"
-    "ldr q19, [x13, x26]\n"
-    "fmla v4.4s, v26.4s, v14.4s\n"
-    "prfm pldl1keep, [%[inptr0], x17]\n"
-    "fmla v1.4s, v26.4s, v17.4s\n"
-    "prfm pldl1keep, [%[inptr0], x23]\n"
-    "fmla v3.4s, v26.4s, v10.4s\n"
-    "add x25, x25, #16\n"
-    "fmla v7.4s, v26.4s, v13.4s\n"
-    "prfm pldl1keep, [x25, #64]\n"
-    "fmla v9.4s, v26.4s, v12.4s\n"
-    "prfm pldl1keep, [x25, x17]\n"
-    "fmla v2.4s, v26.4s, v11.4s\n"
-    "subs x28, x28, #1\n"
-    "fmla v6.4s, v26.4s, v0.4s\n"
-    "fmla v8.4s, v26.4s, v16.4s\n"
-    "fmla v5.4s, v26.4s, v15.4s\n"
-    "ldr q26, [x22, x9]\n"
-    "fmla v3.4s, v23.4s, v17.4s\n"
-    "ldr q18, [x24, x26]\n"
-    "fmla v9.4s, v23.4s, v13.4s\n"
-    "add x22, x22, #16\n"
-    "fmla v2.4s, v23.4s, v12.4s\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "fmla v8.4s, v23.4s, v0.4s\n"
-    "ldr q23, [x13, x9]\n"
-    "fmla v7.4s, v28.4s, v10.4s\n"
-    "prfm pldl1keep, [x22, x17]\n"
-    "fmla v2.4s, v20.4s, v13.4s\n"
-    "ldr q25, [x24, x9]\n"
-    "fmla v6.4s, v28.4s, v11.4s\n"
-    "ldr q20, [%[wbptr]]\n"
-    "fmla v1.4s, v27.4s, v14.4s\n"
-    "add x13, x13, #16\n"
-    "fmla v7.4s, v27.4s, v17.4s\n"
-    "prfm pldl1keep, [x13, #64]\n"
-    "fmla v9.4s, v27.4s, v10.4s\n"
-    "add x24, x24, #16\n"
-    "fmla v6.4s, v27.4s, v12.4s\n"
-    "fmla v8.4s, v27.4s, v11.4s\n"
-    "fmla v5.4s, v27.4s, v16.4s\n"
-    "ldr q15, [%[wbptr], #16]\n"
-    "fmla v3.4s, v24.4s, v14.4s\n"
-    "ldr q27, [%[inptr0]]\n"
-    "fmla v9.4s, v24.4s, v17.4s\n"
-    "fmla v2.4s, v24.4s, v10.4s\n"
-    "fmla v6.4s, v24.4s, v13.4s\n"
-    "fmla v8.4s, v24.4s, v12.4s\n"
-    "fmla v5.4s, v24.4s, v0.4s\n"
-    "ldr q16, [%[wbptr], #64]\n"
-    "fmla v2.4s, v22.4s, v17.4s\n"
-    "ldr q24, [x25]\n"
-    "fmla v8.4s, v22.4s, v13.4s\n"
-    "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v7.4s, v21.4s, v14.4s\n"
-    "fmla v6.4s, v21.4s, v10.4s\n"
-    "fmla v5.4s, v21.4s, v11.4s\n"
-    "ldr q0, [%[wbptr], #32]\n"
-    "fmla v9.4s, v19.4s, v14.4s\n"
-    "ldr q21, [x22]\n"
-    "fmla v6.4s, v19.4s, v17.4s\n"
-    "fmla v8.4s, v19.4s, v10.4s\n"
-    "fmla v5.4s, v19.4s, v12.4s\n"
-    "ldr q11, [%[wbptr], #112]\n"
-    "fmla v2.4s, v26.4s, v14.4s\n"
-    "movi v29.16b, #0\n"
-    "fmla v8.4s, v26.4s, v17.4s\n"
-    "fmla v6.4s, v18.4s, v14.4s\n"
-    "fmla v5.4s, v26.4s, v13.4s\n"
-    "ldr q12, [%[wbptr], #80]\n"
-    "fmax v4.4s, v4.4s, v29.4s\n"
-    "ldr q19, [x25, %[input_col_stride1]]\n"
-    "fmla v8.4s, v23.4s, v14.4s\n"
-    "fmax v3.4s, v3.4s, v29.4s\n"
-    "str q4, [%[outptr0]]\n"
-    "fmla v5.4s, v18.4s, v10.4s\n"
-    "str q3, [%[outptr0], %[output_col_stride1]]\n"
-    "fmax v2.4s, v2.4s, v29.4s\n"
-    "fmax v1.4s, v1.4s, v29.4s\n"
-    "ldr q13, [%[wbptr], #48]\n"
-    "str q2, [%[outptr0], x19]\n"
-    "fmla v5.4s, v23.4s, v17.4s\n"
-    "str q1, [x21]\n"
-    "fmax v9.4s, v9.4s, v29.4s\n"
-    "fmax v8.4s, v8.4s, v29.4s\n"
-    "ldr q10, [%[wbptr], #128]\n"
-    "str q9, [x21, %[output_col_stride1]]\n"
-    "fmla v5.4s, v25.4s, v14.4s\n"
-    "str q8, [x21, x19]\n"
-    "fmax v7.4s, v7.4s, v29.4s\n"
-    "fmax v6.4s, v6.4s, v29.4s\n"
-    "ldr q17, [%[wbptr], #96]\n"
-    "str q7, [x14]\n"
-    "fmax v5.4s, v5.4s, v29.4s\n"
-    "str q6, [x14, %[output_col_stride1]]\n"
-    "mov v4.16b, v20.16b\n"
-    "str q5, [x14, x19]\n"
-    "mov v1.16b, v20.16b\n"
-    "mov v3.16b, v20.16b\n"
-    "ldr q14, [%[wbptr], #144]\n"
-    "mov v7.16b, v20.16b\n"
-    "ldr q31, [%[inptr0], x16]\n"
-    "mov v9.16b, v20.16b\n"
-    "ldr q28, [x13]\n"
-    "mov v2.16b, v20.16b\n"
-    "ldr q18, [x22, %[input_col_stride1]]\n"
-    "mov v6.16b, v20.16b\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "mov v8.16b, v20.16b\n"
-    "add x21, x21, #16\n"
-    "mov v5.16b, v20.16b\n"
-    "add x14, x14, #16\n"
-    "fmla v4.4s, v27.4s, v15.4s\n"
-    "fmla v4.4s, v24.4s, v16.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v1.4s, v24.4s, v15.4s\n"
-    "ldr q24, [x25, x16]\n"
-    "fmla v4.4s, v22.4s, v0.4s\n"
-    "ldr q29, [%[inptr0], x26]\n"
-    "fmla v3.4s, v22.4s, v15.4s\n"
-    "ldr q30, [x24]\n"
-    "fmla v1.4s, v21.4s, v16.4s\n"
-    "ldr q25, [x13, %[input_col_stride1]]\n"
-    "fmla v4.4s, v21.4s, v11.4s\n"
-    "prfm pldl1keep, [x25, x23]\n"
-    "fmla v7.4s, v21.4s, v15.4s\n"
-    "ldr q26, [x22, x16]\n"
-    "fmla v1.4s, v19.4s, v0.4s\n"
-    "prfm pldl1keep, [%[inptr0], x20]\n"
-    "fmla v4.4s, v19.4s, v12.4s\n"
-    "prfm pldl1keep, [x24, #64]\n"
-    "fmla v3.4s, v19.4s, v16.4s\n"
-    "prfm pldl1keep, [x13, x17]\n"
-    "fmla v9.4s, v19.4s, v15.4s\n"
-    "ldr q23, [x25, x26]\n"
-    "fmla v4.4s, v31.4s, v13.4s\n"
-    "prfm pldl1keep, [x22, x23]\n"
-    "fmla v3.4s, v31.4s, v0.4s\n"
-    "prfm pldl1keep, [x25, x20]\n"
-    "fmla v2.4s, v31.4s, v15.4s\n"
-    "ldr q20, [%[inptr0], x9]\n"
-    "fmla v1.4s, v28.4s, v11.4s\n"
-    "prfm pldl1keep, [%[inptr0], x15]\n"
-    "fmla v7.4s, v28.4s, v16.4s\n"
-    "ldr q28, [x24, %[input_col_stride1]]\n"
-    "fmla v4.4s, v18.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, x17]\n"
-    "fmla v1.4s, v18.4s, v12.4s\n"
-    "prfm pldl1keep, [x13, x23]\n"
-    "fmla v3.4s, v18.4s, v11.4s\n"
-    "prfm pldl1keep, [x22, x20]\n"
-    "fmla v7.4s, v18.4s, v0.4s\n"
-    "prfm pldl1keep, [x25, x15]\n"
-    "fmla v9.4s, v18.4s, v16.4s\n"
-    "prfm pldl1keep, [x24, x23]\n"
-    "fmla v6.4s, v18.4s, v15.4s\n"
-    "ldr q27, [x13, x16]\n"
-    "fmla v4.4s, v24.4s, v17.4s\n"
-    "prfm pldl1keep, [x13, x20]\n"
-    "fmla v1.4s, v24.4s, v13.4s\n"
-    "prfm pldl1keep, [x22, x15]\n"
-    "fmla v3.4s, v24.4s, v12.4s\n"
-    "prfm pldl1keep, [x24, x20]\n"
-    "fmla v9.4s, v24.4s, v0.4s\n"
-    "prfm pldl1keep, [x13, x15]\n"
-    "fmla v2.4s, v24.4s, v16.4s\n"
-    "prfm pldl1keep, [x24, x15]\n"
-    "fmla v8.4s, v24.4s, v15.4s\n"
-    "ldr q24, [x22, x26]\n"
-    "fmla v3.4s, v29.4s, v13.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v2.4s, v29.4s, v0.4s\n"
-    "ldr q22, [x25, x9]\n"
-    "fmla v7.4s, v30.4s, v11.4s\n"
-    "ldr q21, [x24, x16]\n"
-    "fmla v1.4s, v25.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v9.4s, v25.4s, v11.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v7.4s, v25.4s, v12.4s\n"
-    "add x25, x25, #16\n"
-    "fmla v6.4s, v25.4s, v16.4s\n"
-    "ldr q19, [x13, x26]\n"
-    "fmla v4.4s, v26.4s, v14.4s\n"
-    "fmla v1.4s, v26.4s, v17.4s\n"
-    "fmla v3.4s, v26.4s, v10.4s\n"
-    "fmla v7.4s, v26.4s, v13.4s\n"
-    "fmla v9.4s, v26.4s, v12.4s\n"
-    "fmla v2.4s, v26.4s, v11.4s\n"
-    "fmla v6.4s, v26.4s, v0.4s\n"
-    "fmla v8.4s, v26.4s, v16.4s\n"
-    "fmla v5.4s, v26.4s, v15.4s\n"
-    "ldr q26, [x22, x9]\n"
-    "fmla v3.4s, v23.4s, v17.4s\n"
-    "ldr q18, [x24, x26]\n"
-    "fmla v9.4s, v23.4s, v13.4s\n"
-    "add x22, x22, #16\n"
-    "fmla v2.4s, v23.4s, v12.4s\n"
-    "fmla v8.4s, v23.4s, v0.4s\n"
-    "fmla v7.4s, v28.4s, v10.4s\n"
-    "ldr q23, [x13, x9]\n"
-    "fmla v6.4s, v28.4s, v11.4s\n"
-    "ldr q25, [x24, x9]\n"
-    "fmla v2.4s, v20.4s, v13.4s\n"
-    "add x13, x13, #16\n"
-    "fmla v1.4s, v27.4s, v14.4s\n"
-    "add x24, x24, #16\n"
-    "fmla v7.4s, v27.4s, v17.4s\n"
-    "fmla v9.4s, v27.4s, v10.4s\n"
-    "fmla v6.4s, v27.4s, v12.4s\n"
-    "fmla v8.4s, v27.4s, v11.4s\n"
-    "fmla v5.4s, v27.4s, v16.4s\n"
-    "fmla v3.4s, v24.4s, v14.4s\n"
-    "fmla v9.4s, v24.4s, v17.4s\n"
-    "fmla v2.4s, v24.4s, v10.4s\n"
-    "fmla v6.4s, v24.4s, v13.4s\n"
-    "fmla v8.4s, v24.4s, v12.4s\n"
-    "fmla v5.4s, v24.4s, v0.4s\n"
-    "fmla v7.4s, v21.4s, v14.4s\n"
-    "fmla v2.4s, v22.4s, v17.4s\n"
-    "fmla v9.4s, v19.4s, v14.4s\n"
-    "fmla v8.4s, v22.4s, v13.4s\n"
-    "fmla v6.4s, v21.4s, v10.4s\n"
-    "fmla v5.4s, v21.4s, v11.4s\n"
-    "movi v29.16b, #0\n"
-    "fmla v2.4s, v26.4s, v14.4s\n"
-    "fmla v6.4s, v19.4s, v17.4s\n"
-    "fmla v8.4s, v19.4s, v10.4s\n"
-    "fmla v5.4s, v19.4s, v12.4s\n"
-    "fmax v4.4s, v4.4s, v29.4s\n"
-    "fmax v3.4s, v3.4s, v29.4s\n"
-    "fmla v6.4s, v18.4s, v14.4s\n"
-    "fmax v2.4s, v2.4s, v29.4s\n"
-    "str q4, [%[outptr0]]\n"
-    "fmla v8.4s, v26.4s, v17.4s\n"
-    "str q3, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v5.4s, v26.4s, v13.4s\n"
-    "str q2, [%[outptr0], x19]\n"
-    "fmax v1.4s, v1.4s, v29.4s\n"
-    "fmla v8.4s, v23.4s, v14.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "str q1, [x21]\n"
-    "fmla v5.4s, v18.4s, v10.4s\n"
-    "fmax v9.4s, v9.4s, v29.4s\n"
-    "fmax v7.4s, v7.4s, v29.4s\n"
-    "fmax v8.4s, v8.4s, v29.4s\n"
-    "fmax v6.4s, v6.4s, v29.4s\n"
-    "str q9, [x21, %[output_col_stride1]]\n"
-    "fmla v5.4s, v23.4s, v17.4s\n"
-    "str q8, [x21, x19]\n"
-    "str q7, [x14]\n"
-    "str q6, [x14, %[output_col_stride1]]\n"
-    "add x21, x21, #16\n"
-    "fmla v5.4s, v25.4s, v14.4s\n"
-    "fmax v5.4s, v5.4s, v29.4s\n"
-    "str q5, [x14, x19]\n"
-    "add x14, x14, #16\n"
-    "4:\n"
-    "cbz x27, 7f\n"
-    "ldr s20, [%[wbptr]]\n"
-    "mov v4.16b, v20.16b\n"
-    "ldr s15, [%[wbptr], #4]\n"
-    "mov v1.16b, v20.16b\n"
-    "ldr s0, [%[wbptr], #8]\n"
-    "mov v3.16b, v20.16b\n"
-    "ldr s13, [%[wbptr], #12]\n"
-    "mov v7.16b, v20.16b\n"
-    "ldr s16, [%[wbptr], #16]\n"
-    "mov v9.16b, v20.16b\n"
-    "ldr s12, [%[wbptr], #20]\n"
-    "mov v2.16b, v20.16b\n"
-    "ldr s17, [%[wbptr], #24]\n"
-    "mov v6.16b, v20.16b\n"
-    "ldr s11, [%[wbptr], #28]\n"
-    "mov v8.16b, v20.16b\n"
-    "ldr s10, [%[wbptr], #32]\n"
-    "mov v5.16b, v20.16b\n"
-    "ldr s14, [%[wbptr], #36]\n"
-    "ldr s27, [%[inptr0]]\n"
-    "subs x27, x27, #1\n"
-    "fmla v4.4s, v27.4s, v15.4s\n"
-    "ldr s24, [x25]\n"
-    "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr s21, [x22]\n"
-    "ldr s19, [x25, %[input_col_stride1]]\n"
-    "ldr s31, [%[inptr0], x16]\n"
-    "fmla v4.4s, v24.4s, v16.4s\n"
-    "ldr s28, [x13]\n"
-    "ldr s18, [x22, %[input_col_stride1]]\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "prfm pldl1keep, [x25, #64]\n"
-    "prfm pldl1keep, [%[inptr0], x17]\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "prfm pldl1keep, [x25, x17]\n"
-    "prfm pldl1keep, [%[inptr0], x23]\n"
-    "prfm pldl1keep, [x13, #64]\n"
-    "prfm pldl1keep, [x22, x17]\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v1.4s, v24.4s, v15.4s\n"
-    "ldr s24, [x25, x16]\n"
-    "fmla v4.4s, v22.4s, v0.4s\n"
-    "ldr s29, [%[inptr0], x26]\n"
-    "fmla v3.4s, v22.4s, v15.4s\n"
-    "ldr s30, [x24]\n"
-    "fmla v1.4s, v21.4s, v16.4s\n"
-    "ldr s25, [x13, %[input_col_stride1]]\n"
-    "fmla v4.4s, v21.4s, v11.4s\n"
-    "prfm pldl1keep, [x25, x23]\n"
-    "fmla v7.4s, v21.4s, v15.4s\n"
-    "ldr s26, [x22, x16]\n"
-    "fmla v1.4s, v19.4s, v0.4s\n"
-    "prfm pldl1keep, [%[inptr0], x20]\n"
-    "fmla v4.4s, v19.4s, v12.4s\n"
-    "prfm pldl1keep, [x24, #64]\n"
-    "fmla v3.4s, v19.4s, v16.4s\n"
-    "prfm pldl1keep, [x13, x17]\n"
-    "fmla v9.4s, v19.4s, v15.4s\n"
-    "ldr s23, [x25, x26]\n"
-    "fmla v4.4s, v31.4s, v13.4s\n"
-    "prfm pldl1keep, [x22, x23]\n"
-    "fmla v3.4s, v31.4s, v0.4s\n"
-    "prfm pldl1keep, [x25, x20]\n"
-    "fmla v2.4s, v31.4s, v15.4s\n"
-    "ldr s20, [%[inptr0], x9]\n"
-    "fmla v1.4s, v28.4s, v11.4s\n"
-    "prfm pldl1keep, [%[inptr0], x15]\n"
-    "fmla v7.4s, v28.4s, v16.4s\n"
-    "ldr s28, [x24, %[input_col_stride1]]\n"
-    "fmla v4.4s, v18.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, x17]\n"
-    "fmla v1.4s, v18.4s, v12.4s\n"
-    "prfm pldl1keep, [x13, x23]\n"
-    "fmla v3.4s, v18.4s, v11.4s\n"
-    "prfm pldl1keep, [x22, x20]\n"
-    "fmla v7.4s, v18.4s, v0.4s\n"
-    "prfm pldl1keep, [x25, x15]\n"
-    "fmla v9.4s, v18.4s, v16.4s\n"
-    "prfm pldl1keep, [x24, x23]\n"
-    "fmla v6.4s, v18.4s, v15.4s\n"
-    "ldr s27, [x13, x16]\n"
-    "fmla v4.4s, v24.4s, v17.4s\n"
-    "prfm pldl1keep, [x13, x20]\n"
-    "fmla v1.4s, v24.4s, v13.4s\n"
-    "prfm pldl1keep, [x22, x15]\n"
-    "fmla v3.4s, v24.4s, v12.4s\n"
-    "prfm pldl1keep, [x24, x20]\n"
-    "fmla v9.4s, v24.4s, v0.4s\n"
-    "prfm pldl1keep, [x13, x15]\n"
-    "fmla v2.4s, v24.4s, v16.4s\n"
-    "prfm pldl1keep, [x24, x15]\n"
-    "fmla v8.4s, v24.4s, v15.4s\n"
-    "ldr s24, [x22, x26]\n"
-    "fmla v3.4s, v29.4s, v13.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v2.4s, v29.4s, v0.4s\n"
-    "ldr s22, [x25, x9]\n"
-    "fmla v7.4s, v30.4s, v11.4s\n"
-    "ldr s21, [x24, x16]\n"
-    "fmla v1.4s, v25.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v9.4s, v25.4s, v11.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v7.4s, v25.4s, v12.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "fmla v6.4s, v25.4s, v16.4s\n"
-    "ldr s19, [x13, x26]\n"
-    "fmla v4.4s, v26.4s, v14.4s\n"
-    "prfm pldl1keep, [%[inptr0], x17]\n"
-    "fmla v1.4s, v26.4s, v17.4s\n"
-    "prfm pldl1keep, [%[inptr0], x23]\n"
-    "fmla v3.4s, v26.4s, v10.4s\n"
-    "add x25, x25, #4\n"
-    "fmla v7.4s, v26.4s, v13.4s\n"
-    "prfm pldl1keep, [x25, #64]\n"
-    "fmla v9.4s, v26.4s, v12.4s\n"
-    "prfm pldl1keep, [x25, x17]\n"
-    "fmla v2.4s, v26.4s, v11.4s\n"
-    "subs x27, x27, #1\n"
-    "fmla v6.4s, v26.4s, v0.4s\n"
-    "fmla v8.4s, v26.4s, v16.4s\n"
-    "fmla v5.4s, v26.4s, v15.4s\n"
-    "ldr s26, [x22, x9]\n"
-    "fmla v3.4s, v23.4s, v17.4s\n"
-    "ldr s18, [x24, x26]\n"
-    "fmla v9.4s, v23.4s, v13.4s\n"
-    "add x22, x22, #4\n"
-    "fmla v2.4s, v23.4s, v12.4s\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "fmla v8.4s, v23.4s, v0.4s\n"
-    "ldr s23, [x13, x9]\n"
-    "fmla v7.4s, v28.4s, v10.4s\n"
-    "prfm pldl1keep, [x22, x17]\n"
-    "fmla v2.4s, v20.4s, v13.4s\n"
-    "ldr s25, [x24, x9]\n"
-    "fmla v6.4s, v28.4s, v11.4s\n"
-    "ldr s20, [%[wbptr]]\n"
-    "fmla v1.4s, v27.4s, v14.4s\n"
-    "add x13, x13, #4\n"
-    "fmla v7.4s, v27.4s, v17.4s\n"
-    "prfm pldl1keep, [x13, #64]\n"
-    "fmla v9.4s, v27.4s, v10.4s\n"
-    "add x24, x24, #4\n"
-    "fmla v6.4s, v27.4s, v12.4s\n"
-    "fmla v8.4s, v27.4s, v11.4s\n"
-    "fmla v5.4s, v27.4s, v16.4s\n"
-    "ldr s15, [%[wbptr], #4]\n"
-    "fmla v3.4s, v24.4s, v14.4s\n"
-    "ldr s27, [%[inptr0]]\n"
-    "fmla v9.4s, v24.4s, v17.4s\n"
-    "fmla v2.4s, v24.4s, v10.4s\n"
-    "fmla v6.4s, v24.4s, v13.4s\n"
-    "fmla v8.4s, v24.4s, v12.4s\n"
-    "fmla v5.4s, v24.4s, v0.4s\n"
-    "ldr s16, [%[wbptr], #16]\n"
-    "fmla v2.4s, v22.4s, v17.4s\n"
-    "ldr s24, [x25]\n"
-    "fmla v8.4s, v22.4s, v13.4s\n"
-    "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v7.4s, v21.4s, v14.4s\n"
-    "fmla v6.4s, v21.4s, v10.4s\n"
-    "fmla v5.4s, v21.4s, v11.4s\n"
-    "ldr s0, [%[wbptr], #8]\n"
-    "fmla v9.4s, v19.4s, v14.4s\n"
-    "ldr s21, [x22]\n"
-    "fmla v6.4s, v19.4s, v17.4s\n"
-    "fmla v8.4s, v19.4s, v10.4s\n"
-    "fmla v5.4s, v19.4s, v12.4s\n"
-    "ldr s11, [%[wbptr], #28]\n"
-    "fmla v2.4s, v26.4s, v14.4s\n"
-    "movi v29.16b, #0\n"
-    "fmla v8.4s, v26.4s, v17.4s\n"
-    "fmla v6.4s, v18.4s, v14.4s\n"
-    "fmla v5.4s, v26.4s, v13.4s\n"
-    "ldr s12, [%[wbptr], #20]\n"
-    "fmax v4.4s, v4.4s, v29.4s\n"
-    "ldr s19, [x25, %[input_col_stride1]]\n"
-    "fmla v8.4s, v23.4s, v14.4s\n"
-    "fmax v3.4s, v3.4s, v29.4s\n"
-    "str s4, [%[outptr0]]\n"
-    "fmla v5.4s, v18.4s, v10.4s\n"
-    "str s3, [%[outptr0], %[output_col_stride1]]\n"
-    "fmax v2.4s, v2.4s, v29.4s\n"
-    "fmax v1.4s, v1.4s, v29.4s\n"
-    "ldr s13, [%[wbptr], #12]\n"
-    "str s2, [%[outptr0], x19]\n"
-    "fmla v5.4s, v23.4s, v17.4s\n"
-    "str s1, [x21]\n"
-    "fmax v9.4s, v9.4s, v29.4s\n"
-    "fmax v8.4s, v8.4s, v29.4s\n"
-    "ldr s10, [%[wbptr], #32]\n"
-    "str s9, [x21, %[output_col_stride1]]\n"
-    "fmla v5.4s, v25.4s, v14.4s\n"
-    "str s8, [x21, x19]\n"
-    "fmax v7.4s, v7.4s, v29.4s\n"
-    "fmax v6.4s, v6.4s, v29.4s\n"
-    "ldr s17, [%[wbptr], #24]\n"
-    "str s7, [x14]\n"
-    "fmax v5.4s, v5.4s, v29.4s\n"
-    "str s6, [x14, %[output_col_stride1]]\n"
-    "mov v4.16b, v20.16b\n"
-    "str s5, [x14, x19]\n"
-    "mov v1.16b, v20.16b\n"
-    "mov v3.16b, v20.16b\n"
-    "ldr s14, [%[wbptr], #36]\n"
-    "mov v7.16b, v20.16b\n"
-    "ldr s31, [%[inptr0], x16]\n"
-    "mov v9.16b, v20.16b\n"
-    "ldr s28, [x13]\n"
-    "mov v2.16b, v20.16b\n"
-    "ldr s18, [x22, %[input_col_stride1]]\n"
-    "mov v6.16b, v20.16b\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "mov v8.16b, v20.16b\n"
-    "add x21, x21, #4\n"
-    "mov v5.16b, v20.16b\n"
-    "add x14, x14, #4\n"
-    "fmla v4.4s, v27.4s, v15.4s\n"
-    "fmla v4.4s, v24.4s, v16.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v1.4s, v24.4s, v15.4s\n"
-    "ldr s24, [x25, x16]\n"
-    "fmla v4.4s, v22.4s, v0.4s\n"
-    "ldr s29, [%[inptr0], x26]\n"
-    "fmla v3.4s, v22.4s, v15.4s\n"
-    "ldr s30, [x24]\n"
-    "fmla v1.4s, v21.4s, v16.4s\n"
-    "ldr s25, [x13, %[input_col_stride1]]\n"
-    "fmla v4.4s, v21.4s, v11.4s\n"
-    "prfm pldl1keep, [x25, x23]\n"
-    "fmla v7.4s, v21.4s, v15.4s\n"
-    "ldr s26, [x22, x16]\n"
-    "fmla v1.4s, v19.4s, v0.4s\n"
-    "prfm pldl1keep, [%[inptr0], x20]\n"
-    "fmla v4.4s, v19.4s, v12.4s\n"
-    "prfm pldl1keep, [x24, #64]\n"
-    "fmla v3.4s, v19.4s, v16.4s\n"
-    "prfm pldl1keep, [x13, x17]\n"
-    "fmla v9.4s, v19.4s, v15.4s\n"
-    "ldr s23, [x25, x26]\n"
-    "fmla v4.4s, v31.4s, v13.4s\n"
-    "prfm pldl1keep, [x22, x23]\n"
-    "fmla v3.4s, v31.4s, v0.4s\n"
-    "prfm pldl1keep, [x25, x20]\n"
-    "fmla v2.4s, v31.4s, v15.4s\n"
-    "ldr s20, [%[inptr0], x9]\n"
-    "fmla v1.4s, v28.4s, v11.4s\n"
-    "prfm pldl1keep, [%[inptr0], x15]\n"
-    "fmla v7.4s, v28.4s, v16.4s\n"
-    "ldr s28, [x24, %[input_col_stride1]]\n"
-    "fmla v4.4s, v18.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, x17]\n"
-    "fmla v1.4s, v18.4s, v12.4s\n"
-    "prfm pldl1keep, [x13, x23]\n"
-    "fmla v3.4s, v18.4s, v11.4s\n"
-    "prfm pldl1keep, [x22, x20]\n"
-    "fmla v7.4s, v18.4s, v0.4s\n"
-    "prfm pldl1keep, [x25, x15]\n"
-    "fmla v9.4s, v18.4s, v16.4s\n"
-    "prfm pldl1keep, [x24, x23]\n"
-    "fmla v6.4s, v18.4s, v15.4s\n"
-    "ldr s27, [x13, x16]\n"
-    "fmla v4.4s, v24.4s, v17.4s\n"
-    "prfm pldl1keep, [x13, x20]\n"
-    "fmla v1.4s, v24.4s, v13.4s\n"
-    "prfm pldl1keep, [x22, x15]\n"
-    "fmla v3.4s, v24.4s, v12.4s\n"
-    "prfm pldl1keep, [x24, x20]\n"
-    "fmla v9.4s, v24.4s, v0.4s\n"
-    "prfm pldl1keep, [x13, x15]\n"
-    "fmla v2.4s, v24.4s, v16.4s\n"
-    "prfm pldl1keep, [x24, x15]\n"
-    "fmla v8.4s, v24.4s, v15.4s\n"
-    "ldr s24, [x22, x26]\n"
-    "fmla v3.4s, v29.4s, v13.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v2.4s, v29.4s, v0.4s\n"
-    "ldr s22, [x25, x9]\n"
-    "fmla v7.4s, v30.4s, v11.4s\n"
-    "ldr s21, [x24, x16]\n"
-    "fmla v1.4s, v25.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v9.4s, v25.4s, v11.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v7.4s, v25.4s, v12.4s\n"
-    "add x25, x25, #4\n"
-    "fmla v6.4s, v25.4s, v16.4s\n"
-    "ldr s19, [x13, x26]\n"
-    "fmla v4.4s, v26.4s, v14.4s\n"
-    "fmla v1.4s, v26.4s, v17.4s\n"
-    "fmla v3.4s, v26.4s, v10.4s\n"
-    "fmla v7.4s, v26.4s, v13.4s\n"
-    "fmla v9.4s, v26.4s, v12.4s\n"
-    "fmla v2.4s, v26.4s, v11.4s\n"
-    "fmla v6.4s, v26.4s, v0.4s\n"
-    "fmla v8.4s, v26.4s, v16.4s\n"
-    "fmla v5.4s, v26.4s, v15.4s\n"
-    "ldr s26, [x22, x9]\n"
-    "fmla v3.4s, v23.4s, v17.4s\n"
-    "ldr s18, [x24, x26]\n"
-    "fmla v9.4s, v23.4s, v13.4s\n"
-    "add x22, x22, #4\n"
-    "fmla v2.4s, v23.4s, v12.4s\n"
-    "fmla v8.4s, v23.4s, v0.4s\n"
-    "fmla v7.4s, v28.4s, v10.4s\n"
-    "ldr s23, [x13, x9]\n"
-    "fmla v6.4s, v28.4s, v11.4s\n"
-    "ldr s25, [x24, x9]\n"
-    "fmla v2.4s, v20.4s, v13.4s\n"
-    "add x13, x13, #4\n"
-    "fmla v1.4s, v27.4s, v14.4s\n"
-    "add x24, x24, #4\n"
-    "fmla v7.4s, v27.4s, v17.4s\n"
-    "fmla v9.4s, v27.4s, v10.4s\n"
-    "fmla v6.4s, v27.4s, v12.4s\n"
-    "fmla v8.4s, v27.4s, v11.4s\n"
-    "fmla v5.4s, v27.4s, v16.4s\n"
-    "fmla v3.4s, v24.4s, v14.4s\n"
-    "fmla v9.4s, v24.4s, v17.4s\n"
-    "fmla v2.4s, v24.4s, v10.4s\n"
-    "fmla v6.4s, v24.4s, v13.4s\n"
-    "fmla v8.4s, v24.4s, v12.4s\n"
-    "fmla v5.4s, v24.4s, v0.4s\n"
-    "fmla v7.4s, v21.4s, v14.4s\n"
-    "fmla v2.4s, v22.4s, v17.4s\n"
-    "fmla v9.4s, v19.4s, v14.4s\n"
-    "fmla v8.4s, v22.4s, v13.4s\n"
-    "fmla v6.4s, v21.4s, v10.4s\n"
-    "fmla v5.4s, v21.4s, v11.4s\n"
-    "movi v29.16b, #0\n"
-    "fmla v2.4s, v26.4s, v14.4s\n"
-    "fmla v6.4s, v19.4s, v17.4s\n"
-    "fmla v8.4s, v19.4s, v10.4s\n"
-    "fmla v5.4s, v19.4s, v12.4s\n"
-    "fmax v4.4s, v4.4s, v29.4s\n"
-    "fmax v3.4s, v3.4s, v29.4s\n"
-    "fmla v6.4s, v18.4s, v14.4s\n"
-    "fmax v2.4s, v2.4s, v29.4s\n"
-    "str s4, [%[outptr0]]\n"
-    "fmla v8.4s, v26.4s, v17.4s\n"
-    "str s3, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v5.4s, v26.4s, v13.4s\n"
-    "str s2, [%[outptr0], x19]\n"
-    "fmax v1.4s, v1.4s, v29.4s\n"
-    "fmla v8.4s, v23.4s, v14.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "str s1, [x21]\n"
-    "fmla v5.4s, v18.4s, v10.4s\n"
-    "fmax v9.4s, v9.4s, v29.4s\n"
-    "fmax v7.4s, v7.4s, v29.4s\n"
-    "fmax v8.4s, v8.4s, v29.4s\n"
-    "fmax v6.4s, v6.4s, v29.4s\n"
-    "str s9, [x21, %[output_col_stride1]]\n"
-    "fmla v5.4s, v23.4s, v17.4s\n"
-    "str s8, [x21, x19]\n"
-    "str s7, [x14]\n"
-    "str s6, [x14, %[output_col_stride1]]\n"
-    "add x21, x21, #4\n"
-    "fmla v5.4s, v25.4s, v14.4s\n"
-    "fmax v5.4s, v5.4s, v29.4s\n"
-    "str s5, [x14, x19]\n"
-    "add x14, x14, #4\n"
-    "7:\n"
-    : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
-    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x17, %[inptr0], %[input_row_stride]\n"
-    "add x9, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x25, %[outptr0], %[output_row_stride]\n"
-    "add x14, x17, %[input_row_stride]\n"
-    "add x22, x9, #64\n"
-    "add x15, x9, %[input_col_stride1]\n"
-    "add x21, x14, %[input_row_stride]\n"
-    "add x16, x15, #64\n"
-    "add x24, x15, %[input_col_stride1]\n"
-    "add x26, x21, %[input_row_stride]\n"
-    "add x23, x24, #64\n"
-    "add x13, x25, %[output_row_stride]\n"
-    "add x27, %[output_col_stride1], %[output_col_stride1]\n"
-    "and x19, %[n_channels], #3\n"
-    "lsr x20, %[n_channels], #2\n"
-    "cbz x20, 4f\n"
-    "1:\n"
-    "ldr q19, [%[wbptr]]\n"
-    "subs x20, x20, #1\n"
-    "mov v8.16b, v19.16b\n"
-    "ldr q17, [%[wbptr], #16]\n"
-    "mov v5.16b, v19.16b\n"
-    "ldr q16, [%[wbptr], #32]\n"
-    "mov v7.16b, v19.16b\n"
-    "ldr q15, [%[wbptr], #48]\n"
-    "mov v2.16b, v19.16b\n"
-    "ldr q14, [%[wbptr], #64]\n"
-    "mov v4.16b, v19.16b\n"
-    "ldr q13, [%[wbptr], #80]\n"
-    "mov v6.16b, v19.16b\n"
-    "ldr q12, [%[wbptr], #96]\n"
-    "mov v1.16b, v19.16b\n"
-    "ldr q11, [%[wbptr], #112]\n"
-    "mov v3.16b, v19.16b\n"
-    "ldr q10, [%[wbptr], #128]\n"
-    "mov v0.16b, v19.16b\n"
-    "ldr q9, [%[wbptr], #144]\n"
-    "ldr q25, [%[inptr0]]\n"
-    "ldr q27, [x17]\n"
-    "fmla v8.4s, v25.4s, v17.4s\n"
-    "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr q20, [x14]\n"
-    "ldr q22, [x17, %[input_col_stride1]]\n"
-    "ldr q28, [%[inptr0], x9]\n"
-    "ldr q23, [x21]\n"
-    "fmla v8.4s, v27.4s, v14.4s\n"
-    "ldr q18, [x14, %[input_col_stride1]]\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "prfm pldl1keep, [x17, #64]\n"
-    "prfm pldl1keep, [%[inptr0], x28]\n"
-    "prfm pldl1keep, [x14, #64]\n"
-    "prfm pldl1keep, [x17, x28]\n"
-    "prfm pldl1keep, [%[inptr0], x22]\n"
-    "prfm pldl1keep, [x21, #64]\n"
-    "prfm pldl1keep, [x14, x28]\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v5.4s, v27.4s, v17.4s\n"
-    "ldr q27, [x17, x9]\n"
-    "fmla v8.4s, v26.4s, v16.4s\n"
-    "ldr q30, [%[inptr0], x15]\n"
-    "fmla v7.4s, v26.4s, v17.4s\n"
-    "ldr q31, [x26]\n"
-    "fmla v5.4s, v20.4s, v14.4s\n"
-    "ldr q24, [x21, %[input_col_stride1]]\n"
-    "fmla v8.4s, v20.4s, v11.4s\n"
-    "prfm pldl1keep, [x17, x22]\n"
-    "fmla v2.4s, v20.4s, v17.4s\n"
-    "ldr q29, [x14, x9]\n"
-    "fmla v5.4s, v22.4s, v16.4s\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v8.4s, v22.4s, v13.4s\n"
-    "prfm pldl1keep, [x26, #64]\n"
-    "fmla v7.4s, v22.4s, v14.4s\n"
-    "prfm pldl1keep, [x21, x28]\n"
-    "fmla v4.4s, v22.4s, v17.4s\n"
-    "ldr q21, [x17, x15]\n"
-    "fmla v8.4s, v28.4s, v15.4s\n"
-    "prfm pldl1keep, [x14, x22]\n"
-    "fmla v7.4s, v28.4s, v16.4s\n"
-    "prfm pldl1keep, [x17, x16]\n"
-    "fmla v6.4s, v28.4s, v17.4s\n"
-    "ldr q19, [%[inptr0], x24]\n"
-    "fmla v5.4s, v23.4s, v11.4s\n"
-    "prfm pldl1keep, [%[inptr0], x23]\n"
-    "fmla v2.4s, v23.4s, v14.4s\n"
-    "ldr q28, [x26, %[input_col_stride1]]\n"
-    "fmla v8.4s, v18.4s, v10.4s\n"
-    "prfm pldl1keep, [x26, x28]\n"
-    "fmla v5.4s, v18.4s, v13.4s\n"
-    "prfm pldl1keep, [x21, x22]\n"
-    "fmla v7.4s, v18.4s, v11.4s\n"
-    "prfm pldl1keep, [x14, x16]\n"
-    "fmla v2.4s, v18.4s, v16.4s\n"
-    "prfm pldl1keep, [x17, x23]\n"
-    "fmla v4.4s, v18.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x22]\n"
-    "fmla v1.4s, v18.4s, v17.4s\n"
-    "ldr q25, [x21, x9]\n"
-    "fmla v8.4s, v27.4s, v12.4s\n"
-    "prfm pldl1keep, [x21, x16]\n"
-    "fmla v5.4s, v27.4s, v15.4s\n"
-    "prfm pldl1keep, [x14, x23]\n"
-    "fmla v7.4s, v27.4s, v13.4s\n"
-    "prfm pldl1keep, [x26, x16]\n"
-    "fmla v4.4s, v27.4s, v16.4s\n"
-    "prfm pldl1keep, [x21, x23]\n"
-    "fmla v6.4s, v27.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x23]\n"
-    "fmla v3.4s, v27.4s, v17.4s\n"
-    "ldr q27, [x14, x15]\n"
-    "fmla v7.4s, v30.4s, v15.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v6.4s, v30.4s, v16.4s\n"
-    "ldr q26, [x17, x24]\n"
-    "fmla v2.4s, v31.4s, v11.4s\n"
-    "ldr q20, [x26, x9]\n"
-    "fmla v5.4s, v24.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v4.4s, v24.4s, v11.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v2.4s, v24.4s, v13.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "fmla v1.4s, v24.4s, v14.4s\n"
-    "ldr q18, [x21, x15]\n"
-    "fmla v8.4s, v29.4s, v9.4s\n"
-    "prfm pldl1keep, [%[inptr0], x28]\n"
-    "fmla v5.4s, v29.4s, v12.4s\n"
-    "prfm pldl1keep, [%[inptr0], x22]\n"
-    "fmla v7.4s, v29.4s, v10.4s\n"
-    "add x17, x17, #16\n"
-    "fmla v2.4s, v29.4s, v15.4s\n"
-    "prfm pldl1keep, [x17, #64]\n"
-    "fmla v4.4s, v29.4s, v13.4s\n"
-    "prfm pldl1keep, [x17, x28]\n"
-    "fmla v6.4s, v29.4s, v11.4s\n"
-    "subs x20, x20, #1\n"
-    "fmla v1.4s, v29.4s, v16.4s\n"
-    "fmla v3.4s, v29.4s, v14.4s\n"
-    "fmla v0.4s, v29.4s, v17.4s\n"
-    "ldr q22, [x14, x24]\n"
-    "fmla v7.4s, v21.4s, v12.4s\n"
-    "ldr q23, [x26, x15]\n"
-    "fmla v4.4s, v21.4s, v15.4s\n"
-    "add x14, x14, #16\n"
-    "fmla v6.4s, v21.4s, v13.4s\n"
-    "prfm pldl1keep, [x14, #64]\n"
-    "fmla v3.4s, v21.4s, v16.4s\n"
-    "ldr q24, [x21, x24]\n"
-    "fmla v2.4s, v28.4s, v10.4s\n"
-    "prfm pldl1keep, [x14, x28]\n"
-    "fmla v6.4s, v19.4s, v15.4s\n"
-    "ldr q21, [x26, x24]\n"
-    "fmla v1.4s, v28.4s, v11.4s\n"
-    "ldr q19, [%[wbptr]]\n"
-    "fmla v5.4s, v25.4s, v9.4s\n"
-    "add x21, x21, #16\n"
-    "fmla v2.4s, v25.4s, v12.4s\n"
-    "prfm pldl1keep, [x21, #64]\n"
-    "fmla v4.4s, v25.4s, v10.4s\n"
-    "add x26, x26, #16\n"
-    "fmla v1.4s, v25.4s, v13.4s\n"
-    "fmla v3.4s, v25.4s, v11.4s\n"
-    "fmla v0.4s, v25.4s, v14.4s\n"
-    "ldr q17, [%[wbptr], #16]\n"
-    "fmla v7.4s, v27.4s, v9.4s\n"
-    "ldr q25, [%[inptr0]]\n"
-    "fmla v4.4s, v27.4s, v12.4s\n"
-    "fmla v6.4s, v27.4s, v10.4s\n"
-    "fmla v1.4s, v27.4s, v15.4s\n"
-    "fmla v3.4s, v27.4s, v13.4s\n"
-    "fmla v0.4s, v27.4s, v16.4s\n"
-    "ldr q14, [%[wbptr], #64]\n"
-    "fmla v6.4s, v26.4s, v12.4s\n"
-    "ldr q27, [x17]\n"
-    "fmla v3.4s, v26.4s, v15.4s\n"
-    "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v2.4s, v20.4s, v9.4s\n"
-    "fmla v1.4s, v20.4s, v10.4s\n"
-    "fmla v0.4s, v20.4s, v11.4s\n"
-    "ldr q16, [%[wbptr], #32]\n"
-    "fmla v4.4s, v18.4s, v9.4s\n"
-    "ldr q20, [x14]\n"
-    "fmla v1.4s, v18.4s, v12.4s\n"
-    "fmla v3.4s, v18.4s, v10.4s\n"
-    "fmla v0.4s, v18.4s, v13.4s\n"
-    "ldr q11, [%[wbptr], #112]\n"
-    "fmla v6.4s, v22.4s, v9.4s\n"
-    "movi v30.16b, #0\n"
-    "fmla v3.4s, v22.4s, v12.4s\n"
-    "fmla v1.4s, v23.4s, v9.4s\n"
-    "fmla v0.4s, v22.4s, v15.4s\n"
-    "ldr q13, [%[wbptr], #80]\n"
-    "fmov v29.4s, #6.0\n"
-    "fmax v8.4s, v8.4s, v30.4s\n"
-    "fmla v3.4s, v24.4s, v9.4s\n"
-    "fmax v7.4s, v7.4s, v30.4s\n"
-    "fmla v0.4s, v23.4s, v10.4s\n"
-    "ldr q15, [%[wbptr], #48]\n"
-    "fmin v8.4s, v8.4s, v29.4s\n"
-    "ldr q22, [x17, %[input_col_stride1]]\n"
-    "fmin v7.4s, v7.4s, v29.4s\n"
-    "fmax v6.4s, v6.4s, v30.4s\n"
-    "str q8, [%[outptr0]]\n"
-    "fmla v0.4s, v24.4s, v12.4s\n"
-    "str q7, [%[outptr0], %[output_col_stride1]]\n"
-    "fmin v6.4s, v6.4s, v29.4s\n"
-    "fmax v5.4s, v5.4s, v30.4s\n"
-    "ldr q10, [%[wbptr], #128]\n"
-    "str q6, [%[outptr0], x27]\n"
-    "fmla v0.4s, v21.4s, v9.4s\n"
-    "fmin v5.4s, v5.4s, v29.4s\n"
-    "ldr q12, [%[wbptr], #96]\n"
-    "fmax v4.4s, v4.4s, v30.4s\n"
-    "ldr q28, [%[inptr0], x9]\n"
-    "str q5, [x25]\n"
-    "fmax v3.4s, v3.4s, v30.4s\n"
-    "fmin v4.4s, v4.4s, v29.4s\n"
-    "ldr q9, [%[wbptr], #144]\n"
-    "fmin v3.4s, v3.4s, v29.4s\n"
-    "ldr q23, [x21]\n"
-    "str q4, [x25, %[output_col_stride1]]\n"
-    "fmax v2.4s, v2.4s, v30.4s\n"
-    "str q3, [x25, x27]\n"
-    "fmax v1.4s, v1.4s, v30.4s\n"
-    "fmin v2.4s, v2.4s, v29.4s\n"
-    "ldr q18, [x14, %[input_col_stride1]]\n"
-    "fmin v1.4s, v1.4s, v29.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "str q2, [x13]\n"
-    "fmax v0.4s, v0.4s, v30.4s\n"
-    "str q1, [x13, %[output_col_stride1]]\n"
-    "mov v8.16b, v19.16b\n"
-    "fmin v0.4s, v0.4s, v29.4s\n"
-    "add x25, x25, #16\n"
-    "mov v5.16b, v19.16b\n"
-    "mov v7.16b, v19.16b\n"
-    "str q0, [x13, x27]\n"
-    "mov v2.16b, v19.16b\n"
-    "mov v4.16b, v19.16b\n"
-    "add x13, x13, #16\n"
-    "mov v6.16b, v19.16b\n"
-    "mov v1.16b, v19.16b\n"
-    "mov v3.16b, v19.16b\n"
-    "mov v0.16b, v19.16b\n"
-    "fmla v8.4s, v25.4s, v17.4s\n"
-    "fmla v8.4s, v27.4s, v14.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v5.4s, v27.4s, v17.4s\n"
-    "ldr q27, [x17, x9]\n"
-    "fmla v8.4s, v26.4s, v16.4s\n"
-    "ldr q30, [%[inptr0], x15]\n"
-    "fmla v7.4s, v26.4s, v17.4s\n"
-    "ldr q31, [x26]\n"
-    "fmla v5.4s, v20.4s, v14.4s\n"
-    "ldr q24, [x21, %[input_col_stride1]]\n"
-    "fmla v8.4s, v20.4s, v11.4s\n"
-    "prfm pldl1keep, [x17, x22]\n"
-    "fmla v2.4s, v20.4s, v17.4s\n"
-    "ldr q29, [x14, x9]\n"
-    "fmla v5.4s, v22.4s, v16.4s\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v8.4s, v22.4s, v13.4s\n"
-    "prfm pldl1keep, [x26, #64]\n"
-    "fmla v7.4s, v22.4s, v14.4s\n"
-    "prfm pldl1keep, [x21, x28]\n"
-    "fmla v4.4s, v22.4s, v17.4s\n"
-    "ldr q21, [x17, x15]\n"
-    "fmla v8.4s, v28.4s, v15.4s\n"
-    "prfm pldl1keep, [x14, x22]\n"
-    "fmla v7.4s, v28.4s, v16.4s\n"
-    "prfm pldl1keep, [x17, x16]\n"
-    "fmla v6.4s, v28.4s, v17.4s\n"
-    "ldr q19, [%[inptr0], x24]\n"
-    "fmla v5.4s, v23.4s, v11.4s\n"
-    "prfm pldl1keep, [%[inptr0], x23]\n"
-    "fmla v2.4s, v23.4s, v14.4s\n"
-    "ldr q28, [x26, %[input_col_stride1]]\n"
-    "fmla v8.4s, v18.4s, v10.4s\n"
-    "prfm pldl1keep, [x26, x28]\n"
-    "fmla v5.4s, v18.4s, v13.4s\n"
-    "prfm pldl1keep, [x21, x22]\n"
-    "fmla v7.4s, v18.4s, v11.4s\n"
-    "prfm pldl1keep, [x14, x16]\n"
-    "fmla v2.4s, v18.4s, v16.4s\n"
-    "prfm pldl1keep, [x17, x23]\n"
-    "fmla v4.4s, v18.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x22]\n"
-    "fmla v1.4s, v18.4s, v17.4s\n"
-    "ldr q25, [x21, x9]\n"
-    "fmla v8.4s, v27.4s, v12.4s\n"
-    "prfm pldl1keep, [x21, x16]\n"
-    "fmla v5.4s, v27.4s, v15.4s\n"
-    "prfm pldl1keep, [x14, x23]\n"
-    "fmla v7.4s, v27.4s, v13.4s\n"
-    "prfm pldl1keep, [x26, x16]\n"
-    "fmla v4.4s, v27.4s, v16.4s\n"
-    "prfm pldl1keep, [x21, x23]\n"
-    "fmla v6.4s, v27.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x23]\n"
-    "fmla v3.4s, v27.4s, v17.4s\n"
-    "ldr q27, [x14, x15]\n"
-    "fmla v7.4s, v30.4s, v15.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v6.4s, v30.4s, v16.4s\n"
-    "ldr q26, [x17, x24]\n"
-    "fmla v2.4s, v31.4s, v11.4s\n"
-    "ldr q20, [x26, x9]\n"
-    "fmla v5.4s, v24.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v4.4s, v24.4s, v11.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v2.4s, v24.4s, v13.4s\n"
-    "add x17, x17, #16\n"
-    "fmla v1.4s, v24.4s, v14.4s\n"
-    "ldr q18, [x21, x15]\n"
-    "fmla v8.4s, v29.4s, v9.4s\n"
-    "fmla v5.4s, v29.4s, v12.4s\n"
-    "fmla v7.4s, v29.4s, v10.4s\n"
-    "fmla v2.4s, v29.4s, v15.4s\n"
-    "fmla v4.4s, v29.4s, v13.4s\n"
-    "fmla v6.4s, v29.4s, v11.4s\n"
-    "fmla v1.4s, v29.4s, v16.4s\n"
-    "fmla v3.4s, v29.4s, v14.4s\n"
-    "fmla v0.4s, v29.4s, v17.4s\n"
-    "ldr q22, [x14, x24]\n"
-    "fmla v7.4s, v21.4s, v12.4s\n"
-    "ldr q23, [x26, x15]\n"
-    "fmla v4.4s, v21.4s, v15.4s\n"
-    "add x14, x14, #16\n"
-    "fmla v6.4s, v21.4s, v13.4s\n"
-    "fmla v3.4s, v21.4s, v16.4s\n"
-    "fmla v2.4s, v28.4s, v10.4s\n"
-    "ldr q24, [x21, x24]\n"
-    "fmla v1.4s, v28.4s, v11.4s\n"
-    "ldr q21, [x26, x24]\n"
-    "fmla v6.4s, v19.4s, v15.4s\n"
-    "add x21, x21, #16\n"
-    "fmla v5.4s, v25.4s, v9.4s\n"
-    "add x26, x26, #16\n"
-    "fmla v2.4s, v25.4s, v12.4s\n"
-    "fmla v4.4s, v25.4s, v10.4s\n"
-    "fmla v1.4s, v25.4s, v13.4s\n"
-    "fmla v3.4s, v25.4s, v11.4s\n"
-    "fmla v0.4s, v25.4s, v14.4s\n"
-    "fmla v7.4s, v27.4s, v9.4s\n"
-    "fmla v4.4s, v27.4s, v12.4s\n"
-    "fmla v6.4s, v27.4s, v10.4s\n"
-    "fmla v1.4s, v27.4s, v15.4s\n"
-    "fmla v3.4s, v27.4s, v13.4s\n"
-    "fmla v0.4s, v27.4s, v16.4s\n"
-    "fmla v2.4s, v20.4s, v9.4s\n"
-    "fmla v6.4s, v26.4s, v12.4s\n"
-    "fmla v4.4s, v18.4s, v9.4s\n"
-    "fmla v3.4s, v26.4s, v15.4s\n"
-    "fmla v1.4s, v20.4s, v10.4s\n"
-    "fmla v0.4s, v20.4s, v11.4s\n"
-    "movi v30.16b, #0\n"
-    "fmla v6.4s, v22.4s, v9.4s\n"
-    "fmov v29.4s, #6.0\n"
-    "fmla v1.4s, v18.4s, v12.4s\n"
-    "fmla v3.4s, v18.4s, v10.4s\n"
-    "fmla v0.4s, v18.4s, v13.4s\n"
-    "fmax v8.4s, v8.4s, v30.4s\n"
-    "fmax v7.4s, v7.4s, v30.4s\n"
-    "fmax v6.4s, v6.4s, v30.4s\n"
-    "fmla v3.4s, v22.4s, v12.4s\n"
-    "fmla v1.4s, v23.4s, v9.4s\n"
-    "fmla v0.4s, v22.4s, v15.4s\n"
-    "fmin v8.4s, v8.4s, v29.4s\n"
-    "fmin v7.4s, v7.4s, v29.4s\n"
-    "fmin v6.4s, v6.4s, v29.4s\n"
-    "str q8, [%[outptr0]]\n"
-    "fmla v3.4s, v24.4s, v9.4s\n"
-    "str q7, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v0.4s, v23.4s, v10.4s\n"
-    "str q6, [%[outptr0], x27]\n"
-    "fmax v5.4s, v5.4s, v30.4s\n"
-    "fmax v4.4s, v4.4s, v30.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v0.4s, v24.4s, v12.4s\n"
-    "fmin v5.4s, v5.4s, v29.4s\n"
-    "fmin v4.4s, v4.4s, v29.4s\n"
-    "fmax v3.4s, v3.4s, v30.4s\n"
-    "str q5, [x25]\n"
-    "fmax v2.4s, v2.4s, v30.4s\n"
-    "str q4, [x25, %[output_col_stride1]]\n"
-    "fmla v0.4s, v21.4s, v9.4s\n"
-    "fmin v3.4s, v3.4s, v29.4s\n"
-    "fmin v2.4s, v2.4s, v29.4s\n"
-    "fmax v1.4s, v1.4s, v30.4s\n"
-    "str q3, [x25, x27]\n"
-    "str q2, [x13]\n"
-    "fmin v1.4s, v1.4s, v29.4s\n"
-    "fmax v0.4s, v0.4s, v30.4s\n"
-    "add x25, x25, #16\n"
-    "str q1, [x13, %[output_col_stride1]]\n"
-    "fmin v0.4s, v0.4s, v29.4s\n"
-    "str q0, [x13, x27]\n"
-    "add x13, x13, #16\n"
-    "4:\n"
-    "cbz x19, 7f\n"
-    "ldr s19, [%[wbptr]]\n"
-    "mov v8.16b, v19.16b\n"
-    "ldr s17, [%[wbptr], #4]\n"
-    "mov v5.16b, v19.16b\n"
-    "ldr s16, [%[wbptr], #8]\n"
-    "mov v7.16b, v19.16b\n"
-    "ldr s15, [%[wbptr], #12]\n"
-    "mov v2.16b, v19.16b\n"
-    "ldr s14, [%[wbptr], #16]\n"
-    "mov v4.16b, v19.16b\n"
-    "ldr s13, [%[wbptr], #20]\n"
-    "mov v6.16b, v19.16b\n"
-    "ldr s12, [%[wbptr], #24]\n"
-    "mov v1.16b, v19.16b\n"
-    "ldr s11, [%[wbptr], #28]\n"
-    "mov v3.16b, v19.16b\n"
-    "ldr s10, [%[wbptr], #32]\n"
-    "mov v0.16b, v19.16b\n"
-    "ldr s9, [%[wbptr], #36]\n"
-    "ldr s25, [%[inptr0]]\n"
-    "subs x19, x19, #1\n"
-    "fmla v8.4s, v25.4s, v17.4s\n"
-    "ldr s27, [x17]\n"
-    "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr s20, [x14]\n"
-    "ldr s22, [x17, %[input_col_stride1]]\n"
-    "ldr s28, [%[inptr0], x9]\n"
-    "fmla v8.4s, v27.4s, v14.4s\n"
-    "ldr s23, [x21]\n"
-    "ldr s18, [x14, %[input_col_stride1]]\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "prfm pldl1keep, [x17, #64]\n"
-    "prfm pldl1keep, [%[inptr0], x28]\n"
-    "prfm pldl1keep, [x14, #64]\n"
-    "prfm pldl1keep, [x17, x28]\n"
-    "prfm pldl1keep, [%[inptr0], x22]\n"
-    "prfm pldl1keep, [x21, #64]\n"
-    "prfm pldl1keep, [x14, x28]\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v5.4s, v27.4s, v17.4s\n"
-    "ldr s27, [x17, x9]\n"
-    "fmla v8.4s, v26.4s, v16.4s\n"
-    "ldr s30, [%[inptr0], x15]\n"
-    "fmla v7.4s, v26.4s, v17.4s\n"
-    "ldr s31, [x26]\n"
-    "fmla v5.4s, v20.4s, v14.4s\n"
-    "ldr s24, [x21, %[input_col_stride1]]\n"
-    "fmla v8.4s, v20.4s, v11.4s\n"
-    "prfm pldl1keep, [x17, x22]\n"
-    "fmla v2.4s, v20.4s, v17.4s\n"
-    "ldr s29, [x14, x9]\n"
-    "fmla v5.4s, v22.4s, v16.4s\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v8.4s, v22.4s, v13.4s\n"
-    "prfm pldl1keep, [x26, #64]\n"
-    "fmla v7.4s, v22.4s, v14.4s\n"
-    "prfm pldl1keep, [x21, x28]\n"
-    "fmla v4.4s, v22.4s, v17.4s\n"
-    "ldr s21, [x17, x15]\n"
-    "fmla v8.4s, v28.4s, v15.4s\n"
-    "prfm pldl1keep, [x14, x22]\n"
-    "fmla v7.4s, v28.4s, v16.4s\n"
-    "prfm pldl1keep, [x17, x16]\n"
-    "fmla v6.4s, v28.4s, v17.4s\n"
-    "ldr s19, [%[inptr0], x24]\n"
-    "fmla v5.4s, v23.4s, v11.4s\n"
-    "prfm pldl1keep, [%[inptr0], x23]\n"
-    "fmla v2.4s, v23.4s, v14.4s\n"
-    "ldr s28, [x26, %[input_col_stride1]]\n"
-    "fmla v8.4s, v18.4s, v10.4s\n"
-    "prfm pldl1keep, [x26, x28]\n"
-    "fmla v5.4s, v18.4s, v13.4s\n"
-    "prfm pldl1keep, [x21, x22]\n"
-    "fmla v7.4s, v18.4s, v11.4s\n"
-    "prfm pldl1keep, [x14, x16]\n"
-    "fmla v2.4s, v18.4s, v16.4s\n"
-    "prfm pldl1keep, [x17, x23]\n"
-    "fmla v4.4s, v18.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x22]\n"
-    "fmla v1.4s, v18.4s, v17.4s\n"
-    "ldr s25, [x21, x9]\n"
-    "fmla v8.4s, v27.4s, v12.4s\n"
-    "prfm pldl1keep, [x21, x16]\n"
-    "fmla v5.4s, v27.4s, v15.4s\n"
-    "prfm pldl1keep, [x14, x23]\n"
-    "fmla v7.4s, v27.4s, v13.4s\n"
-    "prfm pldl1keep, [x26, x16]\n"
-    "fmla v4.4s, v27.4s, v16.4s\n"
-    "prfm pldl1keep, [x21, x23]\n"
-    "fmla v6.4s, v27.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x23]\n"
-    "fmla v3.4s, v27.4s, v17.4s\n"
-    "ldr s27, [x14, x15]\n"
-    "fmla v7.4s, v30.4s, v15.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v6.4s, v30.4s, v16.4s\n"
-    "ldr s26, [x17, x24]\n"
-    "fmla v2.4s, v31.4s, v11.4s\n"
-    "ldr s20, [x26, x9]\n"
-    "fmla v5.4s, v24.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v4.4s, v24.4s, v11.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v2.4s, v24.4s, v13.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "fmla v1.4s, v24.4s, v14.4s\n"
-    "ldr s18, [x21, x15]\n"
-    "fmla v8.4s, v29.4s, v9.4s\n"
-    "prfm pldl1keep, [%[inptr0], x28]\n"
-    "fmla v5.4s, v29.4s, v12.4s\n"
-    "prfm pldl1keep, [%[inptr0], x22]\n"
-    "fmla v7.4s, v29.4s, v10.4s\n"
-    "add x17, x17, #4\n"
-    "fmla v2.4s, v29.4s, v15.4s\n"
-    "prfm pldl1keep, [x17, #64]\n"
-    "fmla v4.4s, v29.4s, v13.4s\n"
-    "prfm pldl1keep, [x17, x28]\n"
-    "fmla v6.4s, v29.4s, v11.4s\n"
-    "subs x19, x19, #1\n"
-    "fmla v1.4s, v29.4s, v16.4s\n"
-    "fmla v3.4s, v29.4s, v14.4s\n"
-    "fmla v0.4s, v29.4s, v17.4s\n"
-    "ldr s22, [x14, x24]\n"
-    "fmla v7.4s, v21.4s, v12.4s\n"
-    "ldr s23, [x26, x15]\n"
-    "fmla v4.4s, v21.4s, v15.4s\n"
-    "add x14, x14, #4\n"
-    "fmla v6.4s, v21.4s, v13.4s\n"
-    "prfm pldl1keep, [x14, #64]\n"
-    "fmla v3.4s, v21.4s, v16.4s\n"
-    "ldr s24, [x21, x24]\n"
-    "fmla v2.4s, v28.4s, v10.4s\n"
-    "prfm pldl1keep, [x14, x28]\n"
-    "fmla v6.4s, v19.4s, v15.4s\n"
-    "ldr s21, [x26, x24]\n"
-    "fmla v1.4s, v28.4s, v11.4s\n"
-    "ldr s19, [%[wbptr]]\n"
-    "fmla v5.4s, v25.4s, v9.4s\n"
-    "add x21, x21, #4\n"
-    "fmla v2.4s, v25.4s, v12.4s\n"
-    "prfm pldl1keep, [x21, #64]\n"
-    "fmla v4.4s, v25.4s, v10.4s\n"
-    "add x26, x26, #4\n"
-    "fmla v1.4s, v25.4s, v13.4s\n"
-    "fmla v3.4s, v25.4s, v11.4s\n"
-    "fmla v0.4s, v25.4s, v14.4s\n"
-    "ldr s17, [%[wbptr], #4]\n"
-    "fmla v7.4s, v27.4s, v9.4s\n"
-    "ldr s25, [%[inptr0]]\n"
-    "fmla v4.4s, v27.4s, v12.4s\n"
-    "fmla v6.4s, v27.4s, v10.4s\n"
-    "fmla v1.4s, v27.4s, v15.4s\n"
-    "fmla v3.4s, v27.4s, v13.4s\n"
-    "fmla v0.4s, v27.4s, v16.4s\n"
-    "ldr s14, [%[wbptr], #16]\n"
-    "fmla v6.4s, v26.4s, v12.4s\n"
-    "ldr s27, [x17]\n"
-    "fmla v3.4s, v26.4s, v15.4s\n"
-    "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v2.4s, v20.4s, v9.4s\n"
-    "fmla v1.4s, v20.4s, v10.4s\n"
-    "fmla v0.4s, v20.4s, v11.4s\n"
-    "ldr s16, [%[wbptr], #8]\n"
-    "fmla v4.4s, v18.4s, v9.4s\n"
-    "ldr s20, [x14]\n"
-    "fmla v1.4s, v18.4s, v12.4s\n"
-    "fmla v3.4s, v18.4s, v10.4s\n"
-    "fmla v0.4s, v18.4s, v13.4s\n"
-    "ldr s11, [%[wbptr], #28]\n"
-    "fmla v6.4s, v22.4s, v9.4s\n"
-    "movi v30.16b, #0\n"
-    "fmla v3.4s, v22.4s, v12.4s\n"
-    "fmla v1.4s, v23.4s, v9.4s\n"
-    "fmla v0.4s, v22.4s, v15.4s\n"
-    "ldr s13, [%[wbptr], #20]\n"
-    "fmov v29.4s, #6.0\n"
-    "fmax v8.4s, v8.4s, v30.4s\n"
-    "fmla v3.4s, v24.4s, v9.4s\n"
-    "fmax v7.4s, v7.4s, v30.4s\n"
-    "fmla v0.4s, v23.4s, v10.4s\n"
-    "ldr s15, [%[wbptr], #12]\n"
-    "fmin v8.4s, v8.4s, v29.4s\n"
-    "ldr s22, [x17, %[input_col_stride1]]\n"
-    "fmin v7.4s, v7.4s, v29.4s\n"
-    "fmax v6.4s, v6.4s, v30.4s\n"
-    "str s8, [%[outptr0]]\n"
-    "fmla v0.4s, v24.4s, v12.4s\n"
-    "str s7, [%[outptr0], %[output_col_stride1]]\n"
-    "fmin v6.4s, v6.4s, v29.4s\n"
-    "fmax v5.4s, v5.4s, v30.4s\n"
-    "ldr s10, [%[wbptr], #32]\n"
-    "str s6, [%[outptr0], x27]\n"
-    "fmla v0.4s, v21.4s, v9.4s\n"
-    "fmin v5.4s, v5.4s, v29.4s\n"
-    "ldr s12, [%[wbptr], #24]\n"
-    "fmax v4.4s, v4.4s, v30.4s\n"
-    "ldr s28, [%[inptr0], x9]\n"
-    "str s5, [x25]\n"
-    "fmax v3.4s, v3.4s, v30.4s\n"
-    "fmin v4.4s, v4.4s, v29.4s\n"
-    "ldr s9, [%[wbptr], #36]\n"
-    "fmin v3.4s, v3.4s, v29.4s\n"
-    "ldr s23, [x21]\n"
-    "str s4, [x25, %[output_col_stride1]]\n"
-    "fmax v2.4s, v2.4s, v30.4s\n"
-    "str s3, [x25, x27]\n"
-    "fmax v1.4s, v1.4s, v30.4s\n"
-    "fmin v2.4s, v2.4s, v29.4s\n"
-    "ldr s18, [x14, %[input_col_stride1]]\n"
-    "fmin v1.4s, v1.4s, v29.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "str s2, [x13]\n"
-    "fmax v0.4s, v0.4s, v30.4s\n"
-    "str s1, [x13, %[output_col_stride1]]\n"
-    "mov v8.16b, v19.16b\n"
-    "fmin v0.4s, v0.4s, v29.4s\n"
-    "add x25, x25, #4\n"
-    "mov v5.16b, v19.16b\n"
-    "mov v7.16b, v19.16b\n"
-    "str s0, [x13, x27]\n"
-    "mov v2.16b, v19.16b\n"
-    "mov v4.16b, v19.16b\n"
-    "add x13, x13, #4\n"
-    "mov v6.16b, v19.16b\n"
-    "mov v1.16b, v19.16b\n"
-    "mov v3.16b, v19.16b\n"
-    "mov v0.16b, v19.16b\n"
-    "fmla v8.4s, v25.4s, v17.4s\n"
-    "fmla v8.4s, v27.4s, v14.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v5.4s, v27.4s, v17.4s\n"
-    "ldr s27, [x17, x9]\n"
-    "fmla v8.4s, v26.4s, v16.4s\n"
-    "ldr s30, [%[inptr0], x15]\n"
-    "fmla v7.4s, v26.4s, v17.4s\n"
-    "ldr s31, [x26]\n"
-    "fmla v5.4s, v20.4s, v14.4s\n"
-    "ldr s24, [x21, %[input_col_stride1]]\n"
-    "fmla v8.4s, v20.4s, v11.4s\n"
-    "prfm pldl1keep, [x17, x22]\n"
-    "fmla v2.4s, v20.4s, v17.4s\n"
-    "ldr s29, [x14, x9]\n"
-    "fmla v5.4s, v22.4s, v16.4s\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v8.4s, v22.4s, v13.4s\n"
-    "prfm pldl1keep, [x26, #64]\n"
-    "fmla v7.4s, v22.4s, v14.4s\n"
-    "prfm pldl1keep, [x21, x28]\n"
-    "fmla v4.4s, v22.4s, v17.4s\n"
-    "ldr s21, [x17, x15]\n"
-    "fmla v8.4s, v28.4s, v15.4s\n"
-    "prfm pldl1keep, [x14, x22]\n"
-    "fmla v7.4s, v28.4s, v16.4s\n"
-    "prfm pldl1keep, [x17, x16]\n"
-    "fmla v6.4s, v28.4s, v17.4s\n"
-    "ldr s19, [%[inptr0], x24]\n"
-    "fmla v5.4s, v23.4s, v11.4s\n"
-    "prfm pldl1keep, [%[inptr0], x23]\n"
-    "fmla v2.4s, v23.4s, v14.4s\n"
-    "ldr s28, [x26, %[input_col_stride1]]\n"
-    "fmla v8.4s, v18.4s, v10.4s\n"
-    "prfm pldl1keep, [x26, x28]\n"
-    "fmla v5.4s, v18.4s, v13.4s\n"
-    "prfm pldl1keep, [x21, x22]\n"
-    "fmla v7.4s, v18.4s, v11.4s\n"
-    "prfm pldl1keep, [x14, x16]\n"
-    "fmla v2.4s, v18.4s, v16.4s\n"
-    "prfm pldl1keep, [x17, x23]\n"
-    "fmla v4.4s, v18.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x22]\n"
-    "fmla v1.4s, v18.4s, v17.4s\n"
-    "ldr s25, [x21, x9]\n"
-    "fmla v8.4s, v27.4s, v12.4s\n"
-    "prfm pldl1keep, [x21, x16]\n"
-    "fmla v5.4s, v27.4s, v15.4s\n"
-    "prfm pldl1keep, [x14, x23]\n"
-    "fmla v7.4s, v27.4s, v13.4s\n"
-    "prfm pldl1keep, [x26, x16]\n"
-    "fmla v4.4s, v27.4s, v16.4s\n"
-    "prfm pldl1keep, [x21, x23]\n"
-    "fmla v6.4s, v27.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x23]\n"
-    "fmla v3.4s, v27.4s, v17.4s\n"
-    "ldr s27, [x14, x15]\n"
-    "fmla v7.4s, v30.4s, v15.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v6.4s, v30.4s, v16.4s\n"
-    "ldr s26, [x17, x24]\n"
-    "fmla v2.4s, v31.4s, v11.4s\n"
-    "ldr s20, [x26, x9]\n"
-    "fmla v5.4s, v24.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v4.4s, v24.4s, v11.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v2.4s, v24.4s, v13.4s\n"
-    "add x17, x17, #4\n"
-    "fmla v1.4s, v24.4s, v14.4s\n"
-    "ldr s18, [x21, x15]\n"
-    "fmla v8.4s, v29.4s, v9.4s\n"
-    "fmla v5.4s, v29.4s, v12.4s\n"
-    "fmla v7.4s, v29.4s, v10.4s\n"
-    "fmla v2.4s, v29.4s, v15.4s\n"
-    "fmla v4.4s, v29.4s, v13.4s\n"
-    "fmla v6.4s, v29.4s, v11.4s\n"
-    "fmla v1.4s, v29.4s, v16.4s\n"
-    "fmla v3.4s, v29.4s, v14.4s\n"
-    "fmla v0.4s, v29.4s, v17.4s\n"
-    "ldr s22, [x14, x24]\n"
-    "fmla v7.4s, v21.4s, v12.4s\n"
-    "ldr s23, [x26, x15]\n"
-    "fmla v4.4s, v21.4s, v15.4s\n"
-    "add x14, x14, #4\n"
-    "fmla v6.4s, v21.4s, v13.4s\n"
-    "fmla v3.4s, v21.4s, v16.4s\n"
-    "fmla v2.4s, v28.4s, v10.4s\n"
-    "ldr s24, [x21, x24]\n"
-    "fmla v1.4s, v28.4s, v11.4s\n"
-    "ldr s21, [x26, x24]\n"
-    "fmla v6.4s, v19.4s, v15.4s\n"
-    "add x21, x21, #4\n"
-    "fmla v5.4s, v25.4s, v9.4s\n"
-    "add x26, x26, #4\n"
-    "fmla v2.4s, v25.4s, v12.4s\n"
-    "fmla v4.4s, v25.4s, v10.4s\n"
-    "fmla v1.4s, v25.4s, v13.4s\n"
-    "fmla v3.4s, v25.4s, v11.4s\n"
-    "fmla v0.4s, v25.4s, v14.4s\n"
-    "fmla v7.4s, v27.4s, v9.4s\n"
-    "fmla v4.4s, v27.4s, v12.4s\n"
-    "fmla v6.4s, v27.4s, v10.4s\n"
-    "fmla v1.4s, v27.4s, v15.4s\n"
-    "fmla v3.4s, v27.4s, v13.4s\n"
-    "fmla v0.4s, v27.4s, v16.4s\n"
-    "fmla v2.4s, v20.4s, v9.4s\n"
-    "fmla v6.4s, v26.4s, v12.4s\n"
-    "fmla v4.4s, v18.4s, v9.4s\n"
-    "fmla v3.4s, v26.4s, v15.4s\n"
-    "fmla v1.4s, v20.4s, v10.4s\n"
-    "fmla v0.4s, v20.4s, v11.4s\n"
-    "movi v30.16b, #0\n"
-    "fmla v6.4s, v22.4s, v9.4s\n"
-    "fmov v29.4s, #6.0\n"
-    "fmla v1.4s, v18.4s, v12.4s\n"
-    "fmla v3.4s, v18.4s, v10.4s\n"
-    "fmla v0.4s, v18.4s, v13.4s\n"
-    "fmax v8.4s, v8.4s, v30.4s\n"
-    "fmax v7.4s, v7.4s, v30.4s\n"
-    "fmax v6.4s, v6.4s, v30.4s\n"
-    "fmla v3.4s, v22.4s, v12.4s\n"
-    "fmla v1.4s, v23.4s, v9.4s\n"
-    "fmla v0.4s, v22.4s, v15.4s\n"
-    "fmin v8.4s, v8.4s, v29.4s\n"
-    "fmin v7.4s, v7.4s, v29.4s\n"
-    "fmin v6.4s, v6.4s, v29.4s\n"
-    "str s8, [%[outptr0]]\n"
-    "fmla v3.4s, v24.4s, v9.4s\n"
-    "str s7, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v0.4s, v23.4s, v10.4s\n"
-    "str s6, [%[outptr0], x27]\n"
-    "fmax v5.4s, v5.4s, v30.4s\n"
-    "fmax v4.4s, v4.4s, v30.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v0.4s, v24.4s, v12.4s\n"
-    "fmin v5.4s, v5.4s, v29.4s\n"
-    "fmin v4.4s, v4.4s, v29.4s\n"
-    "fmax v3.4s, v3.4s, v30.4s\n"
-    "str s5, [x25]\n"
-    "fmax v2.4s, v2.4s, v30.4s\n"
-    "str s4, [x25, %[output_col_stride1]]\n"
-    "fmla v0.4s, v21.4s, v9.4s\n"
-    "fmin v3.4s, v3.4s, v29.4s\n"
-    "fmin v2.4s, v2.4s, v29.4s\n"
-    "fmax v1.4s, v1.4s, v30.4s\n"
-    "str s3, [x25, x27]\n"
-    "str s2, [x13]\n"
-    "fmin v1.4s, v1.4s, v29.4s\n"
-    "fmax v0.4s, v0.4s, v30.4s\n"
-    "add x25, x25, #4\n"
-    "str s1, [x13, %[output_col_stride1]]\n"
-    "fmin v0.4s, v0.4s, v29.4s\n"
-    "str s0, [x13, x27]\n"
-    "add x13, x13, #4\n"
-    "7:\n"
-    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
-    : [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
-#endif  // __aarch64__
-
-template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
-
-}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
deleted file mode 100644
index b798b8c..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ /dev/null
@@ -1,769 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
-  int n_channels,
-  const void* weight_bias_ptr,
-  const float* input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float* output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x15, %[inptr0], %[input_row_stride]\n"
-    "add x26, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x21, %[outptr0], %[output_row_stride]\n"
-    "add x16, x15, %[input_row_stride]\n"
-    "add x27, x26, %[input_col_stride1]\n"
-    "add x22, x21, %[output_row_stride]\n"
-    "add x17, x16, %[input_row_stride]\n"
-    "add x28, x27, %[input_col_stride1]\n"
-    "add x23, %[output_col_stride1], %[output_col_stride1]\n"
-    "add x9, x17, %[input_row_stride]\n"
-    "add x13, x28, %[input_col_stride1]\n"
-    "and x24, %[n_channels], #3\n"
-    "add x19, x9, %[input_row_stride]\n"
-    "add x14, x13, %[input_col_stride1]\n"
-    "lsr x25, %[n_channels], #2\n"
-    "add x20, x19, %[input_row_stride]\n"
-    "cbz x25, 4f\n"
-    "1:\n"
-    "ldr q27, [%[wbptr]]\n"
-    "subs x25, x25, #1\n"
-    "mov v17.16b, v27.16b\n"
-    "ldr q6, [%[wbptr], #16]\n"
-    "mov v16.16b, v27.16b\n"
-    "ldr q14, [%[wbptr], #32]\n"
-    "mov v15.16b, v27.16b\n"
-    "ldr q13, [%[wbptr], #48]\n"
-    "mov v2.16b, v27.16b\n"
-    "ldr q12, [%[wbptr], #64]\n"
-    "mov v4.16b, v27.16b\n"
-    "ldr q11, [%[wbptr], #80]\n"
-    "mov v5.16b, v27.16b\n"
-    "ldr q10, [%[wbptr], #96]\n"
-    "mov v1.16b, v27.16b\n"
-    "ldr q9, [%[wbptr], #112]\n"
-    "mov v3.16b, v27.16b\n"
-    "ldr q8, [%[wbptr], #128]\n"
-    "mov v0.16b, v27.16b\n"
-    "ldr q7, [%[wbptr], #144]\n"
-    "ldr q29, [%[inptr0]]\n"
-    "ldr q28, [x15]\n"
-    "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr q22, [x16]\n"
-    "ldr q20, [x15, %[input_col_stride1]]\n"
-    "ldr q19, [%[inptr0], x26]\n"
-    "ldr q30, [x17]\n"
-    "ldr q18, [x16, %[input_col_stride1]]\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v17.4s, v29.4s, v6.4s\n"
-    "ldr q21, [x15, x26]\n"
-    "fmla v16.4s, v22.4s, v6.4s\n"
-    "ldr q27, [%[inptr0], x27]\n"
-    "fmla v15.4s, v19.4s, v6.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v17.4s, v28.4s, v12.4s\n"
-    "ldr q25, [x9]\n"
-    "fmla v16.4s, v30.4s, v12.4s\n"
-    "ldr q24, [x17, %[input_col_stride1]]\n"
-    "fmla v15.4s, v21.4s, v12.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v17.4s, v26.4s, v14.4s\n"
-    "ldr q23, [x16, x26]\n"
-    "fmla v16.4s, v18.4s, v14.4s\n"
-    "subs x25, x25, #1\n"
-    "fmla v15.4s, v27.4s, v14.4s\n"
-    "ldr q26, [x15, x27]\n"
-    "fmla v17.4s, v22.4s, v9.4s\n"
-    "ldr q22, [%[inptr0], x28]\n"
-    "fmla v16.4s, v25.4s, v9.4s\n"
-    "fmla v2.4s, v25.4s, v6.4s\n"
-    "fmla v15.4s, v23.4s, v9.4s\n"
-    "ldr q30, [x19]\n"
-    "fmla v17.4s, v20.4s, v11.4s\n"
-    "ldr q29, [x9, %[input_col_stride1]]\n"
-    "fmla v16.4s, v24.4s, v11.4s\n"
-    "ldr q28, [x17, x26]\n"
-    "fmla v4.4s, v23.4s, v6.4s\n"
-    "fmla v15.4s, v26.4s, v11.4s\n"
-    "fmla v17.4s, v19.4s, v13.4s\n"
-    "ldr q24, [x16, x27]\n"
-    "fmla v16.4s, v23.4s, v13.4s\n"
-    "ldr q25, [x15, x28]\n"
-    "fmla v15.4s, v22.4s, v13.4s\n"
-    "fmla v5.4s, v22.4s, v6.4s\n"
-    "fmla v17.4s, v18.4s, v8.4s\n"
-    "ldr q19, [%[inptr0], x13]\n"
-    "fmla v2.4s, v30.4s, v12.4s\n"
-    "ldr q18, [x20]\n"
-    "fmla v16.4s, v29.4s, v8.4s\n"
-    "ldr q22, [x19, %[input_col_stride1]]\n"
-    "fmla v17.4s, v21.4s, v10.4s\n"
-    "ldr q26, [x9, x26]\n"
-    "fmla v2.4s, v29.4s, v14.4s\n"
-    "ldr q20, [x17, x27]\n"
-    "fmla v16.4s, v28.4s, v10.4s\n"
-    "fmla v4.4s, v28.4s, v12.4s\n"
-    "fmla v17.4s, v23.4s, v7.4s\n"
-    "ldr q27, [x16, x28]\n"
-    "fmla v15.4s, v24.4s, v8.4s\n"
-    "ldr q30, [x15, x13]\n"
-    "fmla v4.4s, v24.4s, v14.4s\n"
-    "ldr q24, [%[inptr0], x14]\n"
-    "str q17, [%[outptr0]]\n"
-    "fmla v5.4s, v25.4s, v12.4s\n"
-    "fmla v15.4s, v25.4s, v10.4s\n"
-    "ldr q28, [x20, %[input_col_stride1]]\n"
-    "fmla v2.4s, v18.4s, v9.4s\n"
-    "ldr q17, [x19, x26]\n"
-    "fmla v5.4s, v19.4s, v14.4s\n"
-    "ldr q18, [x9, x27]\n"
-    "fmla v16.4s, v26.4s, v7.4s\n"
-    "ldr q25, [x17, x28]\n"
-    "fmla v2.4s, v22.4s, v11.4s\n"
-    "ldr q22, [x16, x13]\n"
-    "fmla v4.4s, v26.4s, v9.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "str q16, [x21]\n"
-    "fmla v1.4s, v26.4s, v6.4s\n"
-    "fmla v2.4s, v26.4s, v13.4s\n"
-    "ldr q21, [x15, x14]\n"
-    "fmla v4.4s, v20.4s, v11.4s\n"
-    "ldr q23, [x20, x26]\n"
-    "fmla v15.4s, v27.4s, v7.4s\n"
-    "ldr q19, [x19, x27]\n"
-    "fmla v5.4s, v27.4s, v9.4s\n"
-    "add x15, x15, #16\n"
-    "fmla v4.4s, v27.4s, v13.4s\n"
-    "fmla v3.4s, v27.4s, v6.4s\n"
-    "str q15, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v2.4s, v28.4s, v8.4s\n"
-    "fmla v5.4s, v30.4s, v11.4s\n"
-    "ldr q29, [x9, x28]\n"
-    "fmla v1.4s, v17.4s, v12.4s\n"
-    "ldr q27, [x17, x13]\n"
-    "fmla v2.4s, v17.4s, v10.4s\n"
-    "ldr q28, [x16, x14]\n"
-    "fmla v5.4s, v24.4s, v13.4s\n"
-    "ldr q26, [x20, x27]\n"
-    "fmla v4.4s, v18.4s, v8.4s\n"
-    "ldr q20, [x19, x28]\n"
-    "fmla v1.4s, v18.4s, v14.4s\n"
-    "ldr q17, [x9, x13]\n"
-    "fmla v3.4s, v25.4s, v12.4s\n"
-    "ldr q18, [x17, x14]\n"
-    "fmla v4.4s, v25.4s, v10.4s\n"
-    "ldr q16, [x20, x28]\n"
-    "fmla v5.4s, v22.4s, v8.4s\n"
-    "add x16, x16, #16\n"
-    "fmla v3.4s, v22.4s, v14.4s\n"
-    "ldr q15, [x19, x13]\n"
-    "fmla v2.4s, v23.4s, v7.4s\n"
-    "add x17, x17, #16\n"
-    "fmla v5.4s, v21.4s, v10.4s\n"
-    "ldr q21, [x9, x14]\n"
-    "fmla v1.4s, v23.4s, v9.4s\n"
-    "ldr q23, [x20, x13]\n"
-    "str q2, [x22]\n"
-    "fmla v4.4s, v29.4s, v7.4s\n"
-    "fmla v3.4s, v29.4s, v9.4s\n"
-    "ldr q24, [x19, x14]\n"
-    "fmla v1.4s, v19.4s, v11.4s\n"
-    "ldr q25, [x20, x14]\n"
-    "str q4, [x21, %[output_col_stride1]]\n"
-    "fmla v0.4s, v29.4s, v6.4s\n"
-    "fmla v3.4s, v27.4s, v11.4s\n"
-    "ldr q27, [%[wbptr]]\n"
-    "fmla v1.4s, v29.4s, v13.4s\n"
-    "ldr q29, [%[inptr0]]\n"
-    "fmla v5.4s, v28.4s, v7.4s\n"
-    "ldr q6, [%[wbptr], #16]\n"
-    "fmla v3.4s, v28.4s, v13.4s\n"
-    "ldr q28, [x15]\n"
-    "fmla v1.4s, v26.4s, v8.4s\n"
-    "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
-    "str q5, [%[outptr0], x23]\n"
-    "fmla v0.4s, v20.4s, v12.4s\n"
-    "fmla v3.4s, v17.4s, v8.4s\n"
-    "ldr q22, [x16]\n"
-    "fmla v1.4s, v20.4s, v10.4s\n"
-    "ldr q20, [x15, %[input_col_stride1]]\n"
-    "fmla v0.4s, v17.4s, v14.4s\n"
-    "ldr q12, [%[wbptr], #64]\n"
-    "fmla v3.4s, v18.4s, v10.4s\n"
-    "ldr q19, [%[inptr0], x26]\n"
-    "fmla v1.4s, v16.4s, v7.4s\n"
-    "ldr q30, [x17]\n"
-    "fmla v0.4s, v16.4s, v9.4s\n"
-    "ldr q14, [%[wbptr], #32]\n"
-    "fmla v3.4s, v21.4s, v7.4s\n"
-    "ldr q18, [x16, %[input_col_stride1]]\n"
-    "str q1, [x22, %[output_col_stride1]]\n"
-    "mov v17.16b, v27.16b\n"
-    "fmla v0.4s, v15.4s, v11.4s\n"
-    "ldr q9, [%[wbptr], #112]\n"
-    "str q3, [x21, x23]\n"
-    "mov v16.16b, v27.16b\n"
-    "mov v15.16b, v27.16b\n"
-    "add x9, x9, #16\n"
-    "fmla v0.4s, v21.4s, v13.4s\n"
-    "ldr q11, [%[wbptr], #80]\n"
-    "mov v2.16b, v27.16b\n"
-    "add x19, x19, #16\n"
-    "mov v4.16b, v27.16b\n"
-    "add x20, x20, #16\n"
-    "fmla v0.4s, v23.4s, v8.4s\n"
-    "ldr q13, [%[wbptr], #48]\n"
-    "mov v5.16b, v27.16b\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "mov v1.16b, v27.16b\n"
-    "add x21, x21, #16\n"
-    "fmla v0.4s, v24.4s, v10.4s\n"
-    "ldr q8, [%[wbptr], #128]\n"
-    "mov v3.16b, v27.16b\n"
-    "fmla v0.4s, v25.4s, v7.4s\n"
-    "ldr q10, [%[wbptr], #96]\n"
-    "str q0, [x22, x23]\n"
-    "mov v0.16b, v27.16b\n"
-    "ldr q7, [%[wbptr], #144]\n"
-    "add x22, x22, #16\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v17.4s, v29.4s, v6.4s\n"
-    "ldr q21, [x15, x26]\n"
-    "fmla v16.4s, v22.4s, v6.4s\n"
-    "ldr q27, [%[inptr0], x27]\n"
-    "fmla v15.4s, v19.4s, v6.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v17.4s, v28.4s, v12.4s\n"
-    "ldr q25, [x9]\n"
-    "fmla v16.4s, v30.4s, v12.4s\n"
-    "ldr q24, [x17, %[input_col_stride1]]\n"
-    "fmla v15.4s, v21.4s, v12.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v17.4s, v26.4s, v14.4s\n"
-    "ldr q23, [x16, x26]\n"
-    "fmla v16.4s, v18.4s, v14.4s\n"
-    "fmla v2.4s, v25.4s, v6.4s\n"
-    "fmla v15.4s, v27.4s, v14.4s\n"
-    "ldr q26, [x15, x27]\n"
-    "fmla v17.4s, v22.4s, v9.4s\n"
-    "ldr q22, [%[inptr0], x28]\n"
-    "fmla v16.4s, v25.4s, v9.4s\n"
-    "ldr q30, [x19]\n"
-    "fmla v15.4s, v23.4s, v9.4s\n"
-    "fmla v4.4s, v23.4s, v6.4s\n"
-    "fmla v17.4s, v20.4s, v11.4s\n"
-    "ldr q29, [x9, %[input_col_stride1]]\n"
-    "fmla v16.4s, v24.4s, v11.4s\n"
-    "ldr q28, [x17, x26]\n"
-    "fmla v15.4s, v26.4s, v11.4s\n"
-    "ldr q24, [x16, x27]\n"
-    "fmla v17.4s, v19.4s, v13.4s\n"
-    "ldr q25, [x15, x28]\n"
-    "fmla v16.4s, v23.4s, v13.4s\n"
-    "fmla v5.4s, v22.4s, v6.4s\n"
-    "fmla v15.4s, v22.4s, v13.4s\n"
-    "ldr q19, [%[inptr0], x13]\n"
-    "fmla v17.4s, v18.4s, v8.4s\n"
-    "ldr q18, [x20]\n"
-    "fmla v2.4s, v30.4s, v12.4s\n"
-    "ldr q22, [x19, %[input_col_stride1]]\n"
-    "fmla v16.4s, v29.4s, v8.4s\n"
-    "fmla v4.4s, v28.4s, v12.4s\n"
-    "fmla v17.4s, v21.4s, v10.4s\n"
-    "ldr q26, [x9, x26]\n"
-    "fmla v2.4s, v29.4s, v14.4s\n"
-    "ldr q20, [x17, x27]\n"
-    "fmla v16.4s, v28.4s, v10.4s\n"
-    "ldr q27, [x16, x28]\n"
-    "fmla v17.4s, v23.4s, v7.4s\n"
-    "ldr q30, [x15, x13]\n"
-    "fmla v15.4s, v24.4s, v8.4s\n"
-    "fmla v4.4s, v24.4s, v14.4s\n"
-    "fmla v5.4s, v25.4s, v12.4s\n"
-    "ldr q24, [%[inptr0], x14]\n"
-    "str q17, [%[outptr0]]\n"
-    "fmla v2.4s, v18.4s, v9.4s\n"
-    "fmla v15.4s, v25.4s, v10.4s\n"
-    "ldr q28, [x20, %[input_col_stride1]]\n"
-    "fmla v5.4s, v19.4s, v14.4s\n"
-    "ldr q17, [x19, x26]\n"
-    "fmla v2.4s, v22.4s, v11.4s\n"
-    "ldr q18, [x9, x27]\n"
-    "fmla v16.4s, v26.4s, v7.4s\n"
-    "ldr q25, [x17, x28]\n"
-    "fmla v4.4s, v26.4s, v9.4s\n"
-    "ldr q22, [x16, x13]\n"
-    "fmla v2.4s, v26.4s, v13.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "str q16, [x21]\n"
-    "fmla v1.4s, v26.4s, v6.4s\n"
-    "fmla v4.4s, v20.4s, v11.4s\n"
-    "ldr q21, [x15, x14]\n"
-    "fmla v15.4s, v27.4s, v7.4s\n"
-    "ldr q23, [x20, x26]\n"
-    "fmla v5.4s, v27.4s, v9.4s\n"
-    "ldr q19, [x19, x27]\n"
-    "fmla v4.4s, v27.4s, v13.4s\n"
-    "add x15, x15, #16\n"
-    "str q15, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v3.4s, v27.4s, v6.4s\n"
-    "fmla v5.4s, v30.4s, v11.4s\n"
-    "ldr q29, [x9, x28]\n"
-    "fmla v2.4s, v28.4s, v8.4s\n"
-    "ldr q27, [x17, x13]\n"
-    "fmla v1.4s, v17.4s, v12.4s\n"
-    "ldr q28, [x16, x14]\n"
-    "fmla v5.4s, v24.4s, v13.4s\n"
-    "ldr q26, [x20, x27]\n"
-    "fmla v2.4s, v17.4s, v10.4s\n"
-    "ldr q20, [x19, x28]\n"
-    "fmla v4.4s, v18.4s, v8.4s\n"
-    "ldr q17, [x9, x13]\n"
-    "fmla v1.4s, v18.4s, v14.4s\n"
-    "ldr q18, [x17, x14]\n"
-    "fmla v3.4s, v25.4s, v12.4s\n"
-    "add x16, x16, #16\n"
-    "fmla v4.4s, v25.4s, v10.4s\n"
-    "ldr q16, [x20, x28]\n"
-    "fmla v5.4s, v22.4s, v8.4s\n"
-    "add x17, x17, #16\n"
-    "fmla v3.4s, v22.4s, v14.4s\n"
-    "ldr q15, [x19, x13]\n"
-    "fmla v2.4s, v23.4s, v7.4s\n"
-    "fmla v1.4s, v23.4s, v9.4s\n"
-    "fmla v5.4s, v21.4s, v10.4s\n"
-    "ldr q21, [x9, x14]\n"
-    "fmla v4.4s, v29.4s, v7.4s\n"
-    "ldr q23, [x20, x13]\n"
-    "str q2, [x22]\n"
-    "fmla v1.4s, v19.4s, v11.4s\n"
-    "fmla v3.4s, v29.4s, v9.4s\n"
-    "ldr q24, [x19, x14]\n"
-    "str q4, [x21, %[output_col_stride1]]\n"
-    "fmla v0.4s, v29.4s, v6.4s\n"
-    "fmla v1.4s, v29.4s, v13.4s\n"
-    "ldr q25, [x20, x14]\n"
-    "fmla v3.4s, v27.4s, v11.4s\n"
-    "add x9, x9, #16\n"
-    "fmla v5.4s, v28.4s, v7.4s\n"
-    "add x19, x19, #16\n"
-    "fmla v1.4s, v26.4s, v8.4s\n"
-    "add x20, x20, #16\n"
-    "fmla v3.4s, v28.4s, v13.4s\n"
-    "fmla v0.4s, v20.4s, v12.4s\n"
-    "str q5, [%[outptr0], x23]\n"
-    "fmla v1.4s, v20.4s, v10.4s\n"
-    "fmla v3.4s, v17.4s, v8.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v0.4s, v17.4s, v14.4s\n"
-    "fmla v1.4s, v16.4s, v7.4s\n"
-    "fmla v3.4s, v18.4s, v10.4s\n"
-    "fmla v0.4s, v16.4s, v9.4s\n"
-    "str q1, [x22, %[output_col_stride1]]\n"
-    "fmla v3.4s, v21.4s, v7.4s\n"
-    "fmla v0.4s, v15.4s, v11.4s\n"
-    "str q3, [x21, x23]\n"
-    "fmla v0.4s, v21.4s, v13.4s\n"
-    "add x21, x21, #16\n"
-    "fmla v0.4s, v23.4s, v8.4s\n"
-    "fmla v0.4s, v24.4s, v10.4s\n"
-    "fmla v0.4s, v25.4s, v7.4s\n"
-    "str q0, [x22, x23]\n"
-    "add x22, x22, #16\n"
-    "4:\n"
-    "cbz x24, 7f\n"
-    "ldr s27, [%[wbptr]]\n"
-    "mov v17.16b, v27.16b\n"
-    "ldr s6, [%[wbptr], #4]\n"
-    "mov v16.16b, v27.16b\n"
-    "ldr s14, [%[wbptr], #8]\n"
-    "mov v15.16b, v27.16b\n"
-    "ldr s13, [%[wbptr], #12]\n"
-    "mov v2.16b, v27.16b\n"
-    "ldr s12, [%[wbptr], #16]\n"
-    "mov v4.16b, v27.16b\n"
-    "ldr s11, [%[wbptr], #20]\n"
-    "mov v5.16b, v27.16b\n"
-    "ldr s10, [%[wbptr], #24]\n"
-    "mov v1.16b, v27.16b\n"
-    "ldr s9, [%[wbptr], #28]\n"
-    "mov v3.16b, v27.16b\n"
-    "ldr s8, [%[wbptr], #32]\n"
-    "mov v0.16b, v27.16b\n"
-    "ldr s7, [%[wbptr], #36]\n"
-    "ldr s29, [%[inptr0]]\n"
-    "subs x24, x24, #1\n"
-    "ldr s28, [x15]\n"
-    "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr s22, [x16]\n"
-    "ldr s20, [x15, %[input_col_stride1]]\n"
-    "ldr s19, [%[inptr0], x26]\n"
-    "ldr s30, [x17]\n"
-    "ldr s18, [x16, %[input_col_stride1]]\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v17.4s, v29.4s, v6.4s\n"
-    "ldr s21, [x15, x26]\n"
-    "fmla v16.4s, v22.4s, v6.4s\n"
-    "ldr s27, [%[inptr0], x27]\n"
-    "fmla v15.4s, v19.4s, v6.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v17.4s, v28.4s, v12.4s\n"
-    "ldr s25, [x9]\n"
-    "fmla v16.4s, v30.4s, v12.4s\n"
-    "ldr s24, [x17, %[input_col_stride1]]\n"
-    "fmla v15.4s, v21.4s, v12.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v17.4s, v26.4s, v14.4s\n"
-    "ldr s23, [x16, x26]\n"
-    "fmla v16.4s, v18.4s, v14.4s\n"
-    "subs x24, x24, #1\n"
-    "fmla v15.4s, v27.4s, v14.4s\n"
-    "ldr s26, [x15, x27]\n"
-    "fmla v17.4s, v22.4s, v9.4s\n"
-    "ldr s22, [%[inptr0], x28]\n"
-    "fmla v16.4s, v25.4s, v9.4s\n"
-    "fmla v2.4s, v25.4s, v6.4s\n"
-    "fmla v15.4s, v23.4s, v9.4s\n"
-    "ldr s30, [x19]\n"
-    "fmla v17.4s, v20.4s, v11.4s\n"
-    "ldr s29, [x9, %[input_col_stride1]]\n"
-    "fmla v16.4s, v24.4s, v11.4s\n"
-    "ldr s28, [x17, x26]\n"
-    "fmla v4.4s, v23.4s, v6.4s\n"
-    "fmla v15.4s, v26.4s, v11.4s\n"
-    "fmla v17.4s, v19.4s, v13.4s\n"
-    "ldr s24, [x16, x27]\n"
-    "fmla v16.4s, v23.4s, v13.4s\n"
-    "ldr s25, [x15, x28]\n"
-    "fmla v15.4s, v22.4s, v13.4s\n"
-    "fmla v5.4s, v22.4s, v6.4s\n"
-    "fmla v17.4s, v18.4s, v8.4s\n"
-    "ldr s19, [%[inptr0], x13]\n"
-    "fmla v2.4s, v30.4s, v12.4s\n"
-    "ldr s18, [x20]\n"
-    "fmla v16.4s, v29.4s, v8.4s\n"
-    "ldr s22, [x19, %[input_col_stride1]]\n"
-    "fmla v17.4s, v21.4s, v10.4s\n"
-    "ldr s26, [x9, x26]\n"
-    "fmla v2.4s, v29.4s, v14.4s\n"
-    "ldr s20, [x17, x27]\n"
-    "fmla v16.4s, v28.4s, v10.4s\n"
-    "fmla v4.4s, v28.4s, v12.4s\n"
-    "fmla v17.4s, v23.4s, v7.4s\n"
-    "ldr s27, [x16, x28]\n"
-    "fmla v15.4s, v24.4s, v8.4s\n"
-    "ldr s30, [x15, x13]\n"
-    "fmla v4.4s, v24.4s, v14.4s\n"
-    "ldr s24, [%[inptr0], x14]\n"
-    "str s17, [%[outptr0]]\n"
-    "fmla v5.4s, v25.4s, v12.4s\n"
-    "fmla v15.4s, v25.4s, v10.4s\n"
-    "ldr s28, [x20, %[input_col_stride1]]\n"
-    "fmla v2.4s, v18.4s, v9.4s\n"
-    "ldr s17, [x19, x26]\n"
-    "fmla v5.4s, v19.4s, v14.4s\n"
-    "ldr s18, [x9, x27]\n"
-    "fmla v16.4s, v26.4s, v7.4s\n"
-    "ldr s25, [x17, x28]\n"
-    "fmla v2.4s, v22.4s, v11.4s\n"
-    "ldr s22, [x16, x13]\n"
-    "fmla v4.4s, v26.4s, v9.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "str s16, [x21]\n"
-    "fmla v1.4s, v26.4s, v6.4s\n"
-    "fmla v2.4s, v26.4s, v13.4s\n"
-    "ldr s21, [x15, x14]\n"
-    "fmla v4.4s, v20.4s, v11.4s\n"
-    "ldr s23, [x20, x26]\n"
-    "fmla v15.4s, v27.4s, v7.4s\n"
-    "ldr s19, [x19, x27]\n"
-    "fmla v5.4s, v27.4s, v9.4s\n"
-    "add x15, x15, #4\n"
-    "fmla v4.4s, v27.4s, v13.4s\n"
-    "fmla v3.4s, v27.4s, v6.4s\n"
-    "str s15, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v2.4s, v28.4s, v8.4s\n"
-    "fmla v5.4s, v30.4s, v11.4s\n"
-    "ldr s29, [x9, x28]\n"
-    "fmla v1.4s, v17.4s, v12.4s\n"
-    "ldr s27, [x17, x13]\n"
-    "fmla v2.4s, v17.4s, v10.4s\n"
-    "ldr s28, [x16, x14]\n"
-    "fmla v5.4s, v24.4s, v13.4s\n"
-    "ldr s26, [x20, x27]\n"
-    "fmla v4.4s, v18.4s, v8.4s\n"
-    "ldr s20, [x19, x28]\n"
-    "fmla v1.4s, v18.4s, v14.4s\n"
-    "ldr s17, [x9, x13]\n"
-    "fmla v3.4s, v25.4s, v12.4s\n"
-    "ldr s18, [x17, x14]\n"
-    "fmla v4.4s, v25.4s, v10.4s\n"
-    "ldr s16, [x20, x28]\n"
-    "fmla v5.4s, v22.4s, v8.4s\n"
-    "add x16, x16, #4\n"
-    "fmla v3.4s, v22.4s, v14.4s\n"
-    "ldr s15, [x19, x13]\n"
-    "fmla v2.4s, v23.4s, v7.4s\n"
-    "add x17, x17, #4\n"
-    "fmla v5.4s, v21.4s, v10.4s\n"
-    "ldr s21, [x9, x14]\n"
-    "fmla v1.4s, v23.4s, v9.4s\n"
-    "ldr s23, [x20, x13]\n"
-    "str s2, [x22]\n"
-    "fmla v4.4s, v29.4s, v7.4s\n"
-    "fmla v3.4s, v29.4s, v9.4s\n"
-    "ldr s24, [x19, x14]\n"
-    "fmla v1.4s, v19.4s, v11.4s\n"
-    "ldr s25, [x20, x14]\n"
-    "str s4, [x21, %[output_col_stride1]]\n"
-    "fmla v0.4s, v29.4s, v6.4s\n"
-    "fmla v3.4s, v27.4s, v11.4s\n"
-    "ldr s27, [%[wbptr]]\n"
-    "fmla v1.4s, v29.4s, v13.4s\n"
-    "ldr s29, [%[inptr0]]\n"
-    "fmla v5.4s, v28.4s, v7.4s\n"
-    "ldr s6, [%[wbptr], #4]\n"
-    "fmla v3.4s, v28.4s, v13.4s\n"
-    "ldr s28, [x15]\n"
-    "fmla v1.4s, v26.4s, v8.4s\n"
-    "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
-    "str s5, [%[outptr0], x23]\n"
-    "fmla v0.4s, v20.4s, v12.4s\n"
-    "fmla v3.4s, v17.4s, v8.4s\n"
-    "ldr s22, [x16]\n"
-    "fmla v1.4s, v20.4s, v10.4s\n"
-    "ldr s20, [x15, %[input_col_stride1]]\n"
-    "fmla v0.4s, v17.4s, v14.4s\n"
-    "ldr s12, [%[wbptr], #16]\n"
-    "fmla v3.4s, v18.4s, v10.4s\n"
-    "ldr s19, [%[inptr0], x26]\n"
-    "fmla v1.4s, v16.4s, v7.4s\n"
-    "ldr s30, [x17]\n"
-    "fmla v0.4s, v16.4s, v9.4s\n"
-    "ldr s14, [%[wbptr], #8]\n"
-    "fmla v3.4s, v21.4s, v7.4s\n"
-    "ldr s18, [x16, %[input_col_stride1]]\n"
-    "str s1, [x22, %[output_col_stride1]]\n"
-    "mov v17.16b, v27.16b\n"
-    "fmla v0.4s, v15.4s, v11.4s\n"
-    "ldr s9, [%[wbptr], #28]\n"
-    "str s3, [x21, x23]\n"
-    "mov v16.16b, v27.16b\n"
-    "mov v15.16b, v27.16b\n"
-    "add x9, x9, #4\n"
-    "fmla v0.4s, v21.4s, v13.4s\n"
-    "ldr s11, [%[wbptr], #20]\n"
-    "mov v2.16b, v27.16b\n"
-    "add x19, x19, #4\n"
-    "mov v4.16b, v27.16b\n"
-    "add x20, x20, #4\n"
-    "fmla v0.4s, v23.4s, v8.4s\n"
-    "ldr s13, [%[wbptr], #12]\n"
-    "mov v5.16b, v27.16b\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "mov v1.16b, v27.16b\n"
-    "add x21, x21, #4\n"
-    "fmla v0.4s, v24.4s, v10.4s\n"
-    "ldr s8, [%[wbptr], #32]\n"
-    "mov v3.16b, v27.16b\n"
-    "fmla v0.4s, v25.4s, v7.4s\n"
-    "ldr s10, [%[wbptr], #24]\n"
-    "str s0, [x22, x23]\n"
-    "mov v0.16b, v27.16b\n"
-    "ldr s7, [%[wbptr], #36]\n"
-    "add x22, x22, #4\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v17.4s, v29.4s, v6.4s\n"
-    "ldr s21, [x15, x26]\n"
-    "fmla v16.4s, v22.4s, v6.4s\n"
-    "ldr s27, [%[inptr0], x27]\n"
-    "fmla v15.4s, v19.4s, v6.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v17.4s, v28.4s, v12.4s\n"
-    "ldr s25, [x9]\n"
-    "fmla v16.4s, v30.4s, v12.4s\n"
-    "ldr s24, [x17, %[input_col_stride1]]\n"
-    "fmla v15.4s, v21.4s, v12.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v17.4s, v26.4s, v14.4s\n"
-    "ldr s23, [x16, x26]\n"
-    "fmla v16.4s, v18.4s, v14.4s\n"
-    "fmla v2.4s, v25.4s, v6.4s\n"
-    "fmla v15.4s, v27.4s, v14.4s\n"
-    "ldr s26, [x15, x27]\n"
-    "fmla v17.4s, v22.4s, v9.4s\n"
-    "ldr s22, [%[inptr0], x28]\n"
-    "fmla v16.4s, v25.4s, v9.4s\n"
-    "ldr s30, [x19]\n"
-    "fmla v15.4s, v23.4s, v9.4s\n"
-    "fmla v4.4s, v23.4s, v6.4s\n"
-    "fmla v17.4s, v20.4s, v11.4s\n"
-    "ldr s29, [x9, %[input_col_stride1]]\n"
-    "fmla v16.4s, v24.4s, v11.4s\n"
-    "ldr s28, [x17, x26]\n"
-    "fmla v15.4s, v26.4s, v11.4s\n"
-    "ldr s24, [x16, x27]\n"
-    "fmla v17.4s, v19.4s, v13.4s\n"
-    "ldr s25, [x15, x28]\n"
-    "fmla v16.4s, v23.4s, v13.4s\n"
-    "fmla v5.4s, v22.4s, v6.4s\n"
-    "fmla v15.4s, v22.4s, v13.4s\n"
-    "ldr s19, [%[inptr0], x13]\n"
-    "fmla v17.4s, v18.4s, v8.4s\n"
-    "ldr s18, [x20]\n"
-    "fmla v2.4s, v30.4s, v12.4s\n"
-    "ldr s22, [x19, %[input_col_stride1]]\n"
-    "fmla v16.4s, v29.4s, v8.4s\n"
-    "fmla v4.4s, v28.4s, v12.4s\n"
-    "fmla v17.4s, v21.4s, v10.4s\n"
-    "ldr s26, [x9, x26]\n"
-    "fmla v2.4s, v29.4s, v14.4s\n"
-    "ldr s20, [x17, x27]\n"
-    "fmla v16.4s, v28.4s, v10.4s\n"
-    "ldr s27, [x16, x28]\n"
-    "fmla v17.4s, v23.4s, v7.4s\n"
-    "ldr s30, [x15, x13]\n"
-    "fmla v15.4s, v24.4s, v8.4s\n"
-    "fmla v4.4s, v24.4s, v14.4s\n"
-    "fmla v5.4s, v25.4s, v12.4s\n"
-    "ldr s24, [%[inptr0], x14]\n"
-    "str s17, [%[outptr0]]\n"
-    "fmla v2.4s, v18.4s, v9.4s\n"
-    "fmla v15.4s, v25.4s, v10.4s\n"
-    "ldr s28, [x20, %[input_col_stride1]]\n"
-    "fmla v5.4s, v19.4s, v14.4s\n"
-    "ldr s17, [x19, x26]\n"
-    "fmla v2.4s, v22.4s, v11.4s\n"
-    "ldr s18, [x9, x27]\n"
-    "fmla v16.4s, v26.4s, v7.4s\n"
-    "ldr s25, [x17, x28]\n"
-    "fmla v4.4s, v26.4s, v9.4s\n"
-    "ldr s22, [x16, x13]\n"
-    "fmla v2.4s, v26.4s, v13.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "str s16, [x21]\n"
-    "fmla v1.4s, v26.4s, v6.4s\n"
-    "fmla v4.4s, v20.4s, v11.4s\n"
-    "ldr s21, [x15, x14]\n"
-    "fmla v15.4s, v27.4s, v7.4s\n"
-    "ldr s23, [x20, x26]\n"
-    "fmla v5.4s, v27.4s, v9.4s\n"
-    "ldr s19, [x19, x27]\n"
-    "fmla v4.4s, v27.4s, v13.4s\n"
-    "add x15, x15, #4\n"
-    "str s15, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v3.4s, v27.4s, v6.4s\n"
-    "fmla v5.4s, v30.4s, v11.4s\n"
-    "ldr s29, [x9, x28]\n"
-    "fmla v2.4s, v28.4s, v8.4s\n"
-    "ldr s27, [x17, x13]\n"
-    "fmla v1.4s, v17.4s, v12.4s\n"
-    "ldr s28, [x16, x14]\n"
-    "fmla v5.4s, v24.4s, v13.4s\n"
-    "ldr s26, [x20, x27]\n"
-    "fmla v2.4s, v17.4s, v10.4s\n"
-    "ldr s20, [x19, x28]\n"
-    "fmla v4.4s, v18.4s, v8.4s\n"
-    "ldr s17, [x9, x13]\n"
-    "fmla v1.4s, v18.4s, v14.4s\n"
-    "ldr s18, [x17, x14]\n"
-    "fmla v3.4s, v25.4s, v12.4s\n"
-    "add x16, x16, #4\n"
-    "fmla v4.4s, v25.4s, v10.4s\n"
-    "ldr s16, [x20, x28]\n"
-    "fmla v5.4s, v22.4s, v8.4s\n"
-    "add x17, x17, #4\n"
-    "fmla v3.4s, v22.4s, v14.4s\n"
-    "ldr s15, [x19, x13]\n"
-    "fmla v2.4s, v23.4s, v7.4s\n"
-    "fmla v1.4s, v23.4s, v9.4s\n"
-    "fmla v5.4s, v21.4s, v10.4s\n"
-    "ldr s21, [x9, x14]\n"
-    "fmla v4.4s, v29.4s, v7.4s\n"
-    "ldr s23, [x20, x13]\n"
-    "str s2, [x22]\n"
-    "fmla v1.4s, v19.4s, v11.4s\n"
-    "fmla v3.4s, v29.4s, v9.4s\n"
-    "ldr s24, [x19, x14]\n"
-    "str s4, [x21, %[output_col_stride1]]\n"
-    "fmla v0.4s, v29.4s, v6.4s\n"
-    "fmla v1.4s, v29.4s, v13.4s\n"
-    "ldr s25, [x20, x14]\n"
-    "fmla v3.4s, v27.4s, v11.4s\n"
-    "add x9, x9, #4\n"
-    "fmla v5.4s, v28.4s, v7.4s\n"
-    "add x19, x19, #4\n"
-    "fmla v1.4s, v26.4s, v8.4s\n"
-    "add x20, x20, #4\n"
-    "fmla v3.4s, v28.4s, v13.4s\n"
-    "fmla v0.4s, v20.4s, v12.4s\n"
-    "str s5, [%[outptr0], x23]\n"
-    "fmla v1.4s, v20.4s, v10.4s\n"
-    "fmla v3.4s, v17.4s, v8.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v0.4s, v17.4s, v14.4s\n"
-    "fmla v1.4s, v16.4s, v7.4s\n"
-    "fmla v3.4s, v18.4s, v10.4s\n"
-    "fmla v0.4s, v16.4s, v9.4s\n"
-    "str s1, [x22, %[output_col_stride1]]\n"
-    "fmla v3.4s, v21.4s, v7.4s\n"
-    "fmla v0.4s, v15.4s, v11.4s\n"
-    "str s3, [x21, x23]\n"
-    "fmla v0.4s, v21.4s, v13.4s\n"
-    "add x21, x21, #4\n"
-    "fmla v0.4s, v23.4s, v8.4s\n"
-    "fmla v0.4s, v24.4s, v10.4s\n"
-    "fmla v0.4s, v25.4s, v7.4s\n"
-    "str s0, [x22, x23]\n"
-    "add x22, x22, #4\n"
-    "7:\n"
-    : [wbptr] "+r" (weight_bias_ptr), [inptr0] "+r" (input), [outptr0] "+r" (output)
-    : [n_channels] "r" ((long long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x13", "x14", "memory"
-  );
-}
-#endif  // __aarch64__
-
-template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
-
-}  // namespace depthwise
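Note: the hunk above removes the NEON path for DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>, i.e. a 3x3 output tile of a 3x3 kernel at stride 2, read from a 7x7 input patch per tile, with no activation (this is the ActivationFunction::None specialisation). The assembly vectorises over channels (four per iteration, with a scalar tail) and software-pipelines the loads; for orientation only, a minimal scalar sketch of the arithmetic follows. It assumes the per-channel packing implied by the wbptr loads in the scalar tail (10 floats per channel: bias, then w00..w22), strides in elements, and channels innermost; it is an illustration, not part of the patch or the library API.

    // Scalar sketch of one tile: 3x3 outputs, 3x3 kernel, stride 2.
    // Hypothetical helper, for illustration only.
    static void depthwise_3x3_s2_tile_ref(
        int n_channels,
        const float *wb,             // per channel: bias, then w00..w22
        const float *input,          // top-left of the 7x7 input patch
        unsigned int in_row_stride,  // strides in elements, channels innermost
        unsigned int in_col_stride,
        float *output,               // top-left of the 3x3 output tile
        unsigned int out_row_stride,
        unsigned int out_col_stride)
    {
      for (int c = 0; c < n_channels; c++)
      {
        for (int oi = 0; oi < 3; oi++)
        {
          for (int oj = 0; oj < 3; oj++)
          {
            float acc = wb[c * 10]; // start from the bias
            for (int ki = 0; ki < 3; ki++)
              for (int kj = 0; kj < 3; kj++)
                acc += wb[c * 10 + 1 + ki * 3 + kj]
                     * input[(2 * oi + ki) * in_row_stride + (2 * oj + kj) * in_col_stride + c];
            output[oi * out_row_stride + oj * out_col_stride + c] = acc;
          }
        }
      }
    }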
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
deleted file mode 100644
index 89d1f22..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ /dev/null
@@ -1,6018 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
-
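-// Note (an inferred reading, not stated in the patch): the template parameters
-// appear to be <OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-// StrideRows, StrideCols, TIn, TBias, TOut>, so Conv here emits 4x4 output
-// tiles from a 3x3 kernel at stride 1 and consumes a (4 + 3 - 1) x (4 + 3 - 1)
-// = 6x6 input patch per tile, matching the inptrs[6][6]/outptrs[4][4]
-// signature of the second overload further down.
-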
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x8, %[inptr0], %[input_row_stride]\n"
-    "add x15, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x23, %[outptr0], %[output_row_stride]\n"
-    "add x9, x8, %[input_row_stride]\n"
-    "add x16, x15, #64\n"
-    "add x17, x15, %[input_col_stride1]\n"
-    "add x10, x9, %[input_row_stride]\n"
-    "add x7, x17, #64\n"
-    "add x19, x17, %[input_col_stride1]\n"
-    "add x11, x10, %[input_row_stride]\n"
-    "add x20, x19, #64\n"
-    "add x21, x19, %[input_col_stride1]\n"
-    "add x12, x11, %[input_row_stride]\n"
-    "add x22, x21, #64\n"
-    "add x24, x23, %[output_row_stride]\n"
-    "add x25, x24, %[output_row_stride]\n"
-    "add x26, %[output_col_stride1], %[output_col_stride1]\n"
-    "and x13, %[n_channels], #3\n"
-    "add x27, x26, %[output_col_stride1]\n"
-    "lsr x14, %[n_channels], #2\n"
-    "cbz x14, 4f\n"
-    "1:\n"
-    "ldr q14, [%[wbptr]]\n"
-    "subs x14, x14, #1\n"
-    "mov v17.16b, v14.16b\n"
-    "ldr q12, [%[wbptr], #16]\n"
-    "mov v23.16b, v14.16b\n"
-    "ldr q11, [%[wbptr], #32]\n"
-    "mov v24.16b, v14.16b\n"
-    "ldr q10, [%[wbptr], #48]\n"
-    "mov v20.16b, v14.16b\n"
-    "ldr q9, [%[wbptr], #64]\n"
-    "mov v16.16b, v14.16b\n"
-    "ldr q8, [%[wbptr], #80]\n"
-    "mov v13.16b, v14.16b\n"
-    "ldr q7, [%[wbptr], #96]\n"
-    "mov v0.16b, v14.16b\n"
-    "ldr q6, [%[wbptr], #112]\n"
-    "mov v1.16b, v14.16b\n"
-    "ldr q5, [%[wbptr], #128]\n"
-    "mov v2.16b, v14.16b\n"
-    "ldr q4, [%[wbptr], #144]\n"
-    "mov v3.16b, v14.16b\n"
-    "ldr q29, [%[inptr0]]\n"
-    "fmla v17.4s, v29.4s, v12.4s\n"
-    "ldr q28, [x8]\n"
-    "ldr q30, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr q25, [x9]\n"
-    "ldr q26, [x8, %[input_col_stride1]]\n"
-    "ldr q27, [%[inptr0], x15]\n"
-    "ldr q15, [x10]\n"
-    "ldr q18, [x9, %[input_col_stride1]]\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "prfm pldl1keep, [x8, #64]\n"
-    "prfm pldl1keep, [%[inptr0], x28]\n"
-    "prfm pldl1keep, [x9, #64]\n"
-    "prfm pldl1keep, [x8, x28]\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "prfm pldl1keep, [x10, #64]\n"
-    "prfm pldl1keep, [x9, x28]\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v17.4s, v28.4s, v9.4s\n"
-    "prfm pldl1keep, [x8, x16]\n"
-    "fmla v23.4s, v28.4s, v12.4s\n"
-    "ldr q22, [x8, x15]\n"
-    "fmla v24.4s, v30.4s, v12.4s\n"
-    "prfm pldl1keep, [%[inptr0], x7]\n"
-    "fmla v17.4s, v30.4s, v11.4s\n"
-    "ldr q29, [%[inptr0], x17]\n"
-    "fmla v23.4s, v25.4s, v9.4s\n"
-    "prfm pldl1keep, [x11, #64]\n"
-    "fmla v20.4s, v25.4s, v12.4s\n"
-    "prfm pldl1keep, [x10, x28]\n"
-    "fmla v17.4s, v25.4s, v6.4s\n"
-    "ldr q25, [x11]\n"
-    "fmla v23.4s, v26.4s, v11.4s\n"
-    "prfm pldl1keep, [x9, x16]\n"
-    "fmla v24.4s, v26.4s, v9.4s\n"
-    "prfm pldl1keep, [x8, x7]\n"
-    "fmla v17.4s, v26.4s, v8.4s\n"
-    "prfm pldl1keep, [%[inptr0], x20]\n"
-    "fmla v16.4s, v26.4s, v12.4s\n"
-    "ldr q28, [x10, %[input_col_stride1]]\n"
-    "fmla v24.4s, v27.4s, v11.4s\n"
-    "prfm pldl1keep, [x12, #64]\n"
-    "fmla v17.4s, v27.4s, v10.4s\n"
-    "prfm pldl1keep, [x11, x28]\n"
-    "fmla v13.4s, v27.4s, v12.4s\n"
-    "ldr q19, [x9, x15]\n"
-    "fmla v23.4s, v15.4s, v6.4s\n"
-    "prfm pldl1keep, [x10, x16]\n"
-    "fmla v20.4s, v15.4s, v9.4s\n"
-    "prfm pldl1keep, [x9, x7]\n"
-    "fmla v0.4s, v15.4s, v12.4s\n"
-    "ldr q21, [x8, x17]\n"
-    "fmla v17.4s, v18.4s, v5.4s\n"
-    "prfm pldl1keep, [x8, x20]\n"
-    "fmla v23.4s, v18.4s, v8.4s\n"
-    "prfm pldl1keep, [%[inptr0], x22]\n"
-    "fmla v24.4s, v18.4s, v6.4s\n"
-    "prfm pldl1keep, [x12, x28]\n"
-    "fmla v20.4s, v18.4s, v11.4s\n"
-    "prfm pldl1keep, [x11, x16]\n"
-    "fmla v16.4s, v18.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x7]\n"
-    "fmla v1.4s, v18.4s, v12.4s\n"
-    "ldr q27, [%[inptr0], x19]\n"
-    "fmla v17.4s, v22.4s, v7.4s\n"
-    "prfm pldl1keep, [x9, x20]\n"
-    "fmla v23.4s, v22.4s, v10.4s\n"
-    "prfm pldl1keep, [x8, x22]\n"
-    "fmla v24.4s, v22.4s, v8.4s\n"
-    "prfm pldl1keep, [x12, x16]\n"
-    "fmla v16.4s, v22.4s, v11.4s\n"
-    "prfm pldl1keep, [x11, x7]\n"
-    "fmla v13.4s, v22.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x20]\n"
-    "fmla v2.4s, v22.4s, v12.4s\n"
-    "ldr q18, [x12]\n"
-    "fmla v24.4s, v29.4s, v10.4s\n"
-    "prfm pldl1keep, [x9, x22]\n"
-    "fmla v13.4s, v29.4s, v11.4s\n"
-    "prfm pldl1keep, [x12, x7]\n"
-    "fmla v3.4s, v29.4s, v12.4s\n"
-    "ldr q22, [x11, %[input_col_stride1]]\n"
-    "fmla v20.4s, v25.4s, v6.4s\n"
-    "prfm pldl1keep, [x11, x20]\n"
-    "fmla v0.4s, v25.4s, v9.4s\n"
-    "ldr q25, [x10, x15]\n"
-    "fmla v23.4s, v28.4s, v5.4s\n"
-    "prfm pldl1keep, [x10, x22]\n"
-    "fmla v20.4s, v28.4s, v8.4s\n"
-    "prfm pldl1keep, [x12, x20]\n"
-    "fmla v16.4s, v28.4s, v6.4s\n"
-    "prfm pldl1keep, [x11, x22]\n"
-    "fmla v0.4s, v28.4s, v11.4s\n"
-    "prfm pldl1keep, [x12, x22]\n"
-    "fmla v1.4s, v28.4s, v9.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v17.4s, v19.4s, v4.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v23.4s, v19.4s, v7.4s\n"
-    "subs x14, x14, #1\n"
-    "fmla v24.4s, v19.4s, v5.4s\n"
-    "fmla v20.4s, v19.4s, v10.4s\n"
-    "str q17, [%[outptr0]]\n"
-    "mov v15.16b, v14.16b\n"
-    "fmla v16.4s, v19.4s, v8.4s\n"
-    "fmla v13.4s, v19.4s, v6.4s\n"
-    "fmla v15.4s, v28.4s, v12.4s\n"
-    "ldr q29, [x9, x17]\n"
-    "fmla v1.4s, v19.4s, v11.4s\n"
-    "fmla v2.4s, v19.4s, v9.4s\n"
-    "fmla v24.4s, v21.4s, v7.4s\n"
-    "fmla v16.4s, v21.4s, v10.4s\n"
-    "fmla v13.4s, v21.4s, v8.4s\n"
-    "fmla v3.4s, v21.4s, v9.4s\n"
-    "fmla v2.4s, v21.4s, v11.4s\n"
-    "fmla v0.4s, v18.4s, v6.4s\n"
-    "mov v18.16b, v14.16b\n"
-    "fmla v20.4s, v22.4s, v5.4s\n"
-    "fmla v13.4s, v27.4s, v10.4s\n"
-    "fmla v3.4s, v27.4s, v11.4s\n"
-    "mov v17.16b, v14.16b\n"
-    "fmla v18.4s, v19.4s, v12.4s\n"
-    "mov v19.16b, v14.16b\n"
-    "fmla v0.4s, v22.4s, v8.4s\n"
-    "fmla v17.4s, v21.4s, v12.4s\n"
-    "ldr q26, [x8, x19]\n"
-    "fmla v1.4s, v22.4s, v6.4s\n"
-    "fmla v15.4s, v22.4s, v9.4s\n"
-    "mov v22.16b, v14.16b\n"
-    "mov v21.16b, v14.16b\n"
-    "fmla v23.4s, v25.4s, v4.4s\n"
-    "fmla v20.4s, v25.4s, v7.4s\n"
-    "fmla v16.4s, v25.4s, v5.4s\n"
-    "fmla v0.4s, v25.4s, v10.4s\n"
-    "fmla v1.4s, v25.4s, v8.4s\n"
-    "fmla v2.4s, v25.4s, v6.4s\n"
-    "str q23, [x23]\n"
-    "fmla v15.4s, v25.4s, v11.4s\n"
-    "fmla v18.4s, v25.4s, v9.4s\n"
-    "ldr q28, [%[inptr0], x21]\n"
-    "fmla v19.4s, v25.4s, v12.4s\n"
-    "ldr q30, [x12, %[input_col_stride1]]\n"
-    "fmla v24.4s, v29.4s, v4.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v16.4s, v29.4s, v7.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "fmla v13.4s, v29.4s, v5.4s\n"
-    "prfm pldl1keep, [%[inptr0], x28]\n"
-    "str q24, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v1.4s, v29.4s, v10.4s\n"
-    "fmla v2.4s, v29.4s, v8.4s\n"
-    "ldr q27, [x11, x15]\n"
-    "fmla v3.4s, v29.4s, v6.4s\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v18.4s, v29.4s, v11.4s\n"
-    "fmla v17.4s, v29.4s, v9.4s\n"
-    "fmla v22.4s, v29.4s, v12.4s\n"
-    "ldr q23, [x10, x17]\n"
-    "fmla v13.4s, v26.4s, v7.4s\n"
-    "fmla v2.4s, v26.4s, v10.4s\n"
-    "fmla v3.4s, v26.4s, v8.4s\n"
-    "fmla v17.4s, v26.4s, v11.4s\n"
-    "fmla v0.4s, v30.4s, v5.4s\n"
-    "ldr q24, [x9, x19]\n"
-    "fmla v15.4s, v30.4s, v6.4s\n"
-    "ldr q29, [x8, x21]\n"
-    "fmla v3.4s, v28.4s, v10.4s\n"
-    "ldr q14, [x12, x15]\n"
-    "fmla v20.4s, v27.4s, v4.4s\n"
-    "add x8, x8, #16\n"
-    "fmla v0.4s, v27.4s, v7.4s\n"
-    "prfm pldl1keep, [x8, #64]\n"
-    "fmla v1.4s, v27.4s, v5.4s\n"
-    "prfm pldl1keep, [x8, x28]\n"
-    "str q20, [x24]\n"
-    "fmla v15.4s, v27.4s, v8.4s\n"
-    "fmla v18.4s, v27.4s, v6.4s\n"
-    "ldr q25, [x11, x17]\n"
-    "fmla v19.4s, v27.4s, v9.4s\n"
-    "ldr q30, [x10, x19]\n"
-    "fmla v16.4s, v23.4s, v4.4s\n"
-    "fmla v1.4s, v23.4s, v7.4s\n"
-    "fmla v2.4s, v23.4s, v5.4s\n"
-    "fmla v15.4s, v23.4s, v10.4s\n"
-    "fmla v18.4s, v23.4s, v8.4s\n"
-    "fmla v17.4s, v23.4s, v6.4s\n"
-    "str q16, [x23, %[output_col_stride1]]\n"
-    "fmla v19.4s, v23.4s, v11.4s\n"
-    "fmla v22.4s, v23.4s, v9.4s\n"
-    "ldr q26, [x9, x21]\n"
-    "fmla v21.4s, v23.4s, v12.4s\n"
-    "ldr q27, [x12, x17]\n"
-    "fmla v13.4s, v24.4s, v4.4s\n"
-    "ldr q20, [x11, x19]\n"
-    "fmla v2.4s, v24.4s, v7.4s\n"
-    "add x9, x9, #16\n"
-    "fmla v3.4s, v24.4s, v5.4s\n"
-    "prfm pldl1keep, [x9, #64]\n"
-    "str q13, [%[outptr0], x26]\n"
-    "fmla v18.4s, v24.4s, v10.4s\n"
-    "fmla v17.4s, v24.4s, v8.4s\n"
-    "ldr q23, [x10, x21]\n"
-    "fmla v22.4s, v24.4s, v11.4s\n"
-    "ldr q24, [x12, x19]\n"
-    "fmla v3.4s, v29.4s, v7.4s\n"
-    "prfm pldl1keep, [x9, x28]\n"
-    "fmla v17.4s, v29.4s, v10.4s\n"
-    "ldr q16, [x11, x21]\n"
-    "fmla v0.4s, v14.4s, v4.4s\n"
-    "add x10, x10, #16\n"
-    "fmla v15.4s, v14.4s, v5.4s\n"
-    "prfm pldl1keep, [x10, #64]\n"
-    "fmla v19.4s, v14.4s, v6.4s\n"
-    "ldr q13, [x12, x21]\n"
-    "str q0, [x25]\n"
-    "fmla v1.4s, v25.4s, v4.4s\n"
-    "fmla v15.4s, v25.4s, v7.4s\n"
-    "ldr q14, [%[wbptr]]\n"
-    "fmla v18.4s, v25.4s, v5.4s\n"
-    "add x11, x11, #16\n"
-    "str q1, [x24, %[output_col_stride1]]\n"
-    "fmla v19.4s, v25.4s, v8.4s\n"
-    "fmla v22.4s, v25.4s, v6.4s\n"
-    "ldr q12, [%[wbptr], #16]\n"
-    "fmla v21.4s, v25.4s, v9.4s\n"
-    "ldr q29, [%[inptr0]]\n"
-    "fmla v2.4s, v30.4s, v4.4s\n"
-    "ldr q28, [x8]\n"
-    "fmla v18.4s, v30.4s, v7.4s\n"
-    "add x12, x12, #16\n"
-    "fmla v17.4s, v30.4s, v5.4s\n"
-    "fmla v19.4s, v30.4s, v10.4s\n"
-    "str q2, [x23, x26]\n"
-    "fmla v22.4s, v30.4s, v8.4s\n"
-    "fmla v21.4s, v30.4s, v11.4s\n"
-    "ldr q9, [%[wbptr], #64]\n"
-    "fmla v3.4s, v26.4s, v4.4s\n"
-    "ldr q30, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v17.4s, v26.4s, v7.4s\n"
-    "ldr q25, [x9]\n"
-    "fmla v22.4s, v26.4s, v10.4s\n"
-    "ldr q11, [%[wbptr], #32]\n"
-    "str q3, [%[outptr0], x27]\n"
-    "fmla v15.4s, v27.4s, v4.4s\n"
-    "fmla v19.4s, v27.4s, v5.4s\n"
-    "ldr q26, [x8, %[input_col_stride1]]\n"
-    "fmla v21.4s, v27.4s, v6.4s\n"
-    "ldr q27, [%[inptr0], x15]\n"
-    "str q15, [x25, %[output_col_stride1]]\n"
-    "fmla v18.4s, v20.4s, v4.4s\n"
-    "fmla v19.4s, v20.4s, v7.4s\n"
-    "ldr q15, [x10]\n"
-    "fmla v22.4s, v20.4s, v5.4s\n"
-    "ldr q6, [%[wbptr], #112]\n"
-    "str q18, [x24, x26]\n"
-    "fmla v21.4s, v20.4s, v8.4s\n"
-    "fmla v17.4s, v23.4s, v4.4s\n"
-    "ldr q18, [x9, %[input_col_stride1]]\n"
-    "fmla v22.4s, v23.4s, v7.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v21.4s, v23.4s, v10.4s\n"
-    "ldr q8, [%[wbptr], #80]\n"
-    "str q17, [x23, x27]\n"
-    "fmla v19.4s, v24.4s, v4.4s\n"
-    "fmla v22.4s, v16.4s, v4.4s\n"
-    "add x23, x23, #16\n"
-    "fmla v21.4s, v24.4s, v5.4s\n"
-    "ldr q10, [%[wbptr], #48]\n"
-    "str q19, [x25, x26]\n"
-    "mov v17.16b, v14.16b\n"
-    "str q22, [x24, x27]\n"
-    "mov v23.16b, v14.16b\n"
-    "fmla v21.4s, v16.4s, v7.4s\n"
-    "ldr q5, [%[wbptr], #128]\n"
-    "mov v24.16b, v14.16b\n"
-    "add x24, x24, #16\n"
-    "mov v20.16b, v14.16b\n"
-    "mov v16.16b, v14.16b\n"
-    "fmla v21.4s, v13.4s, v4.4s\n"
-    "ldr q7, [%[wbptr], #96]\n"
-    "mov v13.16b, v14.16b\n"
-    "mov v0.16b, v14.16b\n"
-    "mov v1.16b, v14.16b\n"
-    "mov v2.16b, v14.16b\n"
-    "str q21, [x25, x27]\n"
-    "mov v3.16b, v14.16b\n"
-    "ldr q4, [%[wbptr], #144]\n"
-    "add x25, x25, #16\n"
-    "fmla v17.4s, v29.4s, v12.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "fmla v17.4s, v28.4s, v9.4s\n"
-    "prfm pldl1keep, [x8, x16]\n"
-    "fmla v23.4s, v28.4s, v12.4s\n"
-    "ldr q22, [x8, x15]\n"
-    "fmla v24.4s, v30.4s, v12.4s\n"
-    "prfm pldl1keep, [%[inptr0], x7]\n"
-    "fmla v17.4s, v30.4s, v11.4s\n"
-    "ldr q29, [%[inptr0], x17]\n"
-    "fmla v23.4s, v25.4s, v9.4s\n"
-    "prfm pldl1keep, [x11, #64]\n"
-    "fmla v20.4s, v25.4s, v12.4s\n"
-    "prfm pldl1keep, [x10, x28]\n"
-    "fmla v17.4s, v25.4s, v6.4s\n"
-    "ldr q25, [x11]\n"
-    "fmla v23.4s, v26.4s, v11.4s\n"
-    "prfm pldl1keep, [x9, x16]\n"
-    "fmla v24.4s, v26.4s, v9.4s\n"
-    "prfm pldl1keep, [x8, x7]\n"
-    "fmla v17.4s, v26.4s, v8.4s\n"
-    "prfm pldl1keep, [%[inptr0], x20]\n"
-    "fmla v16.4s, v26.4s, v12.4s\n"
-    "ldr q28, [x10, %[input_col_stride1]]\n"
-    "fmla v24.4s, v27.4s, v11.4s\n"
-    "prfm pldl1keep, [x12, #64]\n"
-    "fmla v17.4s, v27.4s, v10.4s\n"
-    "prfm pldl1keep, [x11, x28]\n"
-    "fmla v13.4s, v27.4s, v12.4s\n"
-    "ldr q19, [x9, x15]\n"
-    "fmla v23.4s, v15.4s, v6.4s\n"
-    "prfm pldl1keep, [x10, x16]\n"
-    "fmla v20.4s, v15.4s, v9.4s\n"
-    "prfm pldl1keep, [x9, x7]\n"
-    "fmla v0.4s, v15.4s, v12.4s\n"
-    "ldr q21, [x8, x17]\n"
-    "fmla v17.4s, v18.4s, v5.4s\n"
-    "prfm pldl1keep, [x8, x20]\n"
-    "fmla v23.4s, v18.4s, v8.4s\n"
-    "prfm pldl1keep, [%[inptr0], x22]\n"
-    "fmla v24.4s, v18.4s, v6.4s\n"
-    "prfm pldl1keep, [x12, x28]\n"
-    "fmla v20.4s, v18.4s, v11.4s\n"
-    "prfm pldl1keep, [x11, x16]\n"
-    "fmla v16.4s, v18.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x7]\n"
-    "fmla v1.4s, v18.4s, v12.4s\n"
-    "ldr q27, [%[inptr0], x19]\n"
-    "fmla v17.4s, v22.4s, v7.4s\n"
-    "prfm pldl1keep, [x9, x20]\n"
-    "fmla v23.4s, v22.4s, v10.4s\n"
-    "prfm pldl1keep, [x8, x22]\n"
-    "fmla v24.4s, v22.4s, v8.4s\n"
-    "prfm pldl1keep, [x12, x16]\n"
-    "fmla v16.4s, v22.4s, v11.4s\n"
-    "prfm pldl1keep, [x11, x7]\n"
-    "fmla v13.4s, v22.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x20]\n"
-    "fmla v2.4s, v22.4s, v12.4s\n"
-    "ldr q18, [x12]\n"
-    "fmla v24.4s, v29.4s, v10.4s\n"
-    "prfm pldl1keep, [x9, x22]\n"
-    "fmla v13.4s, v29.4s, v11.4s\n"
-    "prfm pldl1keep, [x12, x7]\n"
-    "fmla v3.4s, v29.4s, v12.4s\n"
-    "ldr q22, [x11, %[input_col_stride1]]\n"
-    "fmla v20.4s, v25.4s, v6.4s\n"
-    "prfm pldl1keep, [x11, x20]\n"
-    "fmla v0.4s, v25.4s, v9.4s\n"
-    "ldr q25, [x10, x15]\n"
-    "fmla v23.4s, v28.4s, v5.4s\n"
-    "prfm pldl1keep, [x10, x22]\n"
-    "fmla v20.4s, v28.4s, v8.4s\n"
-    "prfm pldl1keep, [x12, x20]\n"
-    "fmla v16.4s, v28.4s, v6.4s\n"
-    "prfm pldl1keep, [x11, x22]\n"
-    "fmla v0.4s, v28.4s, v11.4s\n"
-    "prfm pldl1keep, [x12, x22]\n"
-    "fmla v1.4s, v28.4s, v9.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v17.4s, v19.4s, v4.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v23.4s, v19.4s, v7.4s\n"
-    "fmla v24.4s, v19.4s, v5.4s\n"
-    "fmla v20.4s, v19.4s, v10.4s\n"
-    "fmla v16.4s, v19.4s, v8.4s\n"
-    "str q17, [%[outptr0]]\n"
-    "mov v15.16b, v14.16b\n"
-    "fmla v13.4s, v19.4s, v6.4s\n"
-    "fmla v1.4s, v19.4s, v11.4s\n"
-    "fmla v15.4s, v28.4s, v12.4s\n"
-    "ldr q29, [x9, x17]\n"
-    "fmla v2.4s, v19.4s, v9.4s\n"
-    "fmla v24.4s, v21.4s, v7.4s\n"
-    "fmla v16.4s, v21.4s, v10.4s\n"
-    "fmla v13.4s, v21.4s, v8.4s\n"
-    "fmla v3.4s, v21.4s, v9.4s\n"
-    "fmla v0.4s, v18.4s, v6.4s\n"
-    "mov v18.16b, v14.16b\n"
-    "fmla v2.4s, v21.4s, v11.4s\n"
-    "fmla v13.4s, v27.4s, v10.4s\n"
-    "fmla v20.4s, v22.4s, v5.4s\n"
-    "fmla v18.4s, v19.4s, v12.4s\n"
-    "ldr q26, [x8, x19]\n"
-    "fmla v3.4s, v27.4s, v11.4s\n"
-    "ldr q28, [%[inptr0], x21]\n"
-    "fmla v0.4s, v22.4s, v8.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v1.4s, v22.4s, v6.4s\n"
-    "fmla v15.4s, v22.4s, v9.4s\n"
-    "mov v17.16b, v14.16b\n"
-    "fmla v23.4s, v25.4s, v4.4s\n"
-    "fmla v20.4s, v25.4s, v7.4s\n"
-    "fmla v16.4s, v25.4s, v5.4s\n"
-    "fmla v17.4s, v21.4s, v12.4s\n"
-    "ldr q30, [x12, %[input_col_stride1]]\n"
-    "str q23, [x23]\n"
-    "mov v19.16b, v14.16b\n"
-    "fmla v0.4s, v25.4s, v10.4s\n"
-    "fmla v1.4s, v25.4s, v8.4s\n"
-    "fmla v2.4s, v25.4s, v6.4s\n"
-    "fmla v15.4s, v25.4s, v11.4s\n"
-    "fmla v18.4s, v25.4s, v9.4s\n"
-    "fmla v19.4s, v25.4s, v12.4s\n"
-    "mov v22.16b, v14.16b\n"
-    "mov v21.16b, v14.16b\n"
-    "fmla v24.4s, v29.4s, v4.4s\n"
-    "fmla v16.4s, v29.4s, v7.4s\n"
-    "fmla v13.4s, v29.4s, v5.4s\n"
-    "fmla v1.4s, v29.4s, v10.4s\n"
-    "fmla v2.4s, v29.4s, v8.4s\n"
-    "fmla v3.4s, v29.4s, v6.4s\n"
-    "str q24, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v18.4s, v29.4s, v11.4s\n"
-    "fmla v17.4s, v29.4s, v9.4s\n"
-    "ldr q27, [x11, x15]\n"
-    "fmla v22.4s, v29.4s, v12.4s\n"
-    "ldr q23, [x10, x17]\n"
-    "fmla v13.4s, v26.4s, v7.4s\n"
-    "fmla v2.4s, v26.4s, v10.4s\n"
-    "fmla v3.4s, v26.4s, v8.4s\n"
-    "fmla v17.4s, v26.4s, v11.4s\n"
-    "fmla v0.4s, v30.4s, v5.4s\n"
-    "ldr q24, [x9, x19]\n"
-    "fmla v15.4s, v30.4s, v6.4s\n"
-    "ldr q29, [x8, x21]\n"
-    "fmla v3.4s, v28.4s, v10.4s\n"
-    "ldr q14, [x12, x15]\n"
-    "fmla v20.4s, v27.4s, v4.4s\n"
-    "add x8, x8, #16\n"
-    "fmla v0.4s, v27.4s, v7.4s\n"
-    "fmla v1.4s, v27.4s, v5.4s\n"
-    "fmla v15.4s, v27.4s, v8.4s\n"
-    "fmla v18.4s, v27.4s, v6.4s\n"
-    "str q20, [x24]\n"
-    "fmla v19.4s, v27.4s, v9.4s\n"
-    "fmla v16.4s, v23.4s, v4.4s\n"
-    "ldr q25, [x11, x17]\n"
-    "fmla v1.4s, v23.4s, v7.4s\n"
-    "ldr q30, [x10, x19]\n"
-    "fmla v2.4s, v23.4s, v5.4s\n"
-    "fmla v15.4s, v23.4s, v10.4s\n"
-    "str q16, [x23, %[output_col_stride1]]\n"
-    "fmla v18.4s, v23.4s, v8.4s\n"
-    "fmla v17.4s, v23.4s, v6.4s\n"
-    "ldr q26, [x9, x21]\n"
-    "fmla v19.4s, v23.4s, v11.4s\n"
-    "add x9, x9, #16\n"
-    "fmla v22.4s, v23.4s, v9.4s\n"
-    "fmla v21.4s, v23.4s, v12.4s\n"
-    "fmla v13.4s, v24.4s, v4.4s\n"
-    "ldr q27, [x12, x17]\n"
-    "fmla v2.4s, v24.4s, v7.4s\n"
-    "ldr q20, [x11, x19]\n"
-    "fmla v3.4s, v24.4s, v5.4s\n"
-    "fmla v18.4s, v24.4s, v10.4s\n"
-    "str q13, [%[outptr0], x26]\n"
-    "fmla v17.4s, v24.4s, v8.4s\n"
-    "fmla v22.4s, v24.4s, v11.4s\n"
-    "ldr q23, [x10, x21]\n"
-    "fmla v3.4s, v29.4s, v7.4s\n"
-    "ldr q24, [x12, x19]\n"
-    "fmla v17.4s, v29.4s, v10.4s\n"
-    "ldr q16, [x11, x21]\n"
-    "fmla v0.4s, v14.4s, v4.4s\n"
-    "add x10, x10, #16\n"
-    "fmla v15.4s, v14.4s, v5.4s\n"
-    "add x11, x11, #16\n"
-    "fmla v19.4s, v14.4s, v6.4s\n"
-    "ldr q13, [x12, x21]\n"
-    "str q0, [x25]\n"
-    "fmla v1.4s, v25.4s, v4.4s\n"
-    "fmla v15.4s, v25.4s, v7.4s\n"
-    "add x12, x12, #16\n"
-    "fmla v18.4s, v25.4s, v5.4s\n"
-    "fmla v19.4s, v25.4s, v8.4s\n"
-    "str q1, [x24, %[output_col_stride1]]\n"
-    "fmla v22.4s, v25.4s, v6.4s\n"
-    "fmla v21.4s, v25.4s, v9.4s\n"
-    "fmla v2.4s, v30.4s, v4.4s\n"
-    "fmla v18.4s, v30.4s, v7.4s\n"
-    "fmla v17.4s, v30.4s, v5.4s\n"
-    "fmla v19.4s, v30.4s, v10.4s\n"
-    "fmla v22.4s, v30.4s, v8.4s\n"
-    "str q2, [x23, x26]\n"
-    "fmla v21.4s, v30.4s, v11.4s\n"
-    "fmla v3.4s, v26.4s, v4.4s\n"
-    "fmla v17.4s, v26.4s, v7.4s\n"
-    "fmla v22.4s, v26.4s, v10.4s\n"
-    "fmla v15.4s, v27.4s, v4.4s\n"
-    "fmla v19.4s, v27.4s, v5.4s\n"
-    "fmla v21.4s, v27.4s, v6.4s\n"
-    "str q3, [%[outptr0], x27]\n"
-    "fmla v18.4s, v20.4s, v4.4s\n"
-    "str q15, [x25, %[output_col_stride1]]\n"
-    "fmla v22.4s, v20.4s, v5.4s\n"
-    "fmla v19.4s, v20.4s, v7.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "str q18, [x24, x26]\n"
-    "fmla v21.4s, v20.4s, v8.4s\n"
-    "fmla v17.4s, v23.4s, v4.4s\n"
-    "fmla v22.4s, v23.4s, v7.4s\n"
-    "fmla v19.4s, v24.4s, v4.4s\n"
-    "fmla v21.4s, v23.4s, v10.4s\n"
-    "str q17, [x23, x27]\n"
-    "fmla v22.4s, v16.4s, v4.4s\n"
-    "str q19, [x25, x26]\n"
-    "add x23, x23, #16\n"
-    "fmla v21.4s, v24.4s, v5.4s\n"
-    "str q22, [x24, x27]\n"
-    "add x24, x24, #16\n"
-    "fmla v21.4s, v16.4s, v7.4s\n"
-    "fmla v21.4s, v13.4s, v4.4s\n"
-    "str q21, [x25, x27]\n"
-    "add x25, x25, #16\n"
-    "4:\n"
-    "cbz x13, 7f\n"
-    "ldr s14, [%[wbptr]]\n"
-    "mov v17.16b, v14.16b\n"
-    "ldr s12, [%[wbptr], #4]\n"
-    "mov v23.16b, v14.16b\n"
-    "ldr s11, [%[wbptr], #8]\n"
-    "mov v24.16b, v14.16b\n"
-    "ldr s10, [%[wbptr], #12]\n"
-    "mov v20.16b, v14.16b\n"
-    "ldr s9, [%[wbptr], #16]\n"
-    "mov v16.16b, v14.16b\n"
-    "ldr s8, [%[wbptr], #20]\n"
-    "mov v13.16b, v14.16b\n"
-    "ldr s7, [%[wbptr], #24]\n"
-    "mov v0.16b, v14.16b\n"
-    "ldr s6, [%[wbptr], #28]\n"
-    "mov v1.16b, v14.16b\n"
-    "ldr s5, [%[wbptr], #32]\n"
-    "mov v2.16b, v14.16b\n"
-    "ldr s4, [%[wbptr], #36]\n"
-    "mov v3.16b, v14.16b\n"
-    "ldr s29, [%[inptr0]]\n"
-    "fmla v17.4s, v29.4s, v12.4s\n"
-    "ldr s28, [x8]\n"
-    "ldr s30, [%[inptr0], %[input_col_stride1]]\n"
-    "subs x13, x13, #1\n"
-    "ldr s25, [x9]\n"
-    "ldr s26, [x8, %[input_col_stride1]]\n"
-    "ldr s27, [%[inptr0], x15]\n"
-    "ldr s15, [x10]\n"
-    "ldr s18, [x9, %[input_col_stride1]]\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "prfm pldl1keep, [x8, #64]\n"
-    "prfm pldl1keep, [%[inptr0], x28]\n"
-    "prfm pldl1keep, [x9, #64]\n"
-    "prfm pldl1keep, [x8, x28]\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "prfm pldl1keep, [x10, #64]\n"
-    "prfm pldl1keep, [x9, x28]\n"
-    "beq 6f\n"
-    "5:\n"
-    "fmla v17.4s, v28.4s, v9.4s\n"
-    "prfm pldl1keep, [x8, x16]\n"
-    "fmla v23.4s, v28.4s, v12.4s\n"
-    "ldr s22, [x8, x15]\n"
-    "fmla v24.4s, v30.4s, v12.4s\n"
-    "prfm pldl1keep, [%[inptr0], x7]\n"
-    "fmla v17.4s, v30.4s, v11.4s\n"
-    "ldr s29, [%[inptr0], x17]\n"
-    "fmla v23.4s, v25.4s, v9.4s\n"
-    "prfm pldl1keep, [x11, #64]\n"
-    "fmla v20.4s, v25.4s, v12.4s\n"
-    "prfm pldl1keep, [x10, x28]\n"
-    "fmla v17.4s, v25.4s, v6.4s\n"
-    "ldr s25, [x11]\n"
-    "fmla v23.4s, v26.4s, v11.4s\n"
-    "prfm pldl1keep, [x9, x16]\n"
-    "fmla v24.4s, v26.4s, v9.4s\n"
-    "prfm pldl1keep, [x8, x7]\n"
-    "fmla v17.4s, v26.4s, v8.4s\n"
-    "prfm pldl1keep, [%[inptr0], x20]\n"
-    "fmla v16.4s, v26.4s, v12.4s\n"
-    "ldr s28, [x10, %[input_col_stride1]]\n"
-    "fmla v24.4s, v27.4s, v11.4s\n"
-    "prfm pldl1keep, [x12, #64]\n"
-    "fmla v17.4s, v27.4s, v10.4s\n"
-    "prfm pldl1keep, [x11, x28]\n"
-    "fmla v13.4s, v27.4s, v12.4s\n"
-    "ldr s19, [x9, x15]\n"
-    "fmla v23.4s, v15.4s, v6.4s\n"
-    "prfm pldl1keep, [x10, x16]\n"
-    "fmla v20.4s, v15.4s, v9.4s\n"
-    "prfm pldl1keep, [x9, x7]\n"
-    "fmla v0.4s, v15.4s, v12.4s\n"
-    "ldr s21, [x8, x17]\n"
-    "fmla v17.4s, v18.4s, v5.4s\n"
-    "prfm pldl1keep, [x8, x20]\n"
-    "fmla v23.4s, v18.4s, v8.4s\n"
-    "prfm pldl1keep, [%[inptr0], x22]\n"
-    "fmla v24.4s, v18.4s, v6.4s\n"
-    "prfm pldl1keep, [x12, x28]\n"
-    "fmla v20.4s, v18.4s, v11.4s\n"
-    "prfm pldl1keep, [x11, x16]\n"
-    "fmla v16.4s, v18.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x7]\n"
-    "fmla v1.4s, v18.4s, v12.4s\n"
-    "ldr s27, [%[inptr0], x19]\n"
-    "fmla v17.4s, v22.4s, v7.4s\n"
-    "prfm pldl1keep, [x9, x20]\n"
-    "fmla v23.4s, v22.4s, v10.4s\n"
-    "prfm pldl1keep, [x8, x22]\n"
-    "fmla v24.4s, v22.4s, v8.4s\n"
-    "prfm pldl1keep, [x12, x16]\n"
-    "fmla v16.4s, v22.4s, v11.4s\n"
-    "prfm pldl1keep, [x11, x7]\n"
-    "fmla v13.4s, v22.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x20]\n"
-    "fmla v2.4s, v22.4s, v12.4s\n"
-    "ldr s18, [x12]\n"
-    "fmla v24.4s, v29.4s, v10.4s\n"
-    "prfm pldl1keep, [x9, x22]\n"
-    "fmla v13.4s, v29.4s, v11.4s\n"
-    "prfm pldl1keep, [x12, x7]\n"
-    "fmla v3.4s, v29.4s, v12.4s\n"
-    "ldr s22, [x11, %[input_col_stride1]]\n"
-    "fmla v20.4s, v25.4s, v6.4s\n"
-    "prfm pldl1keep, [x11, x20]\n"
-    "fmla v0.4s, v25.4s, v9.4s\n"
-    "ldr s25, [x10, x15]\n"
-    "fmla v23.4s, v28.4s, v5.4s\n"
-    "prfm pldl1keep, [x10, x22]\n"
-    "fmla v20.4s, v28.4s, v8.4s\n"
-    "prfm pldl1keep, [x12, x20]\n"
-    "fmla v16.4s, v28.4s, v6.4s\n"
-    "prfm pldl1keep, [x11, x22]\n"
-    "fmla v0.4s, v28.4s, v11.4s\n"
-    "prfm pldl1keep, [x12, x22]\n"
-    "fmla v1.4s, v28.4s, v9.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v17.4s, v19.4s, v4.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v23.4s, v19.4s, v7.4s\n"
-    "subs x13, x13, #1\n"
-    "fmla v24.4s, v19.4s, v5.4s\n"
-    "fmla v20.4s, v19.4s, v10.4s\n"
-    "str s17, [%[outptr0]]\n"
-    "mov v15.16b, v14.16b\n"
-    "fmla v16.4s, v19.4s, v8.4s\n"
-    "fmla v13.4s, v19.4s, v6.4s\n"
-    "fmla v15.4s, v28.4s, v12.4s\n"
-    "ldr s29, [x9, x17]\n"
-    "fmla v1.4s, v19.4s, v11.4s\n"
-    "fmla v2.4s, v19.4s, v9.4s\n"
-    "fmla v24.4s, v21.4s, v7.4s\n"
-    "fmla v16.4s, v21.4s, v10.4s\n"
-    "fmla v13.4s, v21.4s, v8.4s\n"
-    "fmla v3.4s, v21.4s, v9.4s\n"
-    "fmla v2.4s, v21.4s, v11.4s\n"
-    "fmla v0.4s, v18.4s, v6.4s\n"
-    "mov v18.16b, v14.16b\n"
-    "fmla v20.4s, v22.4s, v5.4s\n"
-    "fmla v13.4s, v27.4s, v10.4s\n"
-    "fmla v3.4s, v27.4s, v11.4s\n"
-    "mov v17.16b, v14.16b\n"
-    "fmla v18.4s, v19.4s, v12.4s\n"
-    "mov v19.16b, v14.16b\n"
-    "fmla v0.4s, v22.4s, v8.4s\n"
-    "fmla v17.4s, v21.4s, v12.4s\n"
-    "ldr s26, [x8, x19]\n"
-    "fmla v1.4s, v22.4s, v6.4s\n"
-    "fmla v15.4s, v22.4s, v9.4s\n"
-    "mov v22.16b, v14.16b\n"
-    "mov v21.16b, v14.16b\n"
-    "fmla v23.4s, v25.4s, v4.4s\n"
-    "fmla v20.4s, v25.4s, v7.4s\n"
-    "fmla v16.4s, v25.4s, v5.4s\n"
-    "fmla v0.4s, v25.4s, v10.4s\n"
-    "fmla v1.4s, v25.4s, v8.4s\n"
-    "fmla v2.4s, v25.4s, v6.4s\n"
-    "str s23, [x23]\n"
-    "fmla v15.4s, v25.4s, v11.4s\n"
-    "fmla v18.4s, v25.4s, v9.4s\n"
-    "ldr s28, [%[inptr0], x21]\n"
-    "fmla v19.4s, v25.4s, v12.4s\n"
-    "ldr s30, [x12, %[input_col_stride1]]\n"
-    "fmla v24.4s, v29.4s, v4.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v16.4s, v29.4s, v7.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "fmla v13.4s, v29.4s, v5.4s\n"
-    "prfm pldl1keep, [%[inptr0], x28]\n"
-    "str s24, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v1.4s, v29.4s, v10.4s\n"
-    "fmla v2.4s, v29.4s, v8.4s\n"
-    "ldr s27, [x11, x15]\n"
-    "fmla v3.4s, v29.4s, v6.4s\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v18.4s, v29.4s, v11.4s\n"
-    "fmla v17.4s, v29.4s, v9.4s\n"
-    "fmla v22.4s, v29.4s, v12.4s\n"
-    "ldr s23, [x10, x17]\n"
-    "fmla v13.4s, v26.4s, v7.4s\n"
-    "fmla v2.4s, v26.4s, v10.4s\n"
-    "fmla v3.4s, v26.4s, v8.4s\n"
-    "fmla v17.4s, v26.4s, v11.4s\n"
-    "fmla v0.4s, v30.4s, v5.4s\n"
-    "ldr s24, [x9, x19]\n"
-    "fmla v15.4s, v30.4s, v6.4s\n"
-    "ldr s29, [x8, x21]\n"
-    "fmla v3.4s, v28.4s, v10.4s\n"
-    "ldr s14, [x12, x15]\n"
-    "fmla v20.4s, v27.4s, v4.4s\n"
-    "add x8, x8, #4\n"
-    "fmla v0.4s, v27.4s, v7.4s\n"
-    "prfm pldl1keep, [x8, #64]\n"
-    "fmla v1.4s, v27.4s, v5.4s\n"
-    "prfm pldl1keep, [x8, x28]\n"
-    "str s20, [x24]\n"
-    "fmla v15.4s, v27.4s, v8.4s\n"
-    "fmla v18.4s, v27.4s, v6.4s\n"
-    "ldr s25, [x11, x17]\n"
-    "fmla v19.4s, v27.4s, v9.4s\n"
-    "ldr s30, [x10, x19]\n"
-    "fmla v16.4s, v23.4s, v4.4s\n"
-    "fmla v1.4s, v23.4s, v7.4s\n"
-    "fmla v2.4s, v23.4s, v5.4s\n"
-    "fmla v15.4s, v23.4s, v10.4s\n"
-    "fmla v18.4s, v23.4s, v8.4s\n"
-    "fmla v17.4s, v23.4s, v6.4s\n"
-    "str s16, [x23, %[output_col_stride1]]\n"
-    "fmla v19.4s, v23.4s, v11.4s\n"
-    "fmla v22.4s, v23.4s, v9.4s\n"
-    "ldr s26, [x9, x21]\n"
-    "fmla v21.4s, v23.4s, v12.4s\n"
-    "ldr s27, [x12, x17]\n"
-    "fmla v13.4s, v24.4s, v4.4s\n"
-    "ldr s20, [x11, x19]\n"
-    "fmla v2.4s, v24.4s, v7.4s\n"
-    "add x9, x9, #4\n"
-    "fmla v3.4s, v24.4s, v5.4s\n"
-    "prfm pldl1keep, [x9, #64]\n"
-    "str s13, [%[outptr0], x26]\n"
-    "fmla v18.4s, v24.4s, v10.4s\n"
-    "fmla v17.4s, v24.4s, v8.4s\n"
-    "ldr s23, [x10, x21]\n"
-    "fmla v22.4s, v24.4s, v11.4s\n"
-    "ldr s24, [x12, x19]\n"
-    "fmla v3.4s, v29.4s, v7.4s\n"
-    "prfm pldl1keep, [x9, x28]\n"
-    "fmla v17.4s, v29.4s, v10.4s\n"
-    "ldr s16, [x11, x21]\n"
-    "fmla v0.4s, v14.4s, v4.4s\n"
-    "add x10, x10, #4\n"
-    "fmla v15.4s, v14.4s, v5.4s\n"
-    "prfm pldl1keep, [x10, #64]\n"
-    "fmla v19.4s, v14.4s, v6.4s\n"
-    "ldr s13, [x12, x21]\n"
-    "str s0, [x25]\n"
-    "fmla v1.4s, v25.4s, v4.4s\n"
-    "fmla v15.4s, v25.4s, v7.4s\n"
-    "ldr s14, [%[wbptr]]\n"
-    "fmla v18.4s, v25.4s, v5.4s\n"
-    "add x11, x11, #4\n"
-    "str s1, [x24, %[output_col_stride1]]\n"
-    "fmla v19.4s, v25.4s, v8.4s\n"
-    "fmla v22.4s, v25.4s, v6.4s\n"
-    "ldr s12, [%[wbptr], #4]\n"
-    "fmla v21.4s, v25.4s, v9.4s\n"
-    "ldr s29, [%[inptr0]]\n"
-    "fmla v2.4s, v30.4s, v4.4s\n"
-    "ldr s28, [x8]\n"
-    "fmla v18.4s, v30.4s, v7.4s\n"
-    "add x12, x12, #4\n"
-    "fmla v17.4s, v30.4s, v5.4s\n"
-    "fmla v19.4s, v30.4s, v10.4s\n"
-    "str s2, [x23, x26]\n"
-    "fmla v22.4s, v30.4s, v8.4s\n"
-    "fmla v21.4s, v30.4s, v11.4s\n"
-    "ldr s9, [%[wbptr], #16]\n"
-    "fmla v3.4s, v26.4s, v4.4s\n"
-    "ldr s30, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v17.4s, v26.4s, v7.4s\n"
-    "ldr s25, [x9]\n"
-    "fmla v22.4s, v26.4s, v10.4s\n"
-    "ldr s11, [%[wbptr], #8]\n"
-    "str s3, [%[outptr0], x27]\n"
-    "fmla v15.4s, v27.4s, v4.4s\n"
-    "fmla v19.4s, v27.4s, v5.4s\n"
-    "ldr s26, [x8, %[input_col_stride1]]\n"
-    "fmla v21.4s, v27.4s, v6.4s\n"
-    "ldr s27, [%[inptr0], x15]\n"
-    "str s15, [x25, %[output_col_stride1]]\n"
-    "fmla v18.4s, v20.4s, v4.4s\n"
-    "fmla v19.4s, v20.4s, v7.4s\n"
-    "ldr s15, [x10]\n"
-    "fmla v22.4s, v20.4s, v5.4s\n"
-    "ldr s6, [%[wbptr], #28]\n"
-    "str s18, [x24, x26]\n"
-    "fmla v21.4s, v20.4s, v8.4s\n"
-    "fmla v17.4s, v23.4s, v4.4s\n"
-    "ldr s18, [x9, %[input_col_stride1]]\n"
-    "fmla v22.4s, v23.4s, v7.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v21.4s, v23.4s, v10.4s\n"
-    "ldr s8, [%[wbptr], #20]\n"
-    "str s17, [x23, x27]\n"
-    "fmla v19.4s, v24.4s, v4.4s\n"
-    "fmla v22.4s, v16.4s, v4.4s\n"
-    "add x23, x23, #4\n"
-    "fmla v21.4s, v24.4s, v5.4s\n"
-    "ldr s10, [%[wbptr], #12]\n"
-    "str s19, [x25, x26]\n"
-    "mov v17.16b, v14.16b\n"
-    "str s22, [x24, x27]\n"
-    "mov v23.16b, v14.16b\n"
-    "fmla v21.4s, v16.4s, v7.4s\n"
-    "ldr s5, [%[wbptr], #32]\n"
-    "mov v24.16b, v14.16b\n"
-    "add x24, x24, #4\n"
-    "mov v20.16b, v14.16b\n"
-    "mov v16.16b, v14.16b\n"
-    "fmla v21.4s, v13.4s, v4.4s\n"
-    "ldr s7, [%[wbptr], #24]\n"
-    "mov v13.16b, v14.16b\n"
-    "mov v0.16b, v14.16b\n"
-    "mov v1.16b, v14.16b\n"
-    "mov v2.16b, v14.16b\n"
-    "str s21, [x25, x27]\n"
-    "mov v3.16b, v14.16b\n"
-    "ldr s4, [%[wbptr], #36]\n"
-    "add x25, x25, #4\n"
-    "fmla v17.4s, v29.4s, v12.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "fmla v17.4s, v28.4s, v9.4s\n"
-    "prfm pldl1keep, [x8, x16]\n"
-    "fmla v23.4s, v28.4s, v12.4s\n"
-    "ldr s22, [x8, x15]\n"
-    "fmla v24.4s, v30.4s, v12.4s\n"
-    "prfm pldl1keep, [%[inptr0], x7]\n"
-    "fmla v17.4s, v30.4s, v11.4s\n"
-    "ldr s29, [%[inptr0], x17]\n"
-    "fmla v23.4s, v25.4s, v9.4s\n"
-    "prfm pldl1keep, [x11, #64]\n"
-    "fmla v20.4s, v25.4s, v12.4s\n"
-    "prfm pldl1keep, [x10, x28]\n"
-    "fmla v17.4s, v25.4s, v6.4s\n"
-    "ldr s25, [x11]\n"
-    "fmla v23.4s, v26.4s, v11.4s\n"
-    "prfm pldl1keep, [x9, x16]\n"
-    "fmla v24.4s, v26.4s, v9.4s\n"
-    "prfm pldl1keep, [x8, x7]\n"
-    "fmla v17.4s, v26.4s, v8.4s\n"
-    "prfm pldl1keep, [%[inptr0], x20]\n"
-    "fmla v16.4s, v26.4s, v12.4s\n"
-    "ldr s28, [x10, %[input_col_stride1]]\n"
-    "fmla v24.4s, v27.4s, v11.4s\n"
-    "prfm pldl1keep, [x12, #64]\n"
-    "fmla v17.4s, v27.4s, v10.4s\n"
-    "prfm pldl1keep, [x11, x28]\n"
-    "fmla v13.4s, v27.4s, v12.4s\n"
-    "ldr s19, [x9, x15]\n"
-    "fmla v23.4s, v15.4s, v6.4s\n"
-    "prfm pldl1keep, [x10, x16]\n"
-    "fmla v20.4s, v15.4s, v9.4s\n"
-    "prfm pldl1keep, [x9, x7]\n"
-    "fmla v0.4s, v15.4s, v12.4s\n"
-    "ldr s21, [x8, x17]\n"
-    "fmla v17.4s, v18.4s, v5.4s\n"
-    "prfm pldl1keep, [x8, x20]\n"
-    "fmla v23.4s, v18.4s, v8.4s\n"
-    "prfm pldl1keep, [%[inptr0], x22]\n"
-    "fmla v24.4s, v18.4s, v6.4s\n"
-    "prfm pldl1keep, [x12, x28]\n"
-    "fmla v20.4s, v18.4s, v11.4s\n"
-    "prfm pldl1keep, [x11, x16]\n"
-    "fmla v16.4s, v18.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x7]\n"
-    "fmla v1.4s, v18.4s, v12.4s\n"
-    "ldr s27, [%[inptr0], x19]\n"
-    "fmla v17.4s, v22.4s, v7.4s\n"
-    "prfm pldl1keep, [x9, x20]\n"
-    "fmla v23.4s, v22.4s, v10.4s\n"
-    "prfm pldl1keep, [x8, x22]\n"
-    "fmla v24.4s, v22.4s, v8.4s\n"
-    "prfm pldl1keep, [x12, x16]\n"
-    "fmla v16.4s, v22.4s, v11.4s\n"
-    "prfm pldl1keep, [x11, x7]\n"
-    "fmla v13.4s, v22.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x20]\n"
-    "fmla v2.4s, v22.4s, v12.4s\n"
-    "ldr s18, [x12]\n"
-    "fmla v24.4s, v29.4s, v10.4s\n"
-    "prfm pldl1keep, [x9, x22]\n"
-    "fmla v13.4s, v29.4s, v11.4s\n"
-    "prfm pldl1keep, [x12, x7]\n"
-    "fmla v3.4s, v29.4s, v12.4s\n"
-    "ldr s22, [x11, %[input_col_stride1]]\n"
-    "fmla v20.4s, v25.4s, v6.4s\n"
-    "prfm pldl1keep, [x11, x20]\n"
-    "fmla v0.4s, v25.4s, v9.4s\n"
-    "ldr s25, [x10, x15]\n"
-    "fmla v23.4s, v28.4s, v5.4s\n"
-    "prfm pldl1keep, [x10, x22]\n"
-    "fmla v20.4s, v28.4s, v8.4s\n"
-    "prfm pldl1keep, [x12, x20]\n"
-    "fmla v16.4s, v28.4s, v6.4s\n"
-    "prfm pldl1keep, [x11, x22]\n"
-    "fmla v0.4s, v28.4s, v11.4s\n"
-    "prfm pldl1keep, [x12, x22]\n"
-    "fmla v1.4s, v28.4s, v9.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v17.4s, v19.4s, v4.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v23.4s, v19.4s, v7.4s\n"
-    "fmla v24.4s, v19.4s, v5.4s\n"
-    "fmla v20.4s, v19.4s, v10.4s\n"
-    "fmla v16.4s, v19.4s, v8.4s\n"
-    "str s17, [%[outptr0]]\n"
-    "mov v15.16b, v14.16b\n"
-    "fmla v13.4s, v19.4s, v6.4s\n"
-    "fmla v1.4s, v19.4s, v11.4s\n"
-    "fmla v15.4s, v28.4s, v12.4s\n"
-    "ldr s29, [x9, x17]\n"
-    "fmla v2.4s, v19.4s, v9.4s\n"
-    "fmla v24.4s, v21.4s, v7.4s\n"
-    "fmla v16.4s, v21.4s, v10.4s\n"
-    "fmla v13.4s, v21.4s, v8.4s\n"
-    "fmla v3.4s, v21.4s, v9.4s\n"
-    "fmla v0.4s, v18.4s, v6.4s\n"
-    "mov v18.16b, v14.16b\n"
-    "fmla v2.4s, v21.4s, v11.4s\n"
-    "fmla v13.4s, v27.4s, v10.4s\n"
-    "fmla v20.4s, v22.4s, v5.4s\n"
-    "fmla v18.4s, v19.4s, v12.4s\n"
-    "ldr s26, [x8, x19]\n"
-    "fmla v3.4s, v27.4s, v11.4s\n"
-    "ldr s28, [%[inptr0], x21]\n"
-    "fmla v0.4s, v22.4s, v8.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v1.4s, v22.4s, v6.4s\n"
-    "fmla v15.4s, v22.4s, v9.4s\n"
-    "mov v17.16b, v14.16b\n"
-    "fmla v23.4s, v25.4s, v4.4s\n"
-    "fmla v20.4s, v25.4s, v7.4s\n"
-    "fmla v16.4s, v25.4s, v5.4s\n"
-    "fmla v17.4s, v21.4s, v12.4s\n"
-    "ldr s30, [x12, %[input_col_stride1]]\n"
-    "str s23, [x23]\n"
-    "mov v19.16b, v14.16b\n"
-    "fmla v0.4s, v25.4s, v10.4s\n"
-    "fmla v1.4s, v25.4s, v8.4s\n"
-    "fmla v2.4s, v25.4s, v6.4s\n"
-    "fmla v15.4s, v25.4s, v11.4s\n"
-    "fmla v18.4s, v25.4s, v9.4s\n"
-    "fmla v19.4s, v25.4s, v12.4s\n"
-    "mov v22.16b, v14.16b\n"
-    "mov v21.16b, v14.16b\n"
-    "fmla v24.4s, v29.4s, v4.4s\n"
-    "fmla v16.4s, v29.4s, v7.4s\n"
-    "fmla v13.4s, v29.4s, v5.4s\n"
-    "fmla v1.4s, v29.4s, v10.4s\n"
-    "fmla v2.4s, v29.4s, v8.4s\n"
-    "fmla v3.4s, v29.4s, v6.4s\n"
-    "str s24, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v18.4s, v29.4s, v11.4s\n"
-    "fmla v17.4s, v29.4s, v9.4s\n"
-    "ldr s27, [x11, x15]\n"
-    "fmla v22.4s, v29.4s, v12.4s\n"
-    "ldr s23, [x10, x17]\n"
-    "fmla v13.4s, v26.4s, v7.4s\n"
-    "fmla v2.4s, v26.4s, v10.4s\n"
-    "fmla v3.4s, v26.4s, v8.4s\n"
-    "fmla v17.4s, v26.4s, v11.4s\n"
-    "fmla v0.4s, v30.4s, v5.4s\n"
-    "ldr s24, [x9, x19]\n"
-    "fmla v15.4s, v30.4s, v6.4s\n"
-    "ldr s29, [x8, x21]\n"
-    "fmla v3.4s, v28.4s, v10.4s\n"
-    "ldr s14, [x12, x15]\n"
-    "fmla v20.4s, v27.4s, v4.4s\n"
-    "add x8, x8, #4\n"
-    "fmla v0.4s, v27.4s, v7.4s\n"
-    "fmla v1.4s, v27.4s, v5.4s\n"
-    "fmla v15.4s, v27.4s, v8.4s\n"
-    "fmla v18.4s, v27.4s, v6.4s\n"
-    "str s20, [x24]\n"
-    "fmla v19.4s, v27.4s, v9.4s\n"
-    "fmla v16.4s, v23.4s, v4.4s\n"
-    "ldr s25, [x11, x17]\n"
-    "fmla v1.4s, v23.4s, v7.4s\n"
-    "ldr s30, [x10, x19]\n"
-    "fmla v2.4s, v23.4s, v5.4s\n"
-    "fmla v15.4s, v23.4s, v10.4s\n"
-    "str s16, [x23, %[output_col_stride1]]\n"
-    "fmla v18.4s, v23.4s, v8.4s\n"
-    "fmla v17.4s, v23.4s, v6.4s\n"
-    "ldr s26, [x9, x21]\n"
-    "fmla v19.4s, v23.4s, v11.4s\n"
-    "add x9, x9, #4\n"
-    "fmla v22.4s, v23.4s, v9.4s\n"
-    "fmla v21.4s, v23.4s, v12.4s\n"
-    "fmla v13.4s, v24.4s, v4.4s\n"
-    "ldr s27, [x12, x17]\n"
-    "fmla v2.4s, v24.4s, v7.4s\n"
-    "ldr s20, [x11, x19]\n"
-    "fmla v3.4s, v24.4s, v5.4s\n"
-    "fmla v18.4s, v24.4s, v10.4s\n"
-    "str s13, [%[outptr0], x26]\n"
-    "fmla v17.4s, v24.4s, v8.4s\n"
-    "fmla v22.4s, v24.4s, v11.4s\n"
-    "ldr s23, [x10, x21]\n"
-    "fmla v3.4s, v29.4s, v7.4s\n"
-    "ldr s24, [x12, x19]\n"
-    "fmla v17.4s, v29.4s, v10.4s\n"
-    "ldr s16, [x11, x21]\n"
-    "fmla v0.4s, v14.4s, v4.4s\n"
-    "add x10, x10, #4\n"
-    "fmla v15.4s, v14.4s, v5.4s\n"
-    "add x11, x11, #4\n"
-    "fmla v19.4s, v14.4s, v6.4s\n"
-    "ldr s13, [x12, x21]\n"
-    "str s0, [x25]\n"
-    "fmla v1.4s, v25.4s, v4.4s\n"
-    "fmla v15.4s, v25.4s, v7.4s\n"
-    "add x12, x12, #4\n"
-    "fmla v18.4s, v25.4s, v5.4s\n"
-    "fmla v19.4s, v25.4s, v8.4s\n"
-    "str s1, [x24, %[output_col_stride1]]\n"
-    "fmla v22.4s, v25.4s, v6.4s\n"
-    "fmla v21.4s, v25.4s, v9.4s\n"
-    "fmla v2.4s, v30.4s, v4.4s\n"
-    "fmla v18.4s, v30.4s, v7.4s\n"
-    "fmla v17.4s, v30.4s, v5.4s\n"
-    "fmla v19.4s, v30.4s, v10.4s\n"
-    "fmla v22.4s, v30.4s, v8.4s\n"
-    "str s2, [x23, x26]\n"
-    "fmla v21.4s, v30.4s, v11.4s\n"
-    "fmla v3.4s, v26.4s, v4.4s\n"
-    "fmla v17.4s, v26.4s, v7.4s\n"
-    "fmla v22.4s, v26.4s, v10.4s\n"
-    "fmla v15.4s, v27.4s, v4.4s\n"
-    "fmla v19.4s, v27.4s, v5.4s\n"
-    "fmla v21.4s, v27.4s, v6.4s\n"
-    "str s3, [%[outptr0], x27]\n"
-    "fmla v18.4s, v20.4s, v4.4s\n"
-    "str s15, [x25, %[output_col_stride1]]\n"
-    "fmla v22.4s, v20.4s, v5.4s\n"
-    "fmla v19.4s, v20.4s, v7.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "str s18, [x24, x26]\n"
-    "fmla v21.4s, v20.4s, v8.4s\n"
-    "fmla v17.4s, v23.4s, v4.4s\n"
-    "fmla v22.4s, v23.4s, v7.4s\n"
-    "fmla v19.4s, v24.4s, v4.4s\n"
-    "fmla v21.4s, v23.4s, v10.4s\n"
-    "str s17, [x23, x27]\n"
-    "fmla v22.4s, v16.4s, v4.4s\n"
-    "str s19, [x25, x26]\n"
-    "add x23, x23, #4\n"
-    "fmla v21.4s, v24.4s, v5.4s\n"
-    "str s22, [x24, x27]\n"
-    "add x24, x24, #4\n"
-    "fmla v21.4s, v16.4s, v7.4s\n"
-    "fmla v21.4s, v13.4s, v4.4s\n"
-    "str s21, [x25, x27]\n"
-    "add x25, x25, #4\n"
-    "7:\n"
-    : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
-    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
-  );
-}
-
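-// Note: unlike the overload above, which walks base pointers with row/column
-// strides, the overload below receives fully materialised pointer arrays;
-// presumably this lets the caller point out-of-bounds taps at a zeroed dummy
-// buffer so that padded edge tiles reuse the same inner loop. A scalar sketch
-// of that contract, under the same assumed per-channel [bias, w00..w22]
-// packing as above (illustration only, not the library API):
-//
-//   static void depthwise_4x4_3x3_s1_tile_ref(
-//       int n_channels,
-//       const float *wb,                  // per channel: bias, then w00..w22
-//       const float *const inptrs[6][6],  // one pointer per input tap
-//       float *const outptrs[4][4])       // one pointer per output position
-//   {
-//     for (int c = 0; c < n_channels; c++)
-//     {
-//       for (int oi = 0; oi < 4; oi++)
-//       {
-//         for (int oj = 0; oj < 4; oj++)
-//         {
-//           float acc = wb[c * 10]; // start from the bias
-//           for (int ki = 0; ki < 3; ki++)
-//             for (int kj = 0; kj < 3; kj++)
-//               acc += wb[c * 10 + 1 + ki * 3 + kj]
-//                    * inptrs[oi + ki][oj + kj][c];
-//           outptrs[oi][oj][c] = acc;
-//         }
-//       }
-//     }
-//   }
-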
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::None>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *inptrs[6][6],
-  float *outptrs[4][4]
-)
-{
-  __asm __volatile(
-    "mov x27, xzr\n"
-    "mov x28, xzr\n"
-    "and x15, %[n_channels], #3\n"
-    "lsr x16, %[n_channels], #2\n"
-    "cbz x16, 4f\n"
-    "1:\n"
-    "ldr q13, [%[wbptr]]\n"
-    "ldr x17, [%[inptrs], 0]\n"
-    "mov v18.16b, v13.16b\n"
-    "ldr q12, [%[wbptr], #16]\n"
-    "mov v22.16b, v13.16b\n"
-    "ldr q11, [%[wbptr], #32]\n"
-    "mov v23.16b, v13.16b\n"
-    "ldr q10, [%[wbptr], #48]\n"
-    "mov v19.16b, v13.16b\n"
-    "ldr q9, [%[wbptr], #64]\n"
-    "mov v17.16b, v13.16b\n"
-    "ldr q8, [%[wbptr], #80]\n"
-    "mov v14.16b, v13.16b\n"
-    "ldr q7, [%[wbptr], #96]\n"
-    "mov v0.16b, v13.16b\n"
-    "ldr q6, [%[wbptr], #112]\n"
-    "mov v1.16b, v13.16b\n"
-    "ldr q5, [%[wbptr], #128]\n"
-    "mov v2.16b, v13.16b\n"
-    "ldr q4, [%[wbptr], #144]\n"
-    "ldr q29, [x17, x27]\n"
-    "ldr x7, [%[inptrs], 48]\n"
-    "fmla v18.4s, v29.4s, v12.4s\n"
-    "ldr x17, [%[inptrs], 8]\n"
-    "ldr q27, [x7, x27]\n"
-    "ldr x19, [%[inptrs], 96]\n"
-    "ldr q28, [x17, x27]\n"
-    "ldr x7, [%[inptrs], 56]\n"
-    "ldr q25, [x19, x27]\n"
-    "ldr x17, [%[inptrs], 16]\n"
-    "ldr q16, [x7, x27]\n"
-    "ldr x20, [%[inptrs], 144]\n"
-    "ldr q15, [x17, x27]\n"
-    "ldr x19, [%[inptrs], 104]\n"
-    "ldr q21, [x20, x27]\n"
-    "subs x16, x16, #1\n"
-    "ldr q29, [x19, x27]\n"
-    "beq 3f\n"
-    "2:\n"
-    "mov v3.16b, v13.16b\n"
-    "ldr x7, [%[inptrs], 64]\n"
-    "fmla v18.4s, v27.4s, v9.4s\n"
-    "ldr x17, [%[inptrs], 24]\n"
-    "fmla v22.4s, v27.4s, v12.4s\n"
-    "ldr q30, [x7, x27]\n"
-    "fmla v23.4s, v28.4s, v12.4s\n"
-    "ldr x21, [%[inptrs], 192]\n"
-    "fmla v19.4s, v25.4s, v12.4s\n"
-    "ldr x20, [%[inptrs], 152]\n"
-    "fmla v18.4s, v28.4s, v11.4s\n"
-    "ldr q24, [x17, x27]\n"
-    "fmla v22.4s, v25.4s, v9.4s\n"
-    "ldr x19, [%[inptrs], 112]\n"
-    "fmla v23.4s, v16.4s, v9.4s\n"
-    "ldr x7, [%[inptrs], 72]\n"
-    "fmla v17.4s, v16.4s, v12.4s\n"
-    "ldr x17, [%[inptrs], 32]\n"
-    "fmla v18.4s, v25.4s, v6.4s\n"
-    "ldr q31, [x21, x27]\n"
-    "fmla v22.4s, v16.4s, v11.4s\n"
-    "ldr x22, [%[inptrs], 240]\n"
-    "fmla v23.4s, v15.4s, v11.4s\n"
-    "ldr x21, [%[inptrs], 200]\n"
-    "fmla v14.4s, v15.4s, v12.4s\n"
-    "ldr x23, [%[outptrs], 0]\n"
-    "fmla v18.4s, v16.4s, v8.4s\n"
-    "ldr q25, [x20, x27]\n"
-    "fmla v22.4s, v21.4s, v6.4s\n"
-    "ldr x20, [%[inptrs], 160]\n"
-    "fmla v19.4s, v21.4s, v9.4s\n"
-    "ldr x24, [%[outptrs], 32]\n"
-    "fmla v0.4s, v21.4s, v12.4s\n"
-    "ldr q21, [x19, x27]\n"
-    "fmla v18.4s, v15.4s, v10.4s\n"
-    "ldr q20, [x7, x27]\n"
-    "fmla v22.4s, v29.4s, v8.4s\n"
-    "ldr x19, [%[inptrs], 120]\n"
-    "fmla v23.4s, v29.4s, v6.4s\n"
-    "ldr x7, [%[inptrs], 80]\n"
-    "fmla v19.4s, v29.4s, v11.4s\n"
-    "ldr x25, [%[outptrs], 64]\n"
-    "fmla v18.4s, v29.4s, v5.4s\n"
-    "ldr x26, [%[outptrs], 96]\n"
-    "fmla v17.4s, v29.4s, v9.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v1.4s, v29.4s, v12.4s\n"
-    "ldr q26, [x17, x27]\n"
-    "fmla v22.4s, v30.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v18.4s, v30.4s, v7.4s\n"
-    "ldr x17, [%[inptrs], 40]\n"
-    "fmla v23.4s, v30.4s, v8.4s\n"
-    "subs x16, x16, #1\n"
-    "fmla v17.4s, v30.4s, v11.4s\n"
-    "fmla v14.4s, v30.4s, v9.4s\n"
-    "fmla v2.4s, v30.4s, v12.4s\n"
-    "ldr q27, [x22, x27]\n"
-    "fmla v3.4s, v24.4s, v12.4s\n"
-    "ldr x22, [%[inptrs], 248]\n"
-    "fmla v23.4s, v24.4s, v10.4s\n"
-    "fmla v19.4s, v31.4s, v6.4s\n"
-    "fmla v14.4s, v24.4s, v11.4s\n"
-    "ldr q30, [x21, x27]\n"
-    "fmla v0.4s, v31.4s, v9.4s\n"
-    "ldr q24, [x20, x27]\n"
-    "fmla v22.4s, v25.4s, v5.4s\n"
-    "ldr x21, [%[inptrs], 208]\n"
-    "fmla v19.4s, v25.4s, v8.4s\n"
-    "ldr x20, [%[inptrs], 168]\n"
-    "fmla v17.4s, v25.4s, v6.4s\n"
-    "fmla v1.4s, v25.4s, v9.4s\n"
-    "fmla v0.4s, v25.4s, v11.4s\n"
-    "fmla v18.4s, v21.4s, v4.4s\n"
-    "fmla v22.4s, v21.4s, v7.4s\n"
-    "fmla v23.4s, v21.4s, v5.4s\n"
-    "fmla v19.4s, v21.4s, v10.4s\n"
-    "fmla v14.4s, v21.4s, v6.4s\n"
-    "fmla v17.4s, v21.4s, v8.4s\n"
-    "fmla v1.4s, v21.4s, v11.4s\n"
-    "str q18, [x23, x28]\n"
-    "mov v16.16b, v13.16b\n"
-    "fmla v2.4s, v21.4s, v9.4s\n"
-    "ldr x23, [%[outptrs], 8]\n"
-    "fmla v23.4s, v20.4s, v7.4s\n"
-    "fmla v14.4s, v20.4s, v8.4s\n"
-    "fmla v16.4s, v25.4s, v12.4s\n"
-    "ldr q25, [x19, x27]\n"
-    "fmla v17.4s, v20.4s, v10.4s\n"
-    "ldr x19, [%[inptrs], 128]\n"
-    "fmla v2.4s, v20.4s, v11.4s\n"
-    "fmla v3.4s, v20.4s, v9.4s\n"
-    "fmla v14.4s, v26.4s, v10.4s\n"
-    "fmla v0.4s, v27.4s, v6.4s\n"
-    "mov v15.16b, v13.16b\n"
-    "fmla v19.4s, v30.4s, v5.4s\n"
-    "fmla v1.4s, v30.4s, v6.4s\n"
-    "fmla v16.4s, v30.4s, v9.4s\n"
-    "fmla v3.4s, v26.4s, v11.4s\n"
-    "ldr q29, [x7, x27]\n"
-    "fmla v15.4s, v21.4s, v12.4s\n"
-    "ldr q27, [x17, x27]\n"
-    "fmla v0.4s, v30.4s, v8.4s\n"
-    "ldr q28, [x22, x27]\n"
-    "fmla v22.4s, v24.4s, v4.4s\n"
-    "ldr x7, [%[inptrs], 88]\n"
-    "fmla v19.4s, v24.4s, v7.4s\n"
-    "ldr x22, [%[inptrs], 256]\n"
-    "fmla v17.4s, v24.4s, v5.4s\n"
-    "ldr x17, [%[inptrs], 0]\n"
-    "fmla v0.4s, v24.4s, v10.4s\n"
-    "fmla v1.4s, v24.4s, v8.4s\n"
-    "str q22, [x24, x28]\n"
-    "mov v18.16b, v13.16b\n"
-    "fmla v2.4s, v24.4s, v6.4s\n"
-    "ldr x24, [%[outptrs], 40]\n"
-    "fmla v16.4s, v24.4s, v11.4s\n"
-    "fmla v15.4s, v24.4s, v9.4s\n"
-    "fmla v18.4s, v20.4s, v12.4s\n"
-    "ldr q22, [x21, x27]\n"
-    "fmla v23.4s, v25.4s, v4.4s\n"
-    "ldr x21, [%[inptrs], 216]\n"
-    "fmla v17.4s, v25.4s, v7.4s\n"
-    "fmla v14.4s, v25.4s, v5.4s\n"
-    "fmla v1.4s, v25.4s, v10.4s\n"
-    "fmla v2.4s, v25.4s, v8.4s\n"
-    "fmla v3.4s, v25.4s, v6.4s\n"
-    "fmla v15.4s, v25.4s, v11.4s\n"
-    "str q23, [x23, x28]\n"
-    "mov v21.16b, v13.16b\n"
-    "fmla v18.4s, v25.4s, v9.4s\n"
-    "ldr x23, [%[outptrs], 16]\n"
-    "fmla v14.4s, v29.4s, v7.4s\n"
-    "fmla v2.4s, v29.4s, v10.4s\n"
-    "fmla v21.4s, v24.4s, v12.4s\n"
-    "ldr q30, [x20, x27]\n"
-    "fmla v3.4s, v29.4s, v8.4s\n"
-    "ldr x20, [%[inptrs], 176]\n"
-    "fmla v18.4s, v29.4s, v11.4s\n"
-    "ldr q31, [x19, x27]\n"
-    "fmla v0.4s, v28.4s, v5.4s\n"
-    "ldr x19, [%[inptrs], 136]\n"
-    "fmla v16.4s, v28.4s, v6.4s\n"
-    "ldr q26, [x7, x27]\n"
-    "fmla v3.4s, v27.4s, v10.4s\n"
-    "ldr q23, [x22, x27]\n"
-    "fmla v19.4s, v22.4s, v4.4s\n"
-    "ldr x22, [%[inptrs], 264]\n"
-    "fmla v0.4s, v22.4s, v7.4s\n"
-    "ldr x7, [%[inptrs], 48]\n"
-    "fmla v1.4s, v22.4s, v5.4s\n"
-    "fmla v16.4s, v22.4s, v8.4s\n"
-    "fmla v15.4s, v22.4s, v6.4s\n"
-    "fmla v21.4s, v22.4s, v9.4s\n"
-    "str q19, [x25, x28]\n"
-    "mov v24.16b, v13.16b\n"
-    "mov v20.16b, v13.16b\n"
-    "ldr q27, [x21, x27]\n"
-    "fmla v17.4s, v30.4s, v4.4s\n"
-    "ldr x21, [%[inptrs], 224]\n"
-    "fmla v24.4s, v25.4s, v12.4s\n"
-    "ldr q28, [x20, x27]\n"
-    "fmla v1.4s, v30.4s, v7.4s\n"
-    "ldr x20, [%[inptrs], 184]\n"
-    "fmla v2.4s, v30.4s, v5.4s\n"
-    "ldr x25, [%[outptrs], 72]\n"
-    "str q17, [x24, x28]\n"
-    "fmla v16.4s, v30.4s, v10.4s\n"
-    "fmla v15.4s, v30.4s, v8.4s\n"
-    "ldr q22, [x19, x27]\n"
-    "fmla v18.4s, v30.4s, v6.4s\n"
-    "ldr x24, [%[outptrs], 48]\n"
-    "fmla v21.4s, v30.4s, v11.4s\n"
-    "ldr x19, [%[inptrs], 96]\n"
-    "fmla v24.4s, v30.4s, v9.4s\n"
-    "fmla v20.4s, v30.4s, v12.4s\n"
-    "fmla v14.4s, v31.4s, v4.4s\n"
-    "ldr q30, [x22, x27]\n"
-    "fmla v2.4s, v31.4s, v7.4s\n"
-    "ldr q19, [x21, x27]\n"
-    "fmla v3.4s, v31.4s, v5.4s\n"
-    "ldr x22, [%[inptrs], 272]\n"
-    "fmla v15.4s, v31.4s, v10.4s\n"
-    "ldr x21, [%[inptrs], 232]\n"
-    "str q14, [x23, x28]\n"
-    "fmla v18.4s, v31.4s, v8.4s\n"
-    "fmla v24.4s, v31.4s, v11.4s\n"
-    "ldr q31, [x20, x27]\n"
-    "fmla v3.4s, v26.4s, v7.4s\n"
-    "ldr q17, [x22, x27]\n"
-    "fmla v0.4s, v23.4s, v4.4s\n"
-    "ldr x22, [%[inptrs], 280]\n"
-    "fmla v18.4s, v26.4s, v10.4s\n"
-    "ldr q14, [x21, x27]\n"
-    "fmla v16.4s, v23.4s, v5.4s\n"
-    "ldr x23, [%[outptrs], 24]\n"
-    "fmla v21.4s, v23.4s, v6.4s\n"
-    "ldr q26, [x22, x27]\n"
-    "str q0, [x26, x28]\n"
-    "fmla v1.4s, v27.4s, v4.4s\n"
-    "fmla v15.4s, v27.4s, v5.4s\n"
-    "ldr q13, [%[wbptr]]\n"
-    "fmla v16.4s, v27.4s, v7.4s\n"
-    "ldr x26, [%[outptrs], 104]\n"
-    "fmla v21.4s, v27.4s, v8.4s\n"
-    "add x27, x27, #16\n"
-    "str q1, [x25, x28]\n"
-    "fmla v24.4s, v27.4s, v6.4s\n"
-    "fmla v20.4s, v27.4s, v9.4s\n"
-    "ldr q12, [%[wbptr], #16]\n"
-    "fmla v2.4s, v28.4s, v4.4s\n"
-    "ldr q29, [x17, x27]\n"
-    "fmla v15.4s, v28.4s, v7.4s\n"
-    "ldr q27, [x7, x27]\n"
-    "fmla v18.4s, v28.4s, v5.4s\n"
-    "ldr x25, [%[outptrs], 80]\n"
-    "fmla v21.4s, v28.4s, v10.4s\n"
-    "ldr x17, [%[inptrs], 8]\n"
-    "str q2, [x24, x28]\n"
-    "fmla v24.4s, v28.4s, v8.4s\n"
-    "fmla v20.4s, v28.4s, v11.4s\n"
-    "ldr q9, [%[wbptr], #64]\n"
-    "fmla v3.4s, v22.4s, v4.4s\n"
-    "ldr q28, [x17, x27]\n"
-    "fmla v18.4s, v22.4s, v7.4s\n"
-    "ldr q25, [x19, x27]\n"
-    "fmla v24.4s, v22.4s, v10.4s\n"
-    "ldr x24, [%[outptrs], 56]\n"
-    "fmla v16.4s, v30.4s, v4.4s\n"
-    "ldr q11, [%[wbptr], #32]\n"
-    "str q3, [x23, x28]\n"
-    "fmla v21.4s, v30.4s, v5.4s\n"
-    "fmla v20.4s, v30.4s, v6.4s\n"
-    "ldr x7, [%[inptrs], 56]\n"
-    "fmla v15.4s, v19.4s, v4.4s\n"
-    "ldr x17, [%[inptrs], 16]\n"
-    "str q16, [x26, x28]\n"
-    "fmla v24.4s, v19.4s, v5.4s\n"
-    "fmla v21.4s, v19.4s, v7.4s\n"
-    "ldr q16, [x7, x27]\n"
-    "fmla v20.4s, v19.4s, v8.4s\n"
-    "ldr q6, [%[wbptr], #112]\n"
-    "str q15, [x25, x28]\n"
-    "fmla v18.4s, v31.4s, v4.4s\n"
-    "fmla v24.4s, v31.4s, v7.4s\n"
-    "ldr q15, [x17, x27]\n"
-    "fmla v21.4s, v17.4s, v4.4s\n"
-    "ldr x25, [%[outptrs], 88]\n"
-    "fmla v20.4s, v31.4s, v10.4s\n"
-    "ldr q8, [%[wbptr], #80]\n"
-    "str q18, [x24, x28]\n"
-    "mov v18.16b, v13.16b\n"
-    "fmla v24.4s, v14.4s, v4.4s\n"
-    "ldr x26, [%[outptrs], 112]\n"
-    "mov v22.16b, v13.16b\n"
-    "ldr x20, [%[inptrs], 144]\n"
-    "str q21, [x26, x28]\n"
-    "fmla v20.4s, v17.4s, v5.4s\n"
-    "mov v23.16b, v13.16b\n"
-    "ldr q10, [%[wbptr], #48]\n"
-    "str q24, [x25, x28]\n"
-    "mov v19.16b, v13.16b\n"
-    "mov v17.16b, v13.16b\n"
-    "ldr q21, [x20, x27]\n"
-    "fmla v20.4s, v14.4s, v7.4s\n"
-    "ldr q5, [%[wbptr], #128]\n"
-    "mov v14.16b, v13.16b\n"
-    "ldr x26, [%[outptrs], 120]\n"
-    "mov v0.16b, v13.16b\n"
-    "ldr x19, [%[inptrs], 104]\n"
-    "mov v1.16b, v13.16b\n"
-    "mov v2.16b, v13.16b\n"
-    "fmla v20.4s, v26.4s, v4.4s\n"
-    "ldr q7, [%[wbptr], #96]\n"
-    "fmla v18.4s, v29.4s, v12.4s\n"
-    "ldr q29, [x19, x27]\n"
-    "str q20, [x26, x28]\n"
-    "ldr q4, [%[wbptr], #144]\n"
-    "add x28, x28, #16\n"
-    "bne 2b\n"
-    "3:\n"
-    "mov v3.16b, v13.16b\n"
-    "ldr x7, [%[inptrs], 64]\n"
-    "fmla v18.4s, v27.4s, v9.4s\n"
-    "ldr x17, [%[inptrs], 24]\n"
-    "fmla v22.4s, v27.4s, v12.4s\n"
-    "ldr q30, [x7, x27]\n"
-    "fmla v23.4s, v28.4s, v12.4s\n"
-    "ldr x21, [%[inptrs], 192]\n"
-    "fmla v19.4s, v25.4s, v12.4s\n"
-    "ldr x20, [%[inptrs], 152]\n"
-    "fmla v18.4s, v28.4s, v11.4s\n"
-    "ldr q24, [x17, x27]\n"
-    "fmla v22.4s, v25.4s, v9.4s\n"
-    "ldr x19, [%[inptrs], 112]\n"
-    "fmla v23.4s, v16.4s, v9.4s\n"
-    "ldr x7, [%[inptrs], 72]\n"
-    "fmla v17.4s, v16.4s, v12.4s\n"
-    "ldr x17, [%[inptrs], 32]\n"
-    "fmla v18.4s, v25.4s, v6.4s\n"
-    "ldr q31, [x21, x27]\n"
-    "fmla v22.4s, v16.4s, v11.4s\n"
-    "ldr x22, [%[inptrs], 240]\n"
-    "fmla v23.4s, v15.4s, v11.4s\n"
-    "ldr x21, [%[inptrs], 200]\n"
-    "fmla v14.4s, v15.4s, v12.4s\n"
-    "ldr x23, [%[outptrs], 0]\n"
-    "fmla v18.4s, v16.4s, v8.4s\n"
-    "ldr q25, [x20, x27]\n"
-    "fmla v22.4s, v21.4s, v6.4s\n"
-    "ldr x20, [%[inptrs], 160]\n"
-    "fmla v19.4s, v21.4s, v9.4s\n"
-    "ldr x24, [%[outptrs], 32]\n"
-    "fmla v0.4s, v21.4s, v12.4s\n"
-    "ldr q21, [x19, x27]\n"
-    "fmla v18.4s, v15.4s, v10.4s\n"
-    "ldr q20, [x7, x27]\n"
-    "fmla v22.4s, v29.4s, v8.4s\n"
-    "ldr x19, [%[inptrs], 120]\n"
-    "fmla v23.4s, v29.4s, v6.4s\n"
-    "ldr x7, [%[inptrs], 80]\n"
-    "fmla v19.4s, v29.4s, v11.4s\n"
-    "ldr x25, [%[outptrs], 64]\n"
-    "fmla v18.4s, v29.4s, v5.4s\n"
-    "ldr x26, [%[outptrs], 96]\n"
-    "fmla v17.4s, v29.4s, v9.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v1.4s, v29.4s, v12.4s\n"
-    "ldr q26, [x17, x27]\n"
-    "fmla v22.4s, v30.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v18.4s, v30.4s, v7.4s\n"
-    "ldr x17, [%[inptrs], 40]\n"
-    "fmla v23.4s, v30.4s, v8.4s\n"
-    "fmla v17.4s, v30.4s, v11.4s\n"
-    "fmla v14.4s, v30.4s, v9.4s\n"
-    "fmla v2.4s, v30.4s, v12.4s\n"
-    "mov v16.16b, v13.16b\n"
-    "fmla v3.4s, v24.4s, v12.4s\n"
-    "fmla v19.4s, v31.4s, v6.4s\n"
-    "fmla v0.4s, v31.4s, v9.4s\n"
-    "mov v15.16b, v13.16b\n"
-    "fmla v23.4s, v24.4s, v10.4s\n"
-    "fmla v14.4s, v24.4s, v11.4s\n"
-    "ldr q27, [x22, x27]\n"
-    "fmla v22.4s, v25.4s, v5.4s\n"
-    "ldr x22, [%[inptrs], 248]\n"
-    "fmla v19.4s, v25.4s, v8.4s\n"
-    "fmla v17.4s, v25.4s, v6.4s\n"
-    "fmla v0.4s, v25.4s, v11.4s\n"
-    "fmla v1.4s, v25.4s, v9.4s\n"
-    "fmla v16.4s, v25.4s, v12.4s\n"
-    "ldr q30, [x21, x27]\n"
-    "fmla v18.4s, v21.4s, v4.4s\n"
-    "ldr x21, [%[inptrs], 208]\n"
-    "fmla v22.4s, v21.4s, v7.4s\n"
-    "fmla v23.4s, v21.4s, v5.4s\n"
-    "fmla v19.4s, v21.4s, v10.4s\n"
-    "fmla v17.4s, v21.4s, v8.4s\n"
-    "fmla v14.4s, v21.4s, v6.4s\n"
-    "fmla v1.4s, v21.4s, v11.4s\n"
-    "str q18, [x23, x28]\n"
-    "mov v18.16b, v13.16b\n"
-    "fmla v2.4s, v21.4s, v9.4s\n"
-    "ldr x23, [%[outptrs], 8]\n"
-    "fmla v15.4s, v21.4s, v12.4s\n"
-    "ldr q24, [x20, x27]\n"
-    "fmla v23.4s, v20.4s, v7.4s\n"
-    "ldr x20, [%[inptrs], 168]\n"
-    "fmla v17.4s, v20.4s, v10.4s\n"
-    "fmla v14.4s, v20.4s, v8.4s\n"
-    "fmla v2.4s, v20.4s, v11.4s\n"
-    "fmla v3.4s, v20.4s, v9.4s\n"
-    "fmla v18.4s, v20.4s, v12.4s\n"
-    "ldr q25, [x19, x27]\n"
-    "fmla v0.4s, v27.4s, v6.4s\n"
-    "ldr q29, [x7, x27]\n"
-    "fmla v14.4s, v26.4s, v10.4s\n"
-    "ldr x19, [%[inptrs], 128]\n"
-    "fmla v3.4s, v26.4s, v11.4s\n"
-    "ldr q27, [x17, x27]\n"
-    "fmla v19.4s, v30.4s, v5.4s\n"
-    "ldr x7, [%[inptrs], 88]\n"
-    "fmla v0.4s, v30.4s, v8.4s\n"
-    "fmla v1.4s, v30.4s, v6.4s\n"
-    "fmla v16.4s, v30.4s, v9.4s\n"
-    "ldr q28, [x22, x27]\n"
-    "fmla v22.4s, v24.4s, v4.4s\n"
-    "ldr x22, [%[inptrs], 256]\n"
-    "fmla v19.4s, v24.4s, v7.4s\n"
-    "fmla v17.4s, v24.4s, v5.4s\n"
-    "fmla v0.4s, v24.4s, v10.4s\n"
-    "fmla v1.4s, v24.4s, v8.4s\n"
-    "fmla v2.4s, v24.4s, v6.4s\n"
-    "fmla v16.4s, v24.4s, v11.4s\n"
-    "str q22, [x24, x28]\n"
-    "mov v21.16b, v13.16b\n"
-    "fmla v15.4s, v24.4s, v9.4s\n"
-    "ldr x24, [%[outptrs], 40]\n"
-    "fmla v23.4s, v25.4s, v4.4s\n"
-    "fmla v17.4s, v25.4s, v7.4s\n"
-    "fmla v21.4s, v24.4s, v12.4s\n"
-    "ldr q22, [x21, x27]\n"
-    "fmla v14.4s, v25.4s, v5.4s\n"
-    "ldr x21, [%[inptrs], 216]\n"
-    "fmla v1.4s, v25.4s, v10.4s\n"
-    "fmla v2.4s, v25.4s, v8.4s\n"
-    "str q23, [x23, x28]\n"
-    "mov v24.16b, v13.16b\n"
-    "mov v20.16b, v13.16b\n"
-    "ldr x23, [%[outptrs], 16]\n"
-    "fmla v3.4s, v25.4s, v6.4s\n"
-    "fmla v15.4s, v25.4s, v11.4s\n"
-    "fmla v18.4s, v25.4s, v9.4s\n"
-    "fmla v24.4s, v25.4s, v12.4s\n"
-    "fmla v14.4s, v29.4s, v7.4s\n"
-    "ldr q30, [x20, x27]\n"
-    "fmla v2.4s, v29.4s, v10.4s\n"
-    "ldr x20, [%[inptrs], 176]\n"
-    "fmla v3.4s, v29.4s, v8.4s\n"
-    "fmla v0.4s, v28.4s, v5.4s\n"
-    "fmla v18.4s, v29.4s, v11.4s\n"
-    "ldr q31, [x19, x27]\n"
-    "fmla v16.4s, v28.4s, v6.4s\n"
-    "ldr q26, [x7, x27]\n"
-    "fmla v19.4s, v22.4s, v4.4s\n"
-    "ldr x19, [%[inptrs], 136]\n"
-    "fmla v3.4s, v27.4s, v10.4s\n"
-    "ldr q23, [x22, x27]\n"
-    "fmla v0.4s, v22.4s, v7.4s\n"
-    "ldr x22, [%[inptrs], 264]\n"
-    "fmla v1.4s, v22.4s, v5.4s\n"
-    "fmla v16.4s, v22.4s, v8.4s\n"
-    "str q19, [x25, x28]\n"
-    "fmla v15.4s, v22.4s, v6.4s\n"
-    "fmla v21.4s, v22.4s, v9.4s\n"
-    "ldr q27, [x21, x27]\n"
-    "fmla v17.4s, v30.4s, v4.4s\n"
-    "ldr q28, [x20, x27]\n"
-    "fmla v1.4s, v30.4s, v7.4s\n"
-    "ldr x21, [%[inptrs], 224]\n"
-    "fmla v2.4s, v30.4s, v5.4s\n"
-    "ldr x20, [%[inptrs], 184]\n"
-    "fmla v16.4s, v30.4s, v10.4s\n"
-    "ldr x25, [%[outptrs], 72]\n"
-    "str q17, [x24, x28]\n"
-    "fmla v15.4s, v30.4s, v8.4s\n"
-    "fmla v18.4s, v30.4s, v6.4s\n"
-    "ldr q22, [x19, x27]\n"
-    "fmla v21.4s, v30.4s, v11.4s\n"
-    "ldr x24, [%[outptrs], 48]\n"
-    "fmla v24.4s, v30.4s, v9.4s\n"
-    "fmla v20.4s, v30.4s, v12.4s\n"
-    "fmla v14.4s, v31.4s, v4.4s\n"
-    "ldr q30, [x22, x27]\n"
-    "fmla v2.4s, v31.4s, v7.4s\n"
-    "ldr q19, [x21, x27]\n"
-    "fmla v3.4s, v31.4s, v5.4s\n"
-    "ldr x22, [%[inptrs], 272]\n"
-    "fmla v15.4s, v31.4s, v10.4s\n"
-    "ldr x21, [%[inptrs], 232]\n"
-    "str q14, [x23, x28]\n"
-    "fmla v18.4s, v31.4s, v8.4s\n"
-    "fmla v24.4s, v31.4s, v11.4s\n"
-    "ldr q31, [x20, x27]\n"
-    "fmla v3.4s, v26.4s, v7.4s\n"
-    "ldr q17, [x22, x27]\n"
-    "fmla v0.4s, v23.4s, v4.4s\n"
-    "ldr x22, [%[inptrs], 280]\n"
-    "fmla v18.4s, v26.4s, v10.4s\n"
-    "ldr q14, [x21, x27]\n"
-    "fmla v16.4s, v23.4s, v5.4s\n"
-    "ldr x23, [%[outptrs], 24]\n"
-    "fmla v21.4s, v23.4s, v6.4s\n"
-    "ldr q26, [x22, x27]\n"
-    "str q0, [x26, x28]\n"
-    "fmla v1.4s, v27.4s, v4.4s\n"
-    "fmla v15.4s, v27.4s, v5.4s\n"
-    "ldr x26, [%[outptrs], 104]\n"
-    "fmla v16.4s, v27.4s, v7.4s\n"
-    "add x27, x27, #16\n"
-    "fmla v21.4s, v27.4s, v8.4s\n"
-    "fmla v24.4s, v27.4s, v6.4s\n"
-    "str q1, [x25, x28]\n"
-    "fmla v20.4s, v27.4s, v9.4s\n"
-    "fmla v2.4s, v28.4s, v4.4s\n"
-    "ldr x25, [%[outptrs], 80]\n"
-    "fmla v15.4s, v28.4s, v7.4s\n"
-    "fmla v18.4s, v28.4s, v5.4s\n"
-    "fmla v21.4s, v28.4s, v10.4s\n"
-    "fmla v24.4s, v28.4s, v8.4s\n"
-    "fmla v20.4s, v28.4s, v11.4s\n"
-    "fmla v3.4s, v22.4s, v4.4s\n"
-    "str q2, [x24, x28]\n"
-    "fmla v16.4s, v30.4s, v4.4s\n"
-    "fmla v18.4s, v22.4s, v7.4s\n"
-    "ldr x24, [%[outptrs], 56]\n"
-    "fmla v24.4s, v22.4s, v10.4s\n"
-    "fmla v21.4s, v30.4s, v5.4s\n"
-    "str q3, [x23, x28]\n"
-    "fmla v20.4s, v30.4s, v6.4s\n"
-    "str q16, [x26, x28]\n"
-    "fmla v15.4s, v19.4s, v4.4s\n"
-    "fmla v18.4s, v31.4s, v4.4s\n"
-    "ldr x26, [%[outptrs], 112]\n"
-    "fmla v21.4s, v19.4s, v7.4s\n"
-    "fmla v24.4s, v19.4s, v5.4s\n"
-    "fmla v20.4s, v19.4s, v8.4s\n"
-    "str q15, [x25, x28]\n"
-    "str q18, [x24, x28]\n"
-    "ldr x25, [%[outptrs], 88]\n"
-    "fmla v24.4s, v31.4s, v7.4s\n"
-    "fmla v21.4s, v17.4s, v4.4s\n"
-    "fmla v20.4s, v31.4s, v10.4s\n"
-    "str q21, [x26, x28]\n"
-    "fmla v20.4s, v17.4s, v5.4s\n"
-    "ldr x26, [%[outptrs], 120]\n"
-    "fmla v24.4s, v14.4s, v4.4s\n"
-    "fmla v20.4s, v14.4s, v7.4s\n"
-    "str q24, [x25, x28]\n"
-    "fmla v20.4s, v26.4s, v4.4s\n"
-    "str q20, [x26, x28]\n"
-    "add x28, x28, #16\n"
-    "4:\n"
-    "cbz x15, 7f\n"
-    "ldr s13, [%[wbptr]]\n"
-    "mov v18.16b, v13.16b\n"
-    "ldr s12, [%[wbptr], #4]\n"
-    "mov v22.16b, v13.16b\n"
-    "ldr s11, [%[wbptr], #8]\n"
-    "mov v23.16b, v13.16b\n"
-    "ldr s10, [%[wbptr], #12]\n"
-    "mov v19.16b, v13.16b\n"
-    "ldr s9, [%[wbptr], #16]\n"
-    "mov v17.16b, v13.16b\n"
-    "ldr s8, [%[wbptr], #20]\n"
-    "mov v14.16b, v13.16b\n"
-    "ldr s7, [%[wbptr], #24]\n"
-    "mov v0.16b, v13.16b\n"
-    "ldr s6, [%[wbptr], #28]\n"
-    "mov v1.16b, v13.16b\n"
-    "ldr s5, [%[wbptr], #32]\n"
-    "mov v2.16b, v13.16b\n"
-    "ldr s4, [%[wbptr], #36]\n"
-    "ldr x17, [%[inptrs], 0]\n"
-    "ldr x7, [%[inptrs], 48]\n"
-    "ldr x19, [%[inptrs], 96]\n"
-    "ldr x20, [%[inptrs], 144]\n"
-    "subs x15, x15, #1\n"
-    "ldr s29, [x17, x27]\n"
-    "fmla v18.4s, v29.4s, v12.4s\n"
-    "ldr s27, [x7, x27]\n"
-    "ldr s25, [x19, x27]\n"
-    "ldr x17, [%[inptrs], 8]\n"
-    "ldr s21, [x20, x27]\n"
-    "ldr x7, [%[inptrs], 56]\n"
-    "ldr s28, [x17, x27]\n"
-    "ldr x19, [%[inptrs], 104]\n"
-    "ldr s16, [x7, x27]\n"
-    "ldr x17, [%[inptrs], 16]\n"
-    "ldr s29, [x19, x27]\n"
-    "ldr s15, [x17, x27]\n"
-    "beq 6f\n"
-    "5:\n"
-    "mov v3.16b, v13.16b\n"
-    "ldr x7, [%[inptrs], 64]\n"
-    "fmla v18.4s, v27.4s, v9.4s\n"
-    "ldr x17, [%[inptrs], 24]\n"
-    "fmla v22.4s, v27.4s, v12.4s\n"
-    "ldr s30, [x7, x27]\n"
-    "fmla v23.4s, v28.4s, v12.4s\n"
-    "ldr x21, [%[inptrs], 192]\n"
-    "fmla v19.4s, v25.4s, v12.4s\n"
-    "ldr x20, [%[inptrs], 152]\n"
-    "fmla v18.4s, v28.4s, v11.4s\n"
-    "ldr s24, [x17, x27]\n"
-    "fmla v22.4s, v25.4s, v9.4s\n"
-    "ldr x19, [%[inptrs], 112]\n"
-    "fmla v23.4s, v16.4s, v9.4s\n"
-    "ldr x7, [%[inptrs], 72]\n"
-    "fmla v17.4s, v16.4s, v12.4s\n"
-    "ldr x17, [%[inptrs], 32]\n"
-    "fmla v18.4s, v25.4s, v6.4s\n"
-    "ldr s31, [x21, x27]\n"
-    "fmla v22.4s, v16.4s, v11.4s\n"
-    "ldr x22, [%[inptrs], 240]\n"
-    "fmla v23.4s, v15.4s, v11.4s\n"
-    "ldr x21, [%[inptrs], 200]\n"
-    "fmla v14.4s, v15.4s, v12.4s\n"
-    "ldr x23, [%[outptrs], 0]\n"
-    "fmla v18.4s, v16.4s, v8.4s\n"
-    "ldr s25, [x20, x27]\n"
-    "fmla v22.4s, v21.4s, v6.4s\n"
-    "ldr x20, [%[inptrs], 160]\n"
-    "fmla v19.4s, v21.4s, v9.4s\n"
-    "ldr x24, [%[outptrs], 32]\n"
-    "fmla v0.4s, v21.4s, v12.4s\n"
-    "ldr s21, [x19, x27]\n"
-    "fmla v18.4s, v15.4s, v10.4s\n"
-    "ldr s20, [x7, x27]\n"
-    "fmla v22.4s, v29.4s, v8.4s\n"
-    "ldr x19, [%[inptrs], 120]\n"
-    "fmla v23.4s, v29.4s, v6.4s\n"
-    "ldr x7, [%[inptrs], 80]\n"
-    "fmla v19.4s, v29.4s, v11.4s\n"
-    "ldr x25, [%[outptrs], 64]\n"
-    "fmla v18.4s, v29.4s, v5.4s\n"
-    "ldr x26, [%[outptrs], 96]\n"
-    "fmla v17.4s, v29.4s, v9.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v1.4s, v29.4s, v12.4s\n"
-    "ldr s26, [x17, x27]\n"
-    "fmla v22.4s, v30.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v18.4s, v30.4s, v7.4s\n"
-    "ldr x17, [%[inptrs], 40]\n"
-    "fmla v23.4s, v30.4s, v8.4s\n"
-    "subs x15, x15, #1\n"
-    "fmla v17.4s, v30.4s, v11.4s\n"
-    "fmla v14.4s, v30.4s, v9.4s\n"
-    "fmla v2.4s, v30.4s, v12.4s\n"
-    "ldr s27, [x22, x27]\n"
-    "fmla v3.4s, v24.4s, v12.4s\n"
-    "ldr x22, [%[inptrs], 248]\n"
-    "fmla v23.4s, v24.4s, v10.4s\n"
-    "fmla v19.4s, v31.4s, v6.4s\n"
-    "fmla v14.4s, v24.4s, v11.4s\n"
-    "ldr s30, [x21, x27]\n"
-    "fmla v0.4s, v31.4s, v9.4s\n"
-    "ldr s24, [x20, x27]\n"
-    "fmla v22.4s, v25.4s, v5.4s\n"
-    "ldr x21, [%[inptrs], 208]\n"
-    "fmla v19.4s, v25.4s, v8.4s\n"
-    "ldr x20, [%[inptrs], 168]\n"
-    "fmla v17.4s, v25.4s, v6.4s\n"
-    "fmla v1.4s, v25.4s, v9.4s\n"
-    "fmla v0.4s, v25.4s, v11.4s\n"
-    "fmla v18.4s, v21.4s, v4.4s\n"
-    "fmla v22.4s, v21.4s, v7.4s\n"
-    "fmla v23.4s, v21.4s, v5.4s\n"
-    "fmla v19.4s, v21.4s, v10.4s\n"
-    "fmla v14.4s, v21.4s, v6.4s\n"
-    "fmla v17.4s, v21.4s, v8.4s\n"
-    "fmla v1.4s, v21.4s, v11.4s\n"
-    "str s18, [x23, x28]\n"
-    "mov v16.16b, v13.16b\n"
-    "fmla v2.4s, v21.4s, v9.4s\n"
-    "ldr x23, [%[outptrs], 8]\n"
-    "fmla v23.4s, v20.4s, v7.4s\n"
-    "fmla v14.4s, v20.4s, v8.4s\n"
-    "fmla v16.4s, v25.4s, v12.4s\n"
-    "ldr s25, [x19, x27]\n"
-    "fmla v17.4s, v20.4s, v10.4s\n"
-    "ldr x19, [%[inptrs], 128]\n"
-    "fmla v2.4s, v20.4s, v11.4s\n"
-    "fmla v3.4s, v20.4s, v9.4s\n"
-    "fmla v14.4s, v26.4s, v10.4s\n"
-    "fmla v0.4s, v27.4s, v6.4s\n"
-    "mov v15.16b, v13.16b\n"
-    "fmla v19.4s, v30.4s, v5.4s\n"
-    "fmla v1.4s, v30.4s, v6.4s\n"
-    "fmla v16.4s, v30.4s, v9.4s\n"
-    "fmla v3.4s, v26.4s, v11.4s\n"
-    "ldr s29, [x7, x27]\n"
-    "fmla v15.4s, v21.4s, v12.4s\n"
-    "ldr s27, [x17, x27]\n"
-    "fmla v0.4s, v30.4s, v8.4s\n"
-    "ldr s28, [x22, x27]\n"
-    "fmla v22.4s, v24.4s, v4.4s\n"
-    "ldr x7, [%[inptrs], 88]\n"
-    "fmla v19.4s, v24.4s, v7.4s\n"
-    "ldr x22, [%[inptrs], 256]\n"
-    "fmla v17.4s, v24.4s, v5.4s\n"
-    "ldr x17, [%[inptrs], 0]\n"
-    "fmla v0.4s, v24.4s, v10.4s\n"
-    "fmla v1.4s, v24.4s, v8.4s\n"
-    "str s22, [x24, x28]\n"
-    "mov v18.16b, v13.16b\n"
-    "fmla v2.4s, v24.4s, v6.4s\n"
-    "ldr x24, [%[outptrs], 40]\n"
-    "fmla v16.4s, v24.4s, v11.4s\n"
-    "fmla v15.4s, v24.4s, v9.4s\n"
-    "fmla v18.4s, v20.4s, v12.4s\n"
-    "ldr s22, [x21, x27]\n"
-    "fmla v23.4s, v25.4s, v4.4s\n"
-    "ldr x21, [%[inptrs], 216]\n"
-    "fmla v17.4s, v25.4s, v7.4s\n"
-    "fmla v14.4s, v25.4s, v5.4s\n"
-    "fmla v1.4s, v25.4s, v10.4s\n"
-    "fmla v2.4s, v25.4s, v8.4s\n"
-    "fmla v3.4s, v25.4s, v6.4s\n"
-    "fmla v15.4s, v25.4s, v11.4s\n"
-    "str s23, [x23, x28]\n"
-    "mov v21.16b, v13.16b\n"
-    "fmla v18.4s, v25.4s, v9.4s\n"
-    "ldr x23, [%[outptrs], 16]\n"
-    "fmla v14.4s, v29.4s, v7.4s\n"
-    "fmla v2.4s, v29.4s, v10.4s\n"
-    "fmla v21.4s, v24.4s, v12.4s\n"
-    "ldr s30, [x20, x27]\n"
-    "fmla v3.4s, v29.4s, v8.4s\n"
-    "ldr x20, [%[inptrs], 176]\n"
-    "fmla v18.4s, v29.4s, v11.4s\n"
-    "ldr s31, [x19, x27]\n"
-    "fmla v0.4s, v28.4s, v5.4s\n"
-    "ldr x19, [%[inptrs], 136]\n"
-    "fmla v16.4s, v28.4s, v6.4s\n"
-    "ldr s26, [x7, x27]\n"
-    "fmla v3.4s, v27.4s, v10.4s\n"
-    "ldr s23, [x22, x27]\n"
-    "fmla v19.4s, v22.4s, v4.4s\n"
-    "ldr x22, [%[inptrs], 264]\n"
-    "fmla v0.4s, v22.4s, v7.4s\n"
-    "ldr x7, [%[inptrs], 48]\n"
-    "fmla v1.4s, v22.4s, v5.4s\n"
-    "fmla v16.4s, v22.4s, v8.4s\n"
-    "fmla v15.4s, v22.4s, v6.4s\n"
-    "fmla v21.4s, v22.4s, v9.4s\n"
-    "str s19, [x25, x28]\n"
-    "mov v24.16b, v13.16b\n"
-    "mov v20.16b, v13.16b\n"
-    "ldr s27, [x21, x27]\n"
-    "fmla v17.4s, v30.4s, v4.4s\n"
-    "ldr x21, [%[inptrs], 224]\n"
-    "fmla v24.4s, v25.4s, v12.4s\n"
-    "ldr s28, [x20, x27]\n"
-    "fmla v1.4s, v30.4s, v7.4s\n"
-    "ldr x20, [%[inptrs], 184]\n"
-    "fmla v2.4s, v30.4s, v5.4s\n"
-    "ldr x25, [%[outptrs], 72]\n"
-    "str s17, [x24, x28]\n"
-    "fmla v16.4s, v30.4s, v10.4s\n"
-    "fmla v15.4s, v30.4s, v8.4s\n"
-    "ldr s22, [x19, x27]\n"
-    "fmla v18.4s, v30.4s, v6.4s\n"
-    "ldr x24, [%[outptrs], 48]\n"
-    "fmla v21.4s, v30.4s, v11.4s\n"
-    "ldr x19, [%[inptrs], 96]\n"
-    "fmla v24.4s, v30.4s, v9.4s\n"
-    "fmla v20.4s, v30.4s, v12.4s\n"
-    "fmla v14.4s, v31.4s, v4.4s\n"
-    "ldr s30, [x22, x27]\n"
-    "fmla v2.4s, v31.4s, v7.4s\n"
-    "ldr s19, [x21, x27]\n"
-    "fmla v3.4s, v31.4s, v5.4s\n"
-    "ldr x22, [%[inptrs], 272]\n"
-    "fmla v15.4s, v31.4s, v10.4s\n"
-    "ldr x21, [%[inptrs], 232]\n"
-    "str s14, [x23, x28]\n"
-    "fmla v18.4s, v31.4s, v8.4s\n"
-    "fmla v24.4s, v31.4s, v11.4s\n"
-    "ldr s31, [x20, x27]\n"
-    "fmla v3.4s, v26.4s, v7.4s\n"
-    "ldr s17, [x22, x27]\n"
-    "fmla v0.4s, v23.4s, v4.4s\n"
-    "ldr x22, [%[inptrs], 280]\n"
-    "fmla v18.4s, v26.4s, v10.4s\n"
-    "ldr s14, [x21, x27]\n"
-    "fmla v16.4s, v23.4s, v5.4s\n"
-    "ldr x23, [%[outptrs], 24]\n"
-    "fmla v21.4s, v23.4s, v6.4s\n"
-    "ldr s26, [x22, x27]\n"
-    "str s0, [x26, x28]\n"
-    "fmla v1.4s, v27.4s, v4.4s\n"
-    "fmla v15.4s, v27.4s, v5.4s\n"
-    "ldr s13, [%[wbptr]]\n"
-    "fmla v16.4s, v27.4s, v7.4s\n"
-    "ldr x26, [%[outptrs], 104]\n"
-    "fmla v21.4s, v27.4s, v8.4s\n"
-    "add x27, x27, #4\n"
-    "str s1, [x25, x28]\n"
-    "fmla v24.4s, v27.4s, v6.4s\n"
-    "fmla v20.4s, v27.4s, v9.4s\n"
-    "ldr s12, [%[wbptr], #4]\n"
-    "fmla v2.4s, v28.4s, v4.4s\n"
-    "ldr s29, [x17, x27]\n"
-    "fmla v15.4s, v28.4s, v7.4s\n"
-    "ldr s27, [x7, x27]\n"
-    "fmla v18.4s, v28.4s, v5.4s\n"
-    "ldr x25, [%[outptrs], 80]\n"
-    "fmla v21.4s, v28.4s, v10.4s\n"
-    "ldr x17, [%[inptrs], 8]\n"
-    "str s2, [x24, x28]\n"
-    "fmla v24.4s, v28.4s, v8.4s\n"
-    "fmla v20.4s, v28.4s, v11.4s\n"
-    "ldr s9, [%[wbptr], #16]\n"
-    "fmla v3.4s, v22.4s, v4.4s\n"
-    "ldr s28, [x17, x27]\n"
-    "fmla v18.4s, v22.4s, v7.4s\n"
-    "ldr s25, [x19, x27]\n"
-    "fmla v24.4s, v22.4s, v10.4s\n"
-    "ldr x24, [%[outptrs], 56]\n"
-    "fmla v16.4s, v30.4s, v4.4s\n"
-    "ldr s11, [%[wbptr], #8]\n"
-    "str s3, [x23, x28]\n"
-    "fmla v21.4s, v30.4s, v5.4s\n"
-    "fmla v20.4s, v30.4s, v6.4s\n"
-    "ldr x7, [%[inptrs], 56]\n"
-    "fmla v15.4s, v19.4s, v4.4s\n"
-    "ldr x17, [%[inptrs], 16]\n"
-    "str s16, [x26, x28]\n"
-    "fmla v24.4s, v19.4s, v5.4s\n"
-    "fmla v21.4s, v19.4s, v7.4s\n"
-    "ldr s16, [x7, x27]\n"
-    "fmla v20.4s, v19.4s, v8.4s\n"
-    "ldr s6, [%[wbptr], #28]\n"
-    "str s15, [x25, x28]\n"
-    "fmla v18.4s, v31.4s, v4.4s\n"
-    "fmla v24.4s, v31.4s, v7.4s\n"
-    "ldr s15, [x17, x27]\n"
-    "fmla v21.4s, v17.4s, v4.4s\n"
-    "ldr x25, [%[outptrs], 88]\n"
-    "fmla v20.4s, v31.4s, v10.4s\n"
-    "ldr s8, [%[wbptr], #20]\n"
-    "str s18, [x24, x28]\n"
-    "mov v18.16b, v13.16b\n"
-    "fmla v24.4s, v14.4s, v4.4s\n"
-    "ldr x26, [%[outptrs], 112]\n"
-    "mov v22.16b, v13.16b\n"
-    "ldr x20, [%[inptrs], 144]\n"
-    "str s21, [x26, x28]\n"
-    "fmla v20.4s, v17.4s, v5.4s\n"
-    "mov v23.16b, v13.16b\n"
-    "ldr s10, [%[wbptr], #12]\n"
-    "str s24, [x25, x28]\n"
-    "mov v19.16b, v13.16b\n"
-    "mov v17.16b, v13.16b\n"
-    "ldr s21, [x20, x27]\n"
-    "fmla v20.4s, v14.4s, v7.4s\n"
-    "ldr s5, [%[wbptr], #32]\n"
-    "mov v14.16b, v13.16b\n"
-    "ldr x26, [%[outptrs], 120]\n"
-    "mov v0.16b, v13.16b\n"
-    "ldr x19, [%[inptrs], 104]\n"
-    "mov v1.16b, v13.16b\n"
-    "mov v2.16b, v13.16b\n"
-    "fmla v20.4s, v26.4s, v4.4s\n"
-    "ldr s7, [%[wbptr], #24]\n"
-    "fmla v18.4s, v29.4s, v12.4s\n"
-    "ldr s29, [x19, x27]\n"
-    "str s20, [x26, x28]\n"
-    "ldr s4, [%[wbptr], #36]\n"
-    "add x28, x28, #4\n"
-    "bne 5b\n"
-    "6:\n"
-    "mov v3.16b, v13.16b\n"
-    "ldr x7, [%[inptrs], 64]\n"
-    "fmla v18.4s, v27.4s, v9.4s\n"
-    "ldr x17, [%[inptrs], 24]\n"
-    "fmla v22.4s, v27.4s, v12.4s\n"
-    "ldr s30, [x7, x27]\n"
-    "fmla v23.4s, v28.4s, v12.4s\n"
-    "ldr x21, [%[inptrs], 192]\n"
-    "fmla v19.4s, v25.4s, v12.4s\n"
-    "ldr x20, [%[inptrs], 152]\n"
-    "fmla v18.4s, v28.4s, v11.4s\n"
-    "ldr s24, [x17, x27]\n"
-    "fmla v22.4s, v25.4s, v9.4s\n"
-    "ldr x19, [%[inptrs], 112]\n"
-    "fmla v23.4s, v16.4s, v9.4s\n"
-    "ldr x7, [%[inptrs], 72]\n"
-    "fmla v17.4s, v16.4s, v12.4s\n"
-    "ldr x17, [%[inptrs], 32]\n"
-    "fmla v18.4s, v25.4s, v6.4s\n"
-    "ldr s31, [x21, x27]\n"
-    "fmla v22.4s, v16.4s, v11.4s\n"
-    "ldr x22, [%[inptrs], 240]\n"
-    "fmla v23.4s, v15.4s, v11.4s\n"
-    "ldr x21, [%[inptrs], 200]\n"
-    "fmla v14.4s, v15.4s, v12.4s\n"
-    "ldr x23, [%[outptrs], 0]\n"
-    "fmla v18.4s, v16.4s, v8.4s\n"
-    "ldr s25, [x20, x27]\n"
-    "fmla v22.4s, v21.4s, v6.4s\n"
-    "ldr x20, [%[inptrs], 160]\n"
-    "fmla v19.4s, v21.4s, v9.4s\n"
-    "ldr x24, [%[outptrs], 32]\n"
-    "fmla v0.4s, v21.4s, v12.4s\n"
-    "ldr s21, [x19, x27]\n"
-    "fmla v18.4s, v15.4s, v10.4s\n"
-    "ldr s20, [x7, x27]\n"
-    "fmla v22.4s, v29.4s, v8.4s\n"
-    "ldr x19, [%[inptrs], 120]\n"
-    "fmla v23.4s, v29.4s, v6.4s\n"
-    "ldr x7, [%[inptrs], 80]\n"
-    "fmla v19.4s, v29.4s, v11.4s\n"
-    "ldr x25, [%[outptrs], 64]\n"
-    "fmla v18.4s, v29.4s, v5.4s\n"
-    "ldr x26, [%[outptrs], 96]\n"
-    "fmla v17.4s, v29.4s, v9.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v1.4s, v29.4s, v12.4s\n"
-    "ldr s26, [x17, x27]\n"
-    "fmla v22.4s, v30.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v18.4s, v30.4s, v7.4s\n"
-    "ldr x17, [%[inptrs], 40]\n"
-    "fmla v23.4s, v30.4s, v8.4s\n"
-    "fmla v17.4s, v30.4s, v11.4s\n"
-    "fmla v14.4s, v30.4s, v9.4s\n"
-    "fmla v2.4s, v30.4s, v12.4s\n"
-    "mov v16.16b, v13.16b\n"
-    "fmla v3.4s, v24.4s, v12.4s\n"
-    "fmla v19.4s, v31.4s, v6.4s\n"
-    "fmla v0.4s, v31.4s, v9.4s\n"
-    "mov v15.16b, v13.16b\n"
-    "fmla v23.4s, v24.4s, v10.4s\n"
-    "fmla v14.4s, v24.4s, v11.4s\n"
-    "ldr s27, [x22, x27]\n"
-    "fmla v22.4s, v25.4s, v5.4s\n"
-    "ldr x22, [%[inptrs], 248]\n"
-    "fmla v19.4s, v25.4s, v8.4s\n"
-    "fmla v17.4s, v25.4s, v6.4s\n"
-    "fmla v0.4s, v25.4s, v11.4s\n"
-    "fmla v1.4s, v25.4s, v9.4s\n"
-    "fmla v16.4s, v25.4s, v12.4s\n"
-    "ldr s30, [x21, x27]\n"
-    "fmla v18.4s, v21.4s, v4.4s\n"
-    "ldr x21, [%[inptrs], 208]\n"
-    "fmla v22.4s, v21.4s, v7.4s\n"
-    "fmla v23.4s, v21.4s, v5.4s\n"
-    "fmla v19.4s, v21.4s, v10.4s\n"
-    "fmla v17.4s, v21.4s, v8.4s\n"
-    "fmla v14.4s, v21.4s, v6.4s\n"
-    "fmla v1.4s, v21.4s, v11.4s\n"
-    "str s18, [x23, x28]\n"
-    "mov v18.16b, v13.16b\n"
-    "fmla v2.4s, v21.4s, v9.4s\n"
-    "ldr x23, [%[outptrs], 8]\n"
-    "fmla v15.4s, v21.4s, v12.4s\n"
-    "ldr s24, [x20, x27]\n"
-    "fmla v23.4s, v20.4s, v7.4s\n"
-    "ldr x20, [%[inptrs], 168]\n"
-    "fmla v17.4s, v20.4s, v10.4s\n"
-    "fmla v14.4s, v20.4s, v8.4s\n"
-    "fmla v2.4s, v20.4s, v11.4s\n"
-    "fmla v3.4s, v20.4s, v9.4s\n"
-    "fmla v18.4s, v20.4s, v12.4s\n"
-    "ldr s25, [x19, x27]\n"
-    "fmla v0.4s, v27.4s, v6.4s\n"
-    "ldr s29, [x7, x27]\n"
-    "fmla v14.4s, v26.4s, v10.4s\n"
-    "ldr x19, [%[inptrs], 128]\n"
-    "fmla v3.4s, v26.4s, v11.4s\n"
-    "ldr s27, [x17, x27]\n"
-    "fmla v19.4s, v30.4s, v5.4s\n"
-    "ldr x7, [%[inptrs], 88]\n"
-    "fmla v0.4s, v30.4s, v8.4s\n"
-    "fmla v1.4s, v30.4s, v6.4s\n"
-    "fmla v16.4s, v30.4s, v9.4s\n"
-    "ldr s28, [x22, x27]\n"
-    "fmla v22.4s, v24.4s, v4.4s\n"
-    "ldr x22, [%[inptrs], 256]\n"
-    "fmla v19.4s, v24.4s, v7.4s\n"
-    "fmla v17.4s, v24.4s, v5.4s\n"
-    "fmla v0.4s, v24.4s, v10.4s\n"
-    "fmla v1.4s, v24.4s, v8.4s\n"
-    "fmla v2.4s, v24.4s, v6.4s\n"
-    "fmla v16.4s, v24.4s, v11.4s\n"
-    "str s22, [x24, x28]\n"
-    "mov v21.16b, v13.16b\n"
-    "fmla v15.4s, v24.4s, v9.4s\n"
-    "ldr x24, [%[outptrs], 40]\n"
-    "fmla v23.4s, v25.4s, v4.4s\n"
-    "fmla v17.4s, v25.4s, v7.4s\n"
-    "fmla v21.4s, v24.4s, v12.4s\n"
-    "ldr s22, [x21, x27]\n"
-    "fmla v14.4s, v25.4s, v5.4s\n"
-    "ldr x21, [%[inptrs], 216]\n"
-    "fmla v1.4s, v25.4s, v10.4s\n"
-    "fmla v2.4s, v25.4s, v8.4s\n"
-    "str s23, [x23, x28]\n"
-    "mov v24.16b, v13.16b\n"
-    "mov v20.16b, v13.16b\n"
-    "ldr x23, [%[outptrs], 16]\n"
-    "fmla v3.4s, v25.4s, v6.4s\n"
-    "fmla v15.4s, v25.4s, v11.4s\n"
-    "fmla v18.4s, v25.4s, v9.4s\n"
-    "fmla v24.4s, v25.4s, v12.4s\n"
-    "fmla v14.4s, v29.4s, v7.4s\n"
-    "ldr s30, [x20, x27]\n"
-    "fmla v2.4s, v29.4s, v10.4s\n"
-    "ldr x20, [%[inptrs], 176]\n"
-    "fmla v3.4s, v29.4s, v8.4s\n"
-    "fmla v0.4s, v28.4s, v5.4s\n"
-    "fmla v18.4s, v29.4s, v11.4s\n"
-    "ldr s31, [x19, x27]\n"
-    "fmla v16.4s, v28.4s, v6.4s\n"
-    "ldr s26, [x7, x27]\n"
-    "fmla v19.4s, v22.4s, v4.4s\n"
-    "ldr x19, [%[inptrs], 136]\n"
-    "fmla v3.4s, v27.4s, v10.4s\n"
-    "ldr s23, [x22, x27]\n"
-    "fmla v0.4s, v22.4s, v7.4s\n"
-    "ldr x22, [%[inptrs], 264]\n"
-    "fmla v1.4s, v22.4s, v5.4s\n"
-    "fmla v16.4s, v22.4s, v8.4s\n"
-    "str s19, [x25, x28]\n"
-    "fmla v15.4s, v22.4s, v6.4s\n"
-    "fmla v21.4s, v22.4s, v9.4s\n"
-    "ldr s27, [x21, x27]\n"
-    "fmla v17.4s, v30.4s, v4.4s\n"
-    "ldr s28, [x20, x27]\n"
-    "fmla v1.4s, v30.4s, v7.4s\n"
-    "ldr x21, [%[inptrs], 224]\n"
-    "fmla v2.4s, v30.4s, v5.4s\n"
-    "ldr x20, [%[inptrs], 184]\n"
-    "fmla v16.4s, v30.4s, v10.4s\n"
-    "ldr x25, [%[outptrs], 72]\n"
-    "str s17, [x24, x28]\n"
-    "fmla v15.4s, v30.4s, v8.4s\n"
-    "fmla v18.4s, v30.4s, v6.4s\n"
-    "ldr s22, [x19, x27]\n"
-    "fmla v21.4s, v30.4s, v11.4s\n"
-    "ldr x24, [%[outptrs], 48]\n"
-    "fmla v24.4s, v30.4s, v9.4s\n"
-    "fmla v20.4s, v30.4s, v12.4s\n"
-    "fmla v14.4s, v31.4s, v4.4s\n"
-    "ldr s30, [x22, x27]\n"
-    "fmla v2.4s, v31.4s, v7.4s\n"
-    "ldr s19, [x21, x27]\n"
-    "fmla v3.4s, v31.4s, v5.4s\n"
-    "ldr x22, [%[inptrs], 272]\n"
-    "fmla v15.4s, v31.4s, v10.4s\n"
-    "ldr x21, [%[inptrs], 232]\n"
-    "str s14, [x23, x28]\n"
-    "fmla v18.4s, v31.4s, v8.4s\n"
-    "fmla v24.4s, v31.4s, v11.4s\n"
-    "ldr s31, [x20, x27]\n"
-    "fmla v3.4s, v26.4s, v7.4s\n"
-    "ldr s17, [x22, x27]\n"
-    "fmla v0.4s, v23.4s, v4.4s\n"
-    "ldr x22, [%[inptrs], 280]\n"
-    "fmla v18.4s, v26.4s, v10.4s\n"
-    "ldr s14, [x21, x27]\n"
-    "fmla v16.4s, v23.4s, v5.4s\n"
-    "ldr x23, [%[outptrs], 24]\n"
-    "fmla v21.4s, v23.4s, v6.4s\n"
-    "ldr s26, [x22, x27]\n"
-    "str s0, [x26, x28]\n"
-    "fmla v1.4s, v27.4s, v4.4s\n"
-    "fmla v15.4s, v27.4s, v5.4s\n"
-    "ldr x26, [%[outptrs], 104]\n"
-    "fmla v16.4s, v27.4s, v7.4s\n"
-    "add x27, x27, #4\n"
-    "fmla v21.4s, v27.4s, v8.4s\n"
-    "fmla v24.4s, v27.4s, v6.4s\n"
-    "str s1, [x25, x28]\n"
-    "fmla v20.4s, v27.4s, v9.4s\n"
-    "fmla v2.4s, v28.4s, v4.4s\n"
-    "ldr x25, [%[outptrs], 80]\n"
-    "fmla v15.4s, v28.4s, v7.4s\n"
-    "fmla v18.4s, v28.4s, v5.4s\n"
-    "fmla v21.4s, v28.4s, v10.4s\n"
-    "fmla v24.4s, v28.4s, v8.4s\n"
-    "fmla v20.4s, v28.4s, v11.4s\n"
-    "fmla v3.4s, v22.4s, v4.4s\n"
-    "str s2, [x24, x28]\n"
-    "fmla v16.4s, v30.4s, v4.4s\n"
-    "fmla v18.4s, v22.4s, v7.4s\n"
-    "ldr x24, [%[outptrs], 56]\n"
-    "fmla v24.4s, v22.4s, v10.4s\n"
-    "fmla v21.4s, v30.4s, v5.4s\n"
-    "str s3, [x23, x28]\n"
-    "fmla v20.4s, v30.4s, v6.4s\n"
-    "str s16, [x26, x28]\n"
-    "fmla v15.4s, v19.4s, v4.4s\n"
-    "fmla v18.4s, v31.4s, v4.4s\n"
-    "ldr x26, [%[outptrs], 112]\n"
-    "fmla v21.4s, v19.4s, v7.4s\n"
-    "fmla v24.4s, v19.4s, v5.4s\n"
-    "fmla v20.4s, v19.4s, v8.4s\n"
-    "str s15, [x25, x28]\n"
-    "str s18, [x24, x28]\n"
-    "ldr x25, [%[outptrs], 88]\n"
-    "fmla v24.4s, v31.4s, v7.4s\n"
-    "fmla v21.4s, v17.4s, v4.4s\n"
-    "fmla v20.4s, v31.4s, v10.4s\n"
-    "str s21, [x26, x28]\n"
-    "fmla v20.4s, v17.4s, v5.4s\n"
-    "ldr x26, [%[outptrs], 120]\n"
-    "fmla v24.4s, v14.4s, v4.4s\n"
-    "fmla v20.4s, v14.4s, v7.4s\n"
-    "str s24, [x25, x28]\n"
-    "fmla v20.4s, v26.4s, v4.4s\n"
-    "str s20, [x26, x28]\n"
-    "add x28, x28, #4\n"
-    "7:\n"
-    : [wbptr] "+r" (weight_bias_ptr)
-    : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
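Read as scalar code, the unrolled kernel above computes a 3x3 depthwise convolution of a 6x6 input patch into a 4x4 output patch, seeded with the bias; the ReLU variant below produces the same values, addressing its input and output through row/column strides rather than pointer arrays, and clamps each result at zero (the fmax against a zeroed register). A reference sketch under two assumptions: the per-channel [bias, w00 .. w22] packing matches what the kernel's scalar tail reads (the vector path consumes the same values in groups of four channels), and the hypothetical relu flag stands in for the ActivationFunction template parameter:

#include <algorithm>

static void execute_tile_reference(
  int n_channels,
  const void *weight_bias_ptr,  // assumed per channel: bias then w00 .. w22
  const float *inptrs[6][6],    // one pointer per input point, indexed by channel
  float *outptrs[4][4],         // one pointer per output point, indexed by channel
  bool relu)                    // hypothetical stand-in for the template parameter
{
  const float *wb = static_cast<const float *>(weight_bias_ptr);
  for (int c = 0; c < n_channels; c++, wb += 10)
  {
    const float bias = wb[0];
    const float *w = wb + 1;
    for (int oi = 0; oi < 4; oi++)
    {
      for (int oj = 0; oj < 4; oj++)
      {
        // Each output point is the bias plus the 3x3 weighted window of
        // the input patch, mirroring the kernel's fmla accumulation chains.
        float acc = bias;
        for (int ki = 0; ki < 3; ki++)
          for (int kj = 0; kj < 3; kj++)
            acc += inptrs[oi + ki][oj + kj][c] * w[3 * ki + kj];
        outptrs[oi][oj][c] = relu ? std::max(acc, 0.0f) : acc;
      }
    }
  }
}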
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x9, %[inptr0], %[input_row_stride]\n"
-    "add x28, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x16, %[outptr0], %[output_row_stride]\n"
-    "add x24, x9, %[input_row_stride]\n"
-    "add x25, x28, #64\n"
-    "add x23, x28, %[input_col_stride1]\n"
-    "add x26, x24, %[input_row_stride]\n"
-    "add x11, x23, #64\n"
-    "add x12, x23, %[input_col_stride1]\n"
-    "add x10, x26, %[input_row_stride]\n"
-    "add x13, x12, #64\n"
-    "add x14, x12, %[input_col_stride1]\n"
-    "add x27, x10, %[input_row_stride]\n"
-    "add x15, x14, #64\n"
-    "add x17, x16, %[output_row_stride]\n"
-    "add x7, x17, %[output_row_stride]\n"
-    "add x19, %[output_col_stride1], %[output_col_stride1]\n"
-    "and x21, %[n_channels], #3\n"
-    "add x20, x19, %[output_col_stride1]\n"
-    "lsr x22, %[n_channels], #2\n"
-    "cbz x22, 4f\n"
-    "1:\n"
-    "ldr q21, [%[wbptr]]\n"
-    "subs x22, x22, #1\n"
-    "mov v7.16b, v21.16b\n"
-    "ldr q20, [%[wbptr], #16]\n"
-    "mov v3.16b, v21.16b\n"
-    "ldr q14, [%[wbptr], #32]\n"
-    "mov v6.16b, v21.16b\n"
-    "ldr q13, [%[wbptr], #48]\n"
-    "mov v15.16b, v21.16b\n"
-    "ldr q17, [%[wbptr], #64]\n"
-    "mov v2.16b, v21.16b\n"
-    "ldr q12, [%[wbptr], #80]\n"
-    "mov v5.16b, v21.16b\n"
-    "ldr q11, [%[wbptr], #96]\n"
-    "mov v0.16b, v21.16b\n"
-    "ldr q10, [%[wbptr], #112]\n"
-    "mov v16.16b, v21.16b\n"
-    "ldr q9, [%[wbptr], #128]\n"
-    "mov v1.16b, v21.16b\n"
-    "ldr q8, [%[wbptr], #144]\n"
-    "mov v4.16b, v21.16b\n"
-    "ldr q22, [%[inptr0]]\n"
-    "fmla v7.4s, v22.4s, v20.4s\n"
-    "ldr q19, [x9]\n"
-    "fmla v3.4s, v19.4s, v20.4s\n"
-    "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v6.4s, v23.4s, v20.4s\n"
-    "ldr q18, [x24]\n"
-    "fmla v7.4s, v19.4s, v17.4s\n"
-    "ldr q27, [x9, %[input_col_stride1]]\n"
-    "fmla v3.4s, v18.4s, v17.4s\n"
-    "ldr q28, [%[inptr0], x28]\n"
-    "fmla v15.4s, v18.4s, v20.4s\n"
-    "ldr q25, [x26]\n"
-    "fmla v7.4s, v23.4s, v14.4s\n"
-    "ldr q22, [x24, %[input_col_stride1]]\n"
-    "fmla v3.4s, v27.4s, v14.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "prfm pldl1keep, [x9, #64]\n"
-    "prfm pldl1keep, [%[inptr0], x8]\n"
-    "fmla v7.4s, v18.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, #64]\n"
-    "prfm pldl1keep, [x9, x8]\n"
-    "prfm pldl1keep, [%[inptr0], x25]\n"
-    "prfm pldl1keep, [x26, #64]\n"
-    "prfm pldl1keep, [x24, x8]\n"
-    "fmla v7.4s, v27.4s, v12.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "mov v18.16b, v21.16b\n"
-    "ldr q23, [x9, x28]\n"
-    "mov v19.16b, v21.16b\n"
-    "prfm pldl1keep, [x9, x25]\n"
-    "fmla v6.4s, v27.4s, v17.4s\n"
-    "prfm pldl1keep, [%[inptr0], x11]\n"
-    "fmla v2.4s, v27.4s, v20.4s\n"
-    "ldr q24, [%[inptr0], x23]\n"
-    "fmla v7.4s, v28.4s, v13.4s\n"
-    "prfm pldl1keep, [x10, #64]\n"
-    "fmla v6.4s, v28.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x8]\n"
-    "fmla v5.4s, v28.4s, v20.4s\n"
-    "ldr q26, [x10]\n"
-    "fmla v3.4s, v25.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, x25]\n"
-    "fmla v15.4s, v25.4s, v17.4s\n"
-    "prfm pldl1keep, [x9, x11]\n"
-    "fmla v0.4s, v25.4s, v20.4s\n"
-    "ldr q25, [x26, %[input_col_stride1]]\n"
-    "fmla v7.4s, v22.4s, v9.4s\n"
-    "prfm pldl1keep, [%[inptr0], x13]\n"
-    "fmla v3.4s, v22.4s, v12.4s\n"
-    "prfm pldl1keep, [x27, #64]\n"
-    "fmla v6.4s, v22.4s, v10.4s\n"
-    "prfm pldl1keep, [x10, x8]\n"
-    "fmla v15.4s, v22.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x25]\n"
-    "fmla v2.4s, v22.4s, v17.4s\n"
-    "prfm pldl1keep, [x24, x11]\n"
-    "fmla v16.4s, v22.4s, v20.4s\n"
-    "ldr q22, [x24, x28]\n"
-    "fmla v7.4s, v23.4s, v11.4s\n"
-    "prfm pldl1keep, [x9, x13]\n"
-    "fmla v3.4s, v23.4s, v13.4s\n"
-    "prfm pldl1keep, [%[inptr0], x15]\n"
-    "fmla v6.4s, v23.4s, v12.4s\n"
-    "prfm pldl1keep, [x27, x8]\n"
-    "fmla v2.4s, v23.4s, v14.4s\n"
-    "prfm pldl1keep, [x10, x25]\n"
-    "fmla v5.4s, v23.4s, v17.4s\n"
-    "prfm pldl1keep, [x26, x11]\n"
-    "fmla v1.4s, v23.4s, v20.4s\n"
-    "ldr q23, [x9, x23]\n"
-    "fmla v6.4s, v24.4s, v13.4s\n"
-    "prfm pldl1keep, [x24, x13]\n"
-    "fmla v5.4s, v24.4s, v14.4s\n"
-    "prfm pldl1keep, [x9, x15]\n"
-    "fmla v4.4s, v24.4s, v20.4s\n"
-    "ldr q24, [%[inptr0], x12]\n"
-    "fmla v15.4s, v26.4s, v10.4s\n"
-    "prfm pldl1keep, [x27, x25]\n"
-    "fmla v0.4s, v26.4s, v17.4s\n"
-    "ldr q29, [x27]\n"
-    "fmla v3.4s, v25.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x11]\n"
-    "fmla v15.4s, v25.4s, v12.4s\n"
-    "prfm pldl1keep, [x26, x13]\n"
-    "fmla v2.4s, v25.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, x15]\n"
-    "fmla v0.4s, v25.4s, v14.4s\n"
-    "prfm pldl1keep, [x27, x11]\n"
-    "fmla v16.4s, v25.4s, v17.4s\n"
-    "prfm pldl1keep, [x10, x13]\n"
-    "fmla v18.4s, v25.4s, v20.4s\n"
-    "ldr q26, [x10, %[input_col_stride1]]\n"
-    "fmla v7.4s, v22.4s, v8.4s\n"
-    "prfm pldl1keep, [x26, x15]\n"
-    "fmla v3.4s, v22.4s, v11.4s\n"
-    "prfm pldl1keep, [x27, x13]\n"
-    "fmla v6.4s, v22.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x15]\n"
-    "fmla v15.4s, v22.4s, v13.4s\n"
-    "prfm pldl1keep, [x27, x15]\n"
-    "fmla v2.4s, v22.4s, v12.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v5.4s, v22.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v16.4s, v22.4s, v14.4s\n"
-    "subs x22, x22, #1\n"
-    "fmla v1.4s, v22.4s, v17.4s\n"
-    "fmla v19.4s, v22.4s, v20.4s\n"
-    "mov v22.16b, v21.16b\n"
-    "fmla v6.4s, v23.4s, v11.4s\n"
-    "fmla v2.4s, v23.4s, v13.4s\n"
-    "fmla v5.4s, v23.4s, v12.4s\n"
-    "fmla v1.4s, v23.4s, v14.4s\n"
-    "fmla v4.4s, v23.4s, v17.4s\n"
-    "fmla v22.4s, v23.4s, v20.4s\n"
-    "ldr q27, [x26, x28]\n"
-    "fmla v5.4s, v24.4s, v13.4s\n"
-    "fmla v0.4s, v29.4s, v10.4s\n"
-    "mov v23.16b, v21.16b\n"
-    "fmla v4.4s, v24.4s, v14.4s\n"
-    "mov v25.16b, v21.16b\n"
-    "mov v24.16b, v21.16b\n"
-    "fmla v15.4s, v26.4s, v9.4s\n"
-    "fmla v0.4s, v26.4s, v12.4s\n"
-    "fmla v16.4s, v26.4s, v10.4s\n"
-    "fmla v18.4s, v26.4s, v17.4s\n"
-    "fmla v3.4s, v27.4s, v8.4s\n"
-    "ldr q29, [x24, x23]\n"
-    "fmla v15.4s, v27.4s, v11.4s\n"
-    "fmla v2.4s, v27.4s, v9.4s\n"
-    "fmla v0.4s, v27.4s, v13.4s\n"
-    "fmla v16.4s, v27.4s, v12.4s\n"
-    "fmla v1.4s, v27.4s, v10.4s\n"
-    "fmla v18.4s, v27.4s, v14.4s\n"
-    "fmla v19.4s, v27.4s, v17.4s\n"
-    "fmla v23.4s, v27.4s, v20.4s\n"
-    "fmla v6.4s, v29.4s, v8.4s\n"
-    "ldr q28, [x9, x12]\n"
-    "fmla v2.4s, v29.4s, v11.4s\n"
-    "fmla v5.4s, v29.4s, v9.4s\n"
-    "fmla v16.4s, v29.4s, v13.4s\n"
-    "fmla v1.4s, v29.4s, v12.4s\n"
-    "fmla v4.4s, v29.4s, v10.4s\n"
-    "fmla v19.4s, v29.4s, v14.4s\n"
-    "fmla v22.4s, v29.4s, v17.4s\n"
-    "fmla v25.4s, v29.4s, v20.4s\n"
-    "fmla v5.4s, v28.4s, v11.4s\n"
-    "ldr q21, [%[inptr0], x14]\n"
-    "fmla v1.4s, v28.4s, v13.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v4.4s, v28.4s, v12.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "fmla v22.4s, v28.4s, v14.4s\n"
-    "ldr q26, [x27, %[input_col_stride1]]\n"
-    "fmla v0.4s, v26.4s, v9.4s\n"
-    "prfm pldl1keep, [%[inptr0], x8]\n"
-    "fmla v4.4s, v21.4s, v13.4s\n"
-    "ldr q21, [x10, x28]\n"
-    "fmla v18.4s, v26.4s, v10.4s\n"
-    "ldr q29, [x26, x23]\n"
-    "fmla v15.4s, v21.4s, v8.4s\n"
-    "prfm pldl1keep, [%[inptr0], x25]\n"
-    "fmla v0.4s, v21.4s, v11.4s\n"
-    "fmla v16.4s, v21.4s, v9.4s\n"
-    "fmla v18.4s, v21.4s, v12.4s\n"
-    "fmla v19.4s, v21.4s, v10.4s\n"
-    "fmla v23.4s, v21.4s, v17.4s\n"
-    "ldr q21, [x24, x12]\n"
-    "fmla v2.4s, v29.4s, v8.4s\n"
-    "fmla v16.4s, v29.4s, v11.4s\n"
-    "fmla v1.4s, v29.4s, v9.4s\n"
-    "fmla v18.4s, v29.4s, v13.4s\n"
-    "fmla v19.4s, v29.4s, v12.4s\n"
-    "fmla v22.4s, v29.4s, v10.4s\n"
-    "fmla v23.4s, v29.4s, v14.4s\n"
-    "fmla v25.4s, v29.4s, v17.4s\n"
-    "fmla v24.4s, v29.4s, v20.4s\n"
-    "ldr q28, [x9, x14]\n"
-    "fmla v5.4s, v21.4s, v8.4s\n"
-    "ldr q27, [x27, x28]\n"
-    "fmla v1.4s, v21.4s, v11.4s\n"
-    "add x9, x9, #16\n"
-    "fmla v4.4s, v21.4s, v9.4s\n"
-    "prfm pldl1keep, [x9, #64]\n"
-    "fmla v19.4s, v21.4s, v13.4s\n"
-    "prfm pldl1keep, [x9, x8]\n"
-    "fmla v22.4s, v21.4s, v12.4s\n"
-    "fmla v25.4s, v21.4s, v14.4s\n"
-    "fmla v4.4s, v28.4s, v11.4s\n"
-    "ldr q20, [x10, x23]\n"
-    "fmla v0.4s, v27.4s, v8.4s\n"
-    "fmla v18.4s, v27.4s, v9.4s\n"
-    "fmla v22.4s, v28.4s, v13.4s\n"
-    "ldr q26, [x26, x12]\n"
-    "fmla v23.4s, v27.4s, v10.4s\n"
-    "ldr q21, [x24, x14]\n"
-    "fmla v16.4s, v20.4s, v8.4s\n"
-    "add x24, x24, #16\n"
-    "fmla v18.4s, v20.4s, v11.4s\n"
-    "prfm pldl1keep, [x24, #64]\n"
-    "fmla v19.4s, v20.4s, v9.4s\n"
-    "prfm pldl1keep, [x24, x8]\n"
-    "fmla v23.4s, v20.4s, v12.4s\n"
-    "fmla v25.4s, v20.4s, v10.4s\n"
-    "fmla v24.4s, v20.4s, v17.4s\n"
-    "ldr q28, [x27, x23]\n"
-    "fmla v1.4s, v26.4s, v8.4s\n"
-    "ldr q20, [x10, x12]\n"
-    "fmla v19.4s, v26.4s, v11.4s\n"
-    "fmla v22.4s, v26.4s, v9.4s\n"
-    "fmla v23.4s, v26.4s, v13.4s\n"
-    "fmla v25.4s, v26.4s, v12.4s\n"
-    "fmla v24.4s, v26.4s, v14.4s\n"
-    "ldr q17, [x26, x14]\n"
-    "fmla v4.4s, v21.4s, v8.4s\n"
-    "ldr q26, [x27, x12]\n"
-    "fmla v22.4s, v21.4s, v11.4s\n"
-    "add x26, x26, #16\n"
-    "fmla v25.4s, v21.4s, v13.4s\n"
-    "ldr q27, [x10, x14]\n"
-    "fmla v18.4s, v28.4s, v8.4s\n"
-    "prfm pldl1keep, [x26, #64]\n"
-    "fmla v23.4s, v28.4s, v9.4s\n"
-    "add x10, x10, #16\n"
-    "fmla v24.4s, v28.4s, v10.4s\n"
-    "ldr q28, [x27, x14]\n"
-    "fmla v19.4s, v20.4s, v8.4s\n"
-    "ldr q21, [%[wbptr]]\n"
-    "fmla v23.4s, v20.4s, v11.4s\n"
-    "add x27, x27, #16\n"
-    "fmla v25.4s, v20.4s, v9.4s\n"
-    "fmla v24.4s, v20.4s, v12.4s\n"
-    "fmla v22.4s, v17.4s, v8.4s\n"
-    "ldr q20, [%[wbptr], #16]\n"
-    "fmla v23.4s, v26.4s, v8.4s\n"
-    "ldr q14, [%[wbptr], #32]\n"
-    "fmla v24.4s, v17.4s, v13.4s\n"
-    "movi v29.16b, #0\n"
-    "fmla v25.4s, v17.4s, v11.4s\n"
-    "ldr q17, [%[wbptr], #64]\n"
-    "fmax v7.4s, v7.4s, v29.4s\n"
-    "fmax v6.4s, v6.4s, v29.4s\n"
-    "fmla v24.4s, v26.4s, v9.4s\n"
-    "ldr q13, [%[wbptr], #48]\n"
-    "str q7, [%[outptr0]]\n"
-    "fmla v25.4s, v27.4s, v8.4s\n"
-    "str q6, [%[outptr0], %[output_col_stride1]]\n"
-    "fmax v5.4s, v5.4s, v29.4s\n"
-    "fmla v24.4s, v27.4s, v11.4s\n"
-    "ldr q12, [%[wbptr], #80]\n"
-    "str q5, [%[outptr0], x19]\n"
-    "fmax v4.4s, v4.4s, v29.4s\n"
-    "fmax v3.4s, v3.4s, v29.4s\n"
-    "ldr q10, [%[wbptr], #112]\n"
-    "str q4, [%[outptr0], x20]\n"
-    "fmla v24.4s, v28.4s, v8.4s\n"
-    "str q3, [x16]\n"
-    "fmax v2.4s, v2.4s, v29.4s\n"
-    "fmax v1.4s, v1.4s, v29.4s\n"
-    "ldr q11, [%[wbptr], #96]\n"
-    "str q2, [x16, %[output_col_stride1]]\n"
-    "fmax v22.4s, v22.4s, v29.4s\n"
-    "str q1, [x16, x19]\n"
-    "fmax v15.4s, v15.4s, v29.4s\n"
-    "str q22, [x16, x20]\n"
-    "fmax v16.4s, v16.4s, v29.4s\n"
-    "str q15, [x17]\n"
-    "fmax v19.4s, v19.4s, v29.4s\n"
-    "str q16, [x17, %[output_col_stride1]]\n"
-    "fmax v25.4s, v25.4s, v29.4s\n"
-    "str q19, [x17, x19]\n"
-    "fmax v0.4s, v0.4s, v29.4s\n"
-    "str q25, [x17, x20]\n"
-    "fmax v18.4s, v18.4s, v29.4s\n"
-    "str q0, [x7]\n"
-    "fmax v23.4s, v23.4s, v29.4s\n"
-    "str q18, [x7, %[output_col_stride1]]\n"
-    "fmax v24.4s, v24.4s, v29.4s\n"
-    "str q23, [x7, x19]\n"
-    "mov v7.16b, v21.16b\n"
-    "str q24, [x7, x20]\n"
-    "mov v3.16b, v21.16b\n"
-    "mov v6.16b, v21.16b\n"
-    "ldr q9, [%[wbptr], #128]\n"
-    "mov v15.16b, v21.16b\n"
-    "ldr q8, [%[wbptr], #144]\n"
-    "mov v2.16b, v21.16b\n"
-    "ldr q22, [%[inptr0]]\n"
-    "mov v5.16b, v21.16b\n"
-    "ldr q19, [x9]\n"
-    "mov v0.16b, v21.16b\n"
-    "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
-    "mov v16.16b, v21.16b\n"
-    "ldr q18, [x24]\n"
-    "mov v1.16b, v21.16b\n"
-    "ldr q27, [x9, %[input_col_stride1]]\n"
-    "mov v4.16b, v21.16b\n"
-    "ldr q28, [%[inptr0], x28]\n"
-    "fmla v7.4s, v22.4s, v20.4s\n"
-    "ldr q25, [x26]\n"
-    "fmla v3.4s, v19.4s, v20.4s\n"
-    "ldr q22, [x24, %[input_col_stride1]]\n"
-    "fmla v6.4s, v23.4s, v20.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmla v7.4s, v19.4s, v17.4s\n"
-    "add x16, x16, #16\n"
-    "fmla v3.4s, v18.4s, v17.4s\n"
-    "add x17, x17, #16\n"
-    "fmla v15.4s, v18.4s, v20.4s\n"
-    "add x7, x7, #16\n"
-    "fmla v7.4s, v23.4s, v14.4s\n"
-    "fmla v3.4s, v27.4s, v14.4s\n"
-    "fmla v7.4s, v18.4s, v10.4s\n"
-    "fmla v7.4s, v27.4s, v12.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "mov v18.16b, v21.16b\n"
-    "ldr q23, [x9, x28]\n"
-    "mov v19.16b, v21.16b\n"
-    "prfm pldl1keep, [x9, x25]\n"
-    "fmla v6.4s, v27.4s, v17.4s\n"
-    "prfm pldl1keep, [%[inptr0], x11]\n"
-    "fmla v2.4s, v27.4s, v20.4s\n"
-    "ldr q24, [%[inptr0], x23]\n"
-    "fmla v7.4s, v28.4s, v13.4s\n"
-    "prfm pldl1keep, [x10, #64]\n"
-    "fmla v6.4s, v28.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x8]\n"
-    "fmla v5.4s, v28.4s, v20.4s\n"
-    "ldr q26, [x10]\n"
-    "fmla v3.4s, v25.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, x25]\n"
-    "fmla v15.4s, v25.4s, v17.4s\n"
-    "prfm pldl1keep, [x9, x11]\n"
-    "fmla v0.4s, v25.4s, v20.4s\n"
-    "ldr q25, [x26, %[input_col_stride1]]\n"
-    "fmla v7.4s, v22.4s, v9.4s\n"
-    "prfm pldl1keep, [%[inptr0], x13]\n"
-    "fmla v3.4s, v22.4s, v12.4s\n"
-    "prfm pldl1keep, [x27, #64]\n"
-    "fmla v6.4s, v22.4s, v10.4s\n"
-    "prfm pldl1keep, [x10, x8]\n"
-    "fmla v15.4s, v22.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x25]\n"
-    "fmla v2.4s, v22.4s, v17.4s\n"
-    "prfm pldl1keep, [x24, x11]\n"
-    "fmla v16.4s, v22.4s, v20.4s\n"
-    "ldr q22, [x24, x28]\n"
-    "fmla v7.4s, v23.4s, v11.4s\n"
-    "prfm pldl1keep, [x9, x13]\n"
-    "fmla v3.4s, v23.4s, v13.4s\n"
-    "prfm pldl1keep, [%[inptr0], x15]\n"
-    "fmla v6.4s, v23.4s, v12.4s\n"
-    "prfm pldl1keep, [x27, x8]\n"
-    "fmla v2.4s, v23.4s, v14.4s\n"
-    "prfm pldl1keep, [x10, x25]\n"
-    "fmla v5.4s, v23.4s, v17.4s\n"
-    "prfm pldl1keep, [x26, x11]\n"
-    "fmla v1.4s, v23.4s, v20.4s\n"
-    "ldr q23, [x9, x23]\n"
-    "fmla v6.4s, v24.4s, v13.4s\n"
-    "prfm pldl1keep, [x24, x13]\n"
-    "fmla v5.4s, v24.4s, v14.4s\n"
-    "prfm pldl1keep, [x9, x15]\n"
-    "fmla v4.4s, v24.4s, v20.4s\n"
-    "ldr q24, [%[inptr0], x12]\n"
-    "fmla v15.4s, v26.4s, v10.4s\n"
-    "prfm pldl1keep, [x27, x25]\n"
-    "fmla v0.4s, v26.4s, v17.4s\n"
-    "ldr q29, [x27]\n"
-    "fmla v3.4s, v25.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x11]\n"
-    "fmla v15.4s, v25.4s, v12.4s\n"
-    "prfm pldl1keep, [x26, x13]\n"
-    "fmla v2.4s, v25.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, x15]\n"
-    "fmla v0.4s, v25.4s, v14.4s\n"
-    "prfm pldl1keep, [x27, x11]\n"
-    "fmla v16.4s, v25.4s, v17.4s\n"
-    "prfm pldl1keep, [x10, x13]\n"
-    "fmla v18.4s, v25.4s, v20.4s\n"
-    "ldr q26, [x10, %[input_col_stride1]]\n"
-    "fmla v7.4s, v22.4s, v8.4s\n"
-    "prfm pldl1keep, [x26, x15]\n"
-    "fmla v3.4s, v22.4s, v11.4s\n"
-    "prfm pldl1keep, [x27, x13]\n"
-    "fmla v6.4s, v22.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x15]\n"
-    "fmla v15.4s, v22.4s, v13.4s\n"
-    "prfm pldl1keep, [x27, x15]\n"
-    "fmla v2.4s, v22.4s, v12.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v5.4s, v22.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v16.4s, v22.4s, v14.4s\n"
-    "fmla v1.4s, v22.4s, v17.4s\n"
-    "fmla v19.4s, v22.4s, v20.4s\n"
-    "ldr q27, [x26, x28]\n"
-    "fmla v6.4s, v23.4s, v11.4s\n"
-    "fmla v2.4s, v23.4s, v13.4s\n"
-    "fmla v5.4s, v23.4s, v12.4s\n"
-    "fmla v1.4s, v23.4s, v14.4s\n"
-    "fmla v4.4s, v23.4s, v17.4s\n"
-    "fmla v0.4s, v29.4s, v10.4s\n"
-    "mov v22.16b, v21.16b\n"
-    "fmla v15.4s, v26.4s, v9.4s\n"
-    "fmla v5.4s, v24.4s, v13.4s\n"
-    "fmla v16.4s, v26.4s, v10.4s\n"
-    "fmla v22.4s, v23.4s, v20.4s\n"
-    "ldr q29, [x24, x23]\n"
-    "fmla v4.4s, v24.4s, v14.4s\n"
-    "ldr q28, [x9, x12]\n"
-    "fmla v0.4s, v26.4s, v12.4s\n"
-    "fmla v18.4s, v26.4s, v17.4s\n"
-    "mov v23.16b, v21.16b\n"
-    "fmla v3.4s, v27.4s, v8.4s\n"
-    "fmla v15.4s, v27.4s, v11.4s\n"
-    "fmla v2.4s, v27.4s, v9.4s\n"
-    "fmla v0.4s, v27.4s, v13.4s\n"
-    "fmla v16.4s, v27.4s, v12.4s\n"
-    "fmla v1.4s, v27.4s, v10.4s\n"
-    "fmla v18.4s, v27.4s, v14.4s\n"
-    "fmla v19.4s, v27.4s, v17.4s\n"
-    "fmla v23.4s, v27.4s, v20.4s\n"
-    "mov v25.16b, v21.16b\n"
-    "mov v24.16b, v21.16b\n"
-    "fmla v6.4s, v29.4s, v8.4s\n"
-    "fmla v2.4s, v29.4s, v11.4s\n"
-    "fmla v5.4s, v29.4s, v9.4s\n"
-    "fmla v16.4s, v29.4s, v13.4s\n"
-    "fmla v1.4s, v29.4s, v12.4s\n"
-    "fmla v4.4s, v29.4s, v10.4s\n"
-    "fmla v19.4s, v29.4s, v14.4s\n"
-    "fmla v22.4s, v29.4s, v17.4s\n"
-    "fmla v25.4s, v29.4s, v20.4s\n"
-    "ldr q21, [%[inptr0], x14]\n"
-    "fmla v5.4s, v28.4s, v11.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v1.4s, v28.4s, v13.4s\n"
-    "fmla v4.4s, v28.4s, v12.4s\n"
-    "fmla v22.4s, v28.4s, v14.4s\n"
-    "ldr q26, [x27, %[input_col_stride1]]\n"
-    "fmla v0.4s, v26.4s, v9.4s\n"
-    "fmla v18.4s, v26.4s, v10.4s\n"
-    "fmla v4.4s, v21.4s, v13.4s\n"
-    "ldr q21, [x10, x28]\n"
-    "fmla v15.4s, v21.4s, v8.4s\n"
-    "ldr q29, [x26, x23]\n"
-    "fmla v0.4s, v21.4s, v11.4s\n"
-    "fmla v16.4s, v21.4s, v9.4s\n"
-    "fmla v18.4s, v21.4s, v12.4s\n"
-    "fmla v19.4s, v21.4s, v10.4s\n"
-    "fmla v23.4s, v21.4s, v17.4s\n"
-    "ldr q21, [x24, x12]\n"
-    "fmla v2.4s, v29.4s, v8.4s\n"
-    "fmla v16.4s, v29.4s, v11.4s\n"
-    "fmla v1.4s, v29.4s, v9.4s\n"
-    "fmla v18.4s, v29.4s, v13.4s\n"
-    "fmla v19.4s, v29.4s, v12.4s\n"
-    "fmla v22.4s, v29.4s, v10.4s\n"
-    "fmla v23.4s, v29.4s, v14.4s\n"
-    "fmla v25.4s, v29.4s, v17.4s\n"
-    "fmla v24.4s, v29.4s, v20.4s\n"
-    "ldr q28, [x9, x14]\n"
-    "fmla v5.4s, v21.4s, v8.4s\n"
-    "ldr q27, [x27, x28]\n"
-    "fmla v1.4s, v21.4s, v11.4s\n"
-    "add x9, x9, #16\n"
-    "fmla v4.4s, v21.4s, v9.4s\n"
-    "fmla v19.4s, v21.4s, v13.4s\n"
-    "fmla v22.4s, v21.4s, v12.4s\n"
-    "fmla v25.4s, v21.4s, v14.4s\n"
-    "fmla v0.4s, v27.4s, v8.4s\n"
-    "ldr q20, [x10, x23]\n"
-    "fmla v4.4s, v28.4s, v11.4s\n"
-    "fmla v18.4s, v27.4s, v9.4s\n"
-    "fmla v22.4s, v28.4s, v13.4s\n"
-    "ldr q26, [x26, x12]\n"
-    "fmla v23.4s, v27.4s, v10.4s\n"
-    "ldr q21, [x24, x14]\n"
-    "fmla v16.4s, v20.4s, v8.4s\n"
-    "add x24, x24, #16\n"
-    "fmla v18.4s, v20.4s, v11.4s\n"
-    "fmla v19.4s, v20.4s, v9.4s\n"
-    "fmla v23.4s, v20.4s, v12.4s\n"
-    "fmla v25.4s, v20.4s, v10.4s\n"
-    "fmla v24.4s, v20.4s, v17.4s\n"
-    "ldr q28, [x27, x23]\n"
-    "fmla v1.4s, v26.4s, v8.4s\n"
-    "ldr q20, [x10, x12]\n"
-    "fmla v19.4s, v26.4s, v11.4s\n"
-    "fmla v22.4s, v26.4s, v9.4s\n"
-    "fmla v23.4s, v26.4s, v13.4s\n"
-    "fmla v25.4s, v26.4s, v12.4s\n"
-    "fmla v24.4s, v26.4s, v14.4s\n"
-    "ldr q17, [x26, x14]\n"
-    "fmla v4.4s, v21.4s, v8.4s\n"
-    "ldr q26, [x27, x12]\n"
-    "fmla v22.4s, v21.4s, v11.4s\n"
-    "add x26, x26, #16\n"
-    "fmla v25.4s, v21.4s, v13.4s\n"
-    "ldr q27, [x10, x14]\n"
-    "fmla v18.4s, v28.4s, v8.4s\n"
-    "add x10, x10, #16\n"
-    "fmla v23.4s, v28.4s, v9.4s\n"
-    "fmla v24.4s, v28.4s, v10.4s\n"
-    "fmla v19.4s, v20.4s, v8.4s\n"
-    "ldr q28, [x27, x14]\n"
-    "fmla v25.4s, v20.4s, v9.4s\n"
-    "add x27, x27, #16\n"
-    "fmla v23.4s, v20.4s, v11.4s\n"
-    "fmla v24.4s, v20.4s, v12.4s\n"
-    "fmla v22.4s, v17.4s, v8.4s\n"
-    "movi v29.16b, #0\n"
-    "fmla v25.4s, v17.4s, v11.4s\n"
-    "fmla v24.4s, v17.4s, v13.4s\n"
-    "fmla v23.4s, v26.4s, v8.4s\n"
-    "fmax v7.4s, v7.4s, v29.4s\n"
-    "fmla v25.4s, v27.4s, v8.4s\n"
-    "fmax v6.4s, v6.4s, v29.4s\n"
-    "str q7, [%[outptr0]]\n"
-    "fmla v24.4s, v26.4s, v9.4s\n"
-    "str q6, [%[outptr0], %[output_col_stride1]]\n"
-    "fmax v5.4s, v5.4s, v29.4s\n"
-    "fmax v4.4s, v4.4s, v29.4s\n"
-    "fmax v3.4s, v3.4s, v29.4s\n"
-    "str q5, [%[outptr0], x19]\n"
-    "fmla v24.4s, v27.4s, v11.4s\n"
-    "str q4, [%[outptr0], x20]\n"
-    "fmax v2.4s, v2.4s, v29.4s\n"
-    "str q3, [x16]\n"
-    "fmax v1.4s, v1.4s, v29.4s\n"
-    "str q2, [x16, %[output_col_stride1]]\n"
-    "fmla v24.4s, v28.4s, v8.4s\n"
-    "str q1, [x16, x19]\n"
-    "fmax v22.4s, v22.4s, v29.4s\n"
-    "fmax v15.4s, v15.4s, v29.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "str q22, [x16, x20]\n"
-    "fmax v16.4s, v16.4s, v29.4s\n"
-    "str q15, [x17]\n"
-    "fmax v19.4s, v19.4s, v29.4s\n"
-    "str q16, [x17, %[output_col_stride1]]\n"
-    "fmax v25.4s, v25.4s, v29.4s\n"
-    "str q19, [x17, x19]\n"
-    "fmax v0.4s, v0.4s, v29.4s\n"
-    "str q25, [x17, x20]\n"
-    "fmax v18.4s, v18.4s, v29.4s\n"
-    "str q0, [x7]\n"
-    "fmax v23.4s, v23.4s, v29.4s\n"
-    "str q18, [x7, %[output_col_stride1]]\n"
-    "fmax v24.4s, v24.4s, v29.4s\n"
-    "str q23, [x7, x19]\n"
-    "add x16, x16, #16\n"
-    "str q24, [x7, x20]\n"
-    "add x17, x17, #16\n"
-    "add x7, x7, #16\n"
-    "4:\n"
-    "cbz x21, 7f\n"
-    "ldr s21, [%[wbptr]]\n"
-    "mov v7.16b, v21.16b\n"
-    "ldr s20, [%[wbptr], #4]\n"
-    "mov v3.16b, v21.16b\n"
-    "ldr s14, [%[wbptr], #8]\n"
-    "mov v6.16b, v21.16b\n"
-    "ldr s13, [%[wbptr], #12]\n"
-    "mov v15.16b, v21.16b\n"
-    "ldr s17, [%[wbptr], #16]\n"
-    "mov v2.16b, v21.16b\n"
-    "ldr s12, [%[wbptr], #20]\n"
-    "mov v5.16b, v21.16b\n"
-    "ldr s11, [%[wbptr], #24]\n"
-    "mov v0.16b, v21.16b\n"
-    "ldr s10, [%[wbptr], #28]\n"
-    "mov v16.16b, v21.16b\n"
-    "ldr s9, [%[wbptr], #32]\n"
-    "mov v1.16b, v21.16b\n"
-    "ldr s8, [%[wbptr], #36]\n"
-    "mov v4.16b, v21.16b\n"
-    "ldr s22, [%[inptr0]]\n"
-    "fmla v7.4s, v22.4s, v20.4s\n"
-    "ldr s19, [x9]\n"
-    "fmla v3.4s, v19.4s, v20.4s\n"
-    "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v6.4s, v23.4s, v20.4s\n"
-    "ldr s18, [x24]\n"
-    "fmla v7.4s, v19.4s, v17.4s\n"
-    "ldr s27, [x9, %[input_col_stride1]]\n"
-    "fmla v3.4s, v18.4s, v17.4s\n"
-    "ldr s28, [%[inptr0], x28]\n"
-    "fmla v15.4s, v18.4s, v20.4s\n"
-    "ldr s25, [x26]\n"
-    "fmla v7.4s, v23.4s, v14.4s\n"
-    "ldr s22, [x24, %[input_col_stride1]]\n"
-    "fmla v3.4s, v27.4s, v14.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "prfm pldl1keep, [x9, #64]\n"
-    "subs x21, x21, #1\n"
-    "prfm pldl1keep, [%[inptr0], x8]\n"
-    "prfm pldl1keep, [x24, #64]\n"
-    "fmla v7.4s, v18.4s, v10.4s\n"
-    "prfm pldl1keep, [x9, x8]\n"
-    "prfm pldl1keep, [%[inptr0], x25]\n"
-    "prfm pldl1keep, [x26, #64]\n"
-    "prfm pldl1keep, [x24, x8]\n"
-    "fmla v7.4s, v27.4s, v12.4s\n"
-    "beq 6f\n"
-    "5:\n"
-    "mov v18.16b, v21.16b\n"
-    "ldr s23, [x9, x28]\n"
-    "mov v19.16b, v21.16b\n"
-    "prfm pldl1keep, [x9, x25]\n"
-    "fmla v6.4s, v27.4s, v17.4s\n"
-    "prfm pldl1keep, [%[inptr0], x11]\n"
-    "fmla v2.4s, v27.4s, v20.4s\n"
-    "ldr s24, [%[inptr0], x23]\n"
-    "fmla v7.4s, v28.4s, v13.4s\n"
-    "prfm pldl1keep, [x10, #64]\n"
-    "fmla v6.4s, v28.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x8]\n"
-    "fmla v5.4s, v28.4s, v20.4s\n"
-    "ldr s26, [x10]\n"
-    "fmla v3.4s, v25.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, x25]\n"
-    "fmla v15.4s, v25.4s, v17.4s\n"
-    "prfm pldl1keep, [x9, x11]\n"
-    "fmla v0.4s, v25.4s, v20.4s\n"
-    "ldr s25, [x26, %[input_col_stride1]]\n"
-    "fmla v7.4s, v22.4s, v9.4s\n"
-    "prfm pldl1keep, [%[inptr0], x13]\n"
-    "fmla v3.4s, v22.4s, v12.4s\n"
-    "prfm pldl1keep, [x27, #64]\n"
-    "fmla v6.4s, v22.4s, v10.4s\n"
-    "prfm pldl1keep, [x10, x8]\n"
-    "fmla v15.4s, v22.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x25]\n"
-    "fmla v2.4s, v22.4s, v17.4s\n"
-    "prfm pldl1keep, [x24, x11]\n"
-    "fmla v16.4s, v22.4s, v20.4s\n"
-    "ldr s22, [x24, x28]\n"
-    "fmla v7.4s, v23.4s, v11.4s\n"
-    "prfm pldl1keep, [x9, x13]\n"
-    "fmla v3.4s, v23.4s, v13.4s\n"
-    "prfm pldl1keep, [%[inptr0], x15]\n"
-    "fmla v6.4s, v23.4s, v12.4s\n"
-    "prfm pldl1keep, [x27, x8]\n"
-    "fmla v2.4s, v23.4s, v14.4s\n"
-    "prfm pldl1keep, [x10, x25]\n"
-    "fmla v5.4s, v23.4s, v17.4s\n"
-    "prfm pldl1keep, [x26, x11]\n"
-    "fmla v1.4s, v23.4s, v20.4s\n"
-    "ldr s23, [x9, x23]\n"
-    "fmla v6.4s, v24.4s, v13.4s\n"
-    "prfm pldl1keep, [x24, x13]\n"
-    "fmla v5.4s, v24.4s, v14.4s\n"
-    "prfm pldl1keep, [x9, x15]\n"
-    "fmla v4.4s, v24.4s, v20.4s\n"
-    "ldr s24, [%[inptr0], x12]\n"
-    "fmla v15.4s, v26.4s, v10.4s\n"
-    "prfm pldl1keep, [x27, x25]\n"
-    "fmla v0.4s, v26.4s, v17.4s\n"
-    "ldr s29, [x27]\n"
-    "fmla v3.4s, v25.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x11]\n"
-    "fmla v15.4s, v25.4s, v12.4s\n"
-    "prfm pldl1keep, [x26, x13]\n"
-    "fmla v2.4s, v25.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, x15]\n"
-    "fmla v0.4s, v25.4s, v14.4s\n"
-    "prfm pldl1keep, [x27, x11]\n"
-    "fmla v16.4s, v25.4s, v17.4s\n"
-    "prfm pldl1keep, [x10, x13]\n"
-    "fmla v18.4s, v25.4s, v20.4s\n"
-    "ldr s26, [x10, %[input_col_stride1]]\n"
-    "fmla v7.4s, v22.4s, v8.4s\n"
-    "prfm pldl1keep, [x26, x15]\n"
-    "fmla v3.4s, v22.4s, v11.4s\n"
-    "prfm pldl1keep, [x27, x13]\n"
-    "fmla v6.4s, v22.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x15]\n"
-    "fmla v15.4s, v22.4s, v13.4s\n"
-    "prfm pldl1keep, [x27, x15]\n"
-    "fmla v2.4s, v22.4s, v12.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v5.4s, v22.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v16.4s, v22.4s, v14.4s\n"
-    "subs x21, x21, #1\n"
-    "fmla v1.4s, v22.4s, v17.4s\n"
-    "fmla v19.4s, v22.4s, v20.4s\n"
-    "mov v22.16b, v21.16b\n"
-    "fmla v6.4s, v23.4s, v11.4s\n"
-    "fmla v2.4s, v23.4s, v13.4s\n"
-    "fmla v5.4s, v23.4s, v12.4s\n"
-    "fmla v1.4s, v23.4s, v14.4s\n"
-    "fmla v4.4s, v23.4s, v17.4s\n"
-    "fmla v22.4s, v23.4s, v20.4s\n"
-    "ldr s27, [x26, x28]\n"
-    "fmla v5.4s, v24.4s, v13.4s\n"
-    "fmla v0.4s, v29.4s, v10.4s\n"
-    "mov v23.16b, v21.16b\n"
-    "fmla v4.4s, v24.4s, v14.4s\n"
-    "mov v25.16b, v21.16b\n"
-    "mov v24.16b, v21.16b\n"
-    "fmla v15.4s, v26.4s, v9.4s\n"
-    "fmla v0.4s, v26.4s, v12.4s\n"
-    "fmla v16.4s, v26.4s, v10.4s\n"
-    "fmla v18.4s, v26.4s, v17.4s\n"
-    "fmla v3.4s, v27.4s, v8.4s\n"
-    "ldr s29, [x24, x23]\n"
-    "fmla v15.4s, v27.4s, v11.4s\n"
-    "fmla v2.4s, v27.4s, v9.4s\n"
-    "fmla v0.4s, v27.4s, v13.4s\n"
-    "fmla v16.4s, v27.4s, v12.4s\n"
-    "fmla v1.4s, v27.4s, v10.4s\n"
-    "fmla v18.4s, v27.4s, v14.4s\n"
-    "fmla v19.4s, v27.4s, v17.4s\n"
-    "fmla v23.4s, v27.4s, v20.4s\n"
-    "fmla v6.4s, v29.4s, v8.4s\n"
-    "ldr s28, [x9, x12]\n"
-    "fmla v2.4s, v29.4s, v11.4s\n"
-    "fmla v5.4s, v29.4s, v9.4s\n"
-    "fmla v16.4s, v29.4s, v13.4s\n"
-    "fmla v1.4s, v29.4s, v12.4s\n"
-    "fmla v4.4s, v29.4s, v10.4s\n"
-    "fmla v19.4s, v29.4s, v14.4s\n"
-    "fmla v22.4s, v29.4s, v17.4s\n"
-    "fmla v25.4s, v29.4s, v20.4s\n"
-    "fmla v5.4s, v28.4s, v11.4s\n"
-    "ldr s21, [%[inptr0], x14]\n"
-    "fmla v1.4s, v28.4s, v13.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v4.4s, v28.4s, v12.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "fmla v22.4s, v28.4s, v14.4s\n"
-    "ldr s26, [x27, %[input_col_stride1]]\n"
-    "fmla v0.4s, v26.4s, v9.4s\n"
-    "prfm pldl1keep, [%[inptr0], x8]\n"
-    "fmla v4.4s, v21.4s, v13.4s\n"
-    "ldr s21, [x10, x28]\n"
-    "fmla v18.4s, v26.4s, v10.4s\n"
-    "ldr s29, [x26, x23]\n"
-    "fmla v15.4s, v21.4s, v8.4s\n"
-    "prfm pldl1keep, [%[inptr0], x25]\n"
-    "fmla v0.4s, v21.4s, v11.4s\n"
-    "fmla v16.4s, v21.4s, v9.4s\n"
-    "fmla v18.4s, v21.4s, v12.4s\n"
-    "fmla v19.4s, v21.4s, v10.4s\n"
-    "fmla v23.4s, v21.4s, v17.4s\n"
-    "ldr s21, [x24, x12]\n"
-    "fmla v2.4s, v29.4s, v8.4s\n"
-    "fmla v16.4s, v29.4s, v11.4s\n"
-    "fmla v1.4s, v29.4s, v9.4s\n"
-    "fmla v18.4s, v29.4s, v13.4s\n"
-    "fmla v19.4s, v29.4s, v12.4s\n"
-    "fmla v22.4s, v29.4s, v10.4s\n"
-    "fmla v23.4s, v29.4s, v14.4s\n"
-    "fmla v25.4s, v29.4s, v17.4s\n"
-    "fmla v24.4s, v29.4s, v20.4s\n"
-    "ldr s28, [x9, x14]\n"
-    "fmla v5.4s, v21.4s, v8.4s\n"
-    "ldr s27, [x27, x28]\n"
-    "fmla v1.4s, v21.4s, v11.4s\n"
-    "add x9, x9, #4\n"
-    "fmla v4.4s, v21.4s, v9.4s\n"
-    "prfm pldl1keep, [x9, #64]\n"
-    "fmla v19.4s, v21.4s, v13.4s\n"
-    "prfm pldl1keep, [x9, x8]\n"
-    "fmla v22.4s, v21.4s, v12.4s\n"
-    "fmla v25.4s, v21.4s, v14.4s\n"
-    "fmla v4.4s, v28.4s, v11.4s\n"
-    "ldr s20, [x10, x23]\n"
-    "fmla v0.4s, v27.4s, v8.4s\n"
-    "fmla v18.4s, v27.4s, v9.4s\n"
-    "fmla v22.4s, v28.4s, v13.4s\n"
-    "ldr s26, [x26, x12]\n"
-    "fmla v23.4s, v27.4s, v10.4s\n"
-    "ldr s21, [x24, x14]\n"
-    "fmla v16.4s, v20.4s, v8.4s\n"
-    "add x24, x24, #4\n"
-    "fmla v18.4s, v20.4s, v11.4s\n"
-    "prfm pldl1keep, [x24, #64]\n"
-    "fmla v19.4s, v20.4s, v9.4s\n"
-    "prfm pldl1keep, [x24, x8]\n"
-    "fmla v23.4s, v20.4s, v12.4s\n"
-    "fmla v25.4s, v20.4s, v10.4s\n"
-    "fmla v24.4s, v20.4s, v17.4s\n"
-    "ldr s28, [x27, x23]\n"
-    "fmla v1.4s, v26.4s, v8.4s\n"
-    "ldr s20, [x10, x12]\n"
-    "fmla v19.4s, v26.4s, v11.4s\n"
-    "fmla v22.4s, v26.4s, v9.4s\n"
-    "fmla v23.4s, v26.4s, v13.4s\n"
-    "fmla v25.4s, v26.4s, v12.4s\n"
-    "fmla v24.4s, v26.4s, v14.4s\n"
-    "ldr s17, [x26, x14]\n"
-    "fmla v4.4s, v21.4s, v8.4s\n"
-    "ldr s26, [x27, x12]\n"
-    "fmla v22.4s, v21.4s, v11.4s\n"
-    "add x26, x26, #4\n"
-    "fmla v25.4s, v21.4s, v13.4s\n"
-    "ldr s27, [x10, x14]\n"
-    "fmla v18.4s, v28.4s, v8.4s\n"
-    "prfm pldl1keep, [x26, #64]\n"
-    "fmla v23.4s, v28.4s, v9.4s\n"
-    "add x10, x10, #4\n"
-    "fmla v24.4s, v28.4s, v10.4s\n"
-    "ldr s28, [x27, x14]\n"
-    "fmla v19.4s, v20.4s, v8.4s\n"
-    "ldr s21, [%[wbptr]]\n"
-    "fmla v23.4s, v20.4s, v11.4s\n"
-    "add x27, x27, #4\n"
-    "fmla v25.4s, v20.4s, v9.4s\n"
-    "fmla v24.4s, v20.4s, v12.4s\n"
-    "fmla v22.4s, v17.4s, v8.4s\n"
-    "ldr s20, [%[wbptr], #4]\n"
-    "fmla v23.4s, v26.4s, v8.4s\n"
-    "ldr s14, [%[wbptr], #8]\n"
-    "fmla v24.4s, v17.4s, v13.4s\n"
-    "movi v29.16b, #0\n"
-    "fmla v25.4s, v17.4s, v11.4s\n"
-    "ldr s17, [%[wbptr], #16]\n"
-    "fmax v7.4s, v7.4s, v29.4s\n"
-    "fmax v6.4s, v6.4s, v29.4s\n"
-    "fmla v24.4s, v26.4s, v9.4s\n"
-    "ldr s13, [%[wbptr], #12]\n"
-    "str s7, [%[outptr0]]\n"
-    "fmla v25.4s, v27.4s, v8.4s\n"
-    "str s6, [%[outptr0], %[output_col_stride1]]\n"
-    "fmax v5.4s, v5.4s, v29.4s\n"
-    "fmla v24.4s, v27.4s, v11.4s\n"
-    "ldr s12, [%[wbptr], #20]\n"
-    "str s5, [%[outptr0], x19]\n"
-    "fmax v4.4s, v4.4s, v29.4s\n"
-    "fmax v3.4s, v3.4s, v29.4s\n"
-    "ldr s10, [%[wbptr], #28]\n"
-    "str s4, [%[outptr0], x20]\n"
-    "fmla v24.4s, v28.4s, v8.4s\n"
-    "str s3, [x16]\n"
-    "fmax v2.4s, v2.4s, v29.4s\n"
-    "fmax v1.4s, v1.4s, v29.4s\n"
-    "ldr s11, [%[wbptr], #24]\n"
-    "str s2, [x16, %[output_col_stride1]]\n"
-    "fmax v22.4s, v22.4s, v29.4s\n"
-    "str s1, [x16, x19]\n"
-    "fmax v15.4s, v15.4s, v29.4s\n"
-    "str s22, [x16, x20]\n"
-    "fmax v16.4s, v16.4s, v29.4s\n"
-    "str s15, [x17]\n"
-    "fmax v19.4s, v19.4s, v29.4s\n"
-    "str s16, [x17, %[output_col_stride1]]\n"
-    "fmax v25.4s, v25.4s, v29.4s\n"
-    "str s19, [x17, x19]\n"
-    "fmax v0.4s, v0.4s, v29.4s\n"
-    "str s25, [x17, x20]\n"
-    "fmax v18.4s, v18.4s, v29.4s\n"
-    "str s0, [x7]\n"
-    "fmax v23.4s, v23.4s, v29.4s\n"
-    "str s18, [x7, %[output_col_stride1]]\n"
-    "fmax v24.4s, v24.4s, v29.4s\n"
-    "str s23, [x7, x19]\n"
-    "mov v7.16b, v21.16b\n"
-    "str s24, [x7, x20]\n"
-    "mov v3.16b, v21.16b\n"
-    "mov v6.16b, v21.16b\n"
-    "ldr s9, [%[wbptr], #32]\n"
-    "mov v15.16b, v21.16b\n"
-    "ldr s8, [%[wbptr], #36]\n"
-    "mov v2.16b, v21.16b\n"
-    "ldr s22, [%[inptr0]]\n"
-    "mov v5.16b, v21.16b\n"
-    "ldr s19, [x9]\n"
-    "mov v0.16b, v21.16b\n"
-    "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
-    "mov v16.16b, v21.16b\n"
-    "ldr s18, [x24]\n"
-    "mov v1.16b, v21.16b\n"
-    "ldr s27, [x9, %[input_col_stride1]]\n"
-    "mov v4.16b, v21.16b\n"
-    "ldr s28, [%[inptr0], x28]\n"
-    "fmla v7.4s, v22.4s, v20.4s\n"
-    "ldr s25, [x26]\n"
-    "fmla v3.4s, v19.4s, v20.4s\n"
-    "ldr s22, [x24, %[input_col_stride1]]\n"
-    "fmla v6.4s, v23.4s, v20.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmla v7.4s, v19.4s, v17.4s\n"
-    "add x16, x16, #4\n"
-    "fmla v3.4s, v18.4s, v17.4s\n"
-    "add x17, x17, #4\n"
-    "fmla v15.4s, v18.4s, v20.4s\n"
-    "add x7, x7, #4\n"
-    "fmla v7.4s, v23.4s, v14.4s\n"
-    "fmla v3.4s, v27.4s, v14.4s\n"
-    "fmla v7.4s, v18.4s, v10.4s\n"
-    "fmla v7.4s, v27.4s, v12.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "mov v18.16b, v21.16b\n"
-    "ldr s23, [x9, x28]\n"
-    "mov v19.16b, v21.16b\n"
-    "prfm pldl1keep, [x9, x25]\n"
-    "fmla v6.4s, v27.4s, v17.4s\n"
-    "prfm pldl1keep, [%[inptr0], x11]\n"
-    "fmla v2.4s, v27.4s, v20.4s\n"
-    "ldr s24, [%[inptr0], x23]\n"
-    "fmla v7.4s, v28.4s, v13.4s\n"
-    "prfm pldl1keep, [x10, #64]\n"
-    "fmla v6.4s, v28.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x8]\n"
-    "fmla v5.4s, v28.4s, v20.4s\n"
-    "ldr s26, [x10]\n"
-    "fmla v3.4s, v25.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, x25]\n"
-    "fmla v15.4s, v25.4s, v17.4s\n"
-    "prfm pldl1keep, [x9, x11]\n"
-    "fmla v0.4s, v25.4s, v20.4s\n"
-    "ldr s25, [x26, %[input_col_stride1]]\n"
-    "fmla v7.4s, v22.4s, v9.4s\n"
-    "prfm pldl1keep, [%[inptr0], x13]\n"
-    "fmla v3.4s, v22.4s, v12.4s\n"
-    "prfm pldl1keep, [x27, #64]\n"
-    "fmla v6.4s, v22.4s, v10.4s\n"
-    "prfm pldl1keep, [x10, x8]\n"
-    "fmla v15.4s, v22.4s, v14.4s\n"
-    "prfm pldl1keep, [x26, x25]\n"
-    "fmla v2.4s, v22.4s, v17.4s\n"
-    "prfm pldl1keep, [x24, x11]\n"
-    "fmla v16.4s, v22.4s, v20.4s\n"
-    "ldr s22, [x24, x28]\n"
-    "fmla v7.4s, v23.4s, v11.4s\n"
-    "prfm pldl1keep, [x9, x13]\n"
-    "fmla v3.4s, v23.4s, v13.4s\n"
-    "prfm pldl1keep, [%[inptr0], x15]\n"
-    "fmla v6.4s, v23.4s, v12.4s\n"
-    "prfm pldl1keep, [x27, x8]\n"
-    "fmla v2.4s, v23.4s, v14.4s\n"
-    "prfm pldl1keep, [x10, x25]\n"
-    "fmla v5.4s, v23.4s, v17.4s\n"
-    "prfm pldl1keep, [x26, x11]\n"
-    "fmla v1.4s, v23.4s, v20.4s\n"
-    "ldr s23, [x9, x23]\n"
-    "fmla v6.4s, v24.4s, v13.4s\n"
-    "prfm pldl1keep, [x24, x13]\n"
-    "fmla v5.4s, v24.4s, v14.4s\n"
-    "prfm pldl1keep, [x9, x15]\n"
-    "fmla v4.4s, v24.4s, v20.4s\n"
-    "ldr s24, [%[inptr0], x12]\n"
-    "fmla v15.4s, v26.4s, v10.4s\n"
-    "prfm pldl1keep, [x27, x25]\n"
-    "fmla v0.4s, v26.4s, v17.4s\n"
-    "ldr s29, [x27]\n"
-    "fmla v3.4s, v25.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x11]\n"
-    "fmla v15.4s, v25.4s, v12.4s\n"
-    "prfm pldl1keep, [x26, x13]\n"
-    "fmla v2.4s, v25.4s, v10.4s\n"
-    "prfm pldl1keep, [x24, x15]\n"
-    "fmla v0.4s, v25.4s, v14.4s\n"
-    "prfm pldl1keep, [x27, x11]\n"
-    "fmla v16.4s, v25.4s, v17.4s\n"
-    "prfm pldl1keep, [x10, x13]\n"
-    "fmla v18.4s, v25.4s, v20.4s\n"
-    "ldr s26, [x10, %[input_col_stride1]]\n"
-    "fmla v7.4s, v22.4s, v8.4s\n"
-    "prfm pldl1keep, [x26, x15]\n"
-    "fmla v3.4s, v22.4s, v11.4s\n"
-    "prfm pldl1keep, [x27, x13]\n"
-    "fmla v6.4s, v22.4s, v9.4s\n"
-    "prfm pldl1keep, [x10, x15]\n"
-    "fmla v15.4s, v22.4s, v13.4s\n"
-    "prfm pldl1keep, [x27, x15]\n"
-    "fmla v2.4s, v22.4s, v12.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v5.4s, v22.4s, v10.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v16.4s, v22.4s, v14.4s\n"
-    "fmla v1.4s, v22.4s, v17.4s\n"
-    "fmla v19.4s, v22.4s, v20.4s\n"
-    "ldr s27, [x26, x28]\n"
-    "fmla v6.4s, v23.4s, v11.4s\n"
-    "fmla v2.4s, v23.4s, v13.4s\n"
-    "fmla v5.4s, v23.4s, v12.4s\n"
-    "fmla v1.4s, v23.4s, v14.4s\n"
-    "fmla v4.4s, v23.4s, v17.4s\n"
-    "fmla v0.4s, v29.4s, v10.4s\n"
-    "mov v22.16b, v21.16b\n"
-    "fmla v15.4s, v26.4s, v9.4s\n"
-    "fmla v5.4s, v24.4s, v13.4s\n"
-    "fmla v16.4s, v26.4s, v10.4s\n"
-    "fmla v22.4s, v23.4s, v20.4s\n"
-    "ldr s29, [x24, x23]\n"
-    "fmla v4.4s, v24.4s, v14.4s\n"
-    "ldr s28, [x9, x12]\n"
-    "fmla v0.4s, v26.4s, v12.4s\n"
-    "fmla v18.4s, v26.4s, v17.4s\n"
-    "mov v23.16b, v21.16b\n"
-    "fmla v3.4s, v27.4s, v8.4s\n"
-    "fmla v15.4s, v27.4s, v11.4s\n"
-    "fmla v2.4s, v27.4s, v9.4s\n"
-    "fmla v0.4s, v27.4s, v13.4s\n"
-    "fmla v16.4s, v27.4s, v12.4s\n"
-    "fmla v1.4s, v27.4s, v10.4s\n"
-    "fmla v18.4s, v27.4s, v14.4s\n"
-    "fmla v19.4s, v27.4s, v17.4s\n"
-    "fmla v23.4s, v27.4s, v20.4s\n"
-    "mov v25.16b, v21.16b\n"
-    "mov v24.16b, v21.16b\n"
-    "fmla v6.4s, v29.4s, v8.4s\n"
-    "fmla v2.4s, v29.4s, v11.4s\n"
-    "fmla v5.4s, v29.4s, v9.4s\n"
-    "fmla v16.4s, v29.4s, v13.4s\n"
-    "fmla v1.4s, v29.4s, v12.4s\n"
-    "fmla v4.4s, v29.4s, v10.4s\n"
-    "fmla v19.4s, v29.4s, v14.4s\n"
-    "fmla v22.4s, v29.4s, v17.4s\n"
-    "fmla v25.4s, v29.4s, v20.4s\n"
-    "ldr s21, [%[inptr0], x14]\n"
-    "fmla v5.4s, v28.4s, v11.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v1.4s, v28.4s, v13.4s\n"
-    "fmla v4.4s, v28.4s, v12.4s\n"
-    "fmla v22.4s, v28.4s, v14.4s\n"
-    "ldr s26, [x27, %[input_col_stride1]]\n"
-    "fmla v0.4s, v26.4s, v9.4s\n"
-    "fmla v18.4s, v26.4s, v10.4s\n"
-    "fmla v4.4s, v21.4s, v13.4s\n"
-    "ldr s21, [x10, x28]\n"
-    "fmla v15.4s, v21.4s, v8.4s\n"
-    "ldr s29, [x26, x23]\n"
-    "fmla v0.4s, v21.4s, v11.4s\n"
-    "fmla v16.4s, v21.4s, v9.4s\n"
-    "fmla v18.4s, v21.4s, v12.4s\n"
-    "fmla v19.4s, v21.4s, v10.4s\n"
-    "fmla v23.4s, v21.4s, v17.4s\n"
-    "ldr s21, [x24, x12]\n"
-    "fmla v2.4s, v29.4s, v8.4s\n"
-    "fmla v16.4s, v29.4s, v11.4s\n"
-    "fmla v1.4s, v29.4s, v9.4s\n"
-    "fmla v18.4s, v29.4s, v13.4s\n"
-    "fmla v19.4s, v29.4s, v12.4s\n"
-    "fmla v22.4s, v29.4s, v10.4s\n"
-    "fmla v23.4s, v29.4s, v14.4s\n"
-    "fmla v25.4s, v29.4s, v17.4s\n"
-    "fmla v24.4s, v29.4s, v20.4s\n"
-    "ldr s28, [x9, x14]\n"
-    "fmla v5.4s, v21.4s, v8.4s\n"
-    "ldr s27, [x27, x28]\n"
-    "fmla v1.4s, v21.4s, v11.4s\n"
-    "add x9, x9, #4\n"
-    "fmla v4.4s, v21.4s, v9.4s\n"
-    "fmla v19.4s, v21.4s, v13.4s\n"
-    "fmla v22.4s, v21.4s, v12.4s\n"
-    "fmla v25.4s, v21.4s, v14.4s\n"
-    "fmla v0.4s, v27.4s, v8.4s\n"
-    "ldr s20, [x10, x23]\n"
-    "fmla v4.4s, v28.4s, v11.4s\n"
-    "fmla v18.4s, v27.4s, v9.4s\n"
-    "fmla v22.4s, v28.4s, v13.4s\n"
-    "ldr s26, [x26, x12]\n"
-    "fmla v23.4s, v27.4s, v10.4s\n"
-    "ldr s21, [x24, x14]\n"
-    "fmla v16.4s, v20.4s, v8.4s\n"
-    "add x24, x24, #4\n"
-    "fmla v18.4s, v20.4s, v11.4s\n"
-    "fmla v19.4s, v20.4s, v9.4s\n"
-    "fmla v23.4s, v20.4s, v12.4s\n"
-    "fmla v25.4s, v20.4s, v10.4s\n"
-    "fmla v24.4s, v20.4s, v17.4s\n"
-    "ldr s28, [x27, x23]\n"
-    "fmla v1.4s, v26.4s, v8.4s\n"
-    "ldr s20, [x10, x12]\n"
-    "fmla v19.4s, v26.4s, v11.4s\n"
-    "fmla v22.4s, v26.4s, v9.4s\n"
-    "fmla v23.4s, v26.4s, v13.4s\n"
-    "fmla v25.4s, v26.4s, v12.4s\n"
-    "fmla v24.4s, v26.4s, v14.4s\n"
-    "ldr s17, [x26, x14]\n"
-    "fmla v4.4s, v21.4s, v8.4s\n"
-    "ldr s26, [x27, x12]\n"
-    "fmla v22.4s, v21.4s, v11.4s\n"
-    "add x26, x26, #4\n"
-    "fmla v25.4s, v21.4s, v13.4s\n"
-    "ldr s27, [x10, x14]\n"
-    "fmla v18.4s, v28.4s, v8.4s\n"
-    "add x10, x10, #4\n"
-    "fmla v23.4s, v28.4s, v9.4s\n"
-    "fmla v24.4s, v28.4s, v10.4s\n"
-    "fmla v19.4s, v20.4s, v8.4s\n"
-    "ldr s28, [x27, x14]\n"
-    "fmla v25.4s, v20.4s, v9.4s\n"
-    "add x27, x27, #4\n"
-    "fmla v23.4s, v20.4s, v11.4s\n"
-    "fmla v24.4s, v20.4s, v12.4s\n"
-    "fmla v22.4s, v17.4s, v8.4s\n"
-    "movi v29.16b, #0\n"
-    "fmla v25.4s, v17.4s, v11.4s\n"
-    "fmla v24.4s, v17.4s, v13.4s\n"
-    "fmla v23.4s, v26.4s, v8.4s\n"
-    "fmax v7.4s, v7.4s, v29.4s\n"
-    "fmla v25.4s, v27.4s, v8.4s\n"
-    "fmax v6.4s, v6.4s, v29.4s\n"
-    "str s7, [%[outptr0]]\n"
-    "fmla v24.4s, v26.4s, v9.4s\n"
-    "str s6, [%[outptr0], %[output_col_stride1]]\n"
-    "fmax v5.4s, v5.4s, v29.4s\n"
-    "fmax v4.4s, v4.4s, v29.4s\n"
-    "fmax v3.4s, v3.4s, v29.4s\n"
-    "str s5, [%[outptr0], x19]\n"
-    "fmla v24.4s, v27.4s, v11.4s\n"
-    "str s4, [%[outptr0], x20]\n"
-    "fmax v2.4s, v2.4s, v29.4s\n"
-    "str s3, [x16]\n"
-    "fmax v1.4s, v1.4s, v29.4s\n"
-    "str s2, [x16, %[output_col_stride1]]\n"
-    "fmla v24.4s, v28.4s, v8.4s\n"
-    "str s1, [x16, x19]\n"
-    "fmax v22.4s, v22.4s, v29.4s\n"
-    "fmax v15.4s, v15.4s, v29.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "str s22, [x16, x20]\n"
-    "fmax v16.4s, v16.4s, v29.4s\n"
-    "str s15, [x17]\n"
-    "fmax v19.4s, v19.4s, v29.4s\n"
-    "str s16, [x17, %[output_col_stride1]]\n"
-    "fmax v25.4s, v25.4s, v29.4s\n"
-    "str s19, [x17, x19]\n"
-    "fmax v0.4s, v0.4s, v29.4s\n"
-    "str s25, [x17, x20]\n"
-    "fmax v18.4s, v18.4s, v29.4s\n"
-    "str s0, [x7]\n"
-    "fmax v23.4s, v23.4s, v29.4s\n"
-    "str s18, [x7, %[output_col_stride1]]\n"
-    "fmax v24.4s, v24.4s, v29.4s\n"
-    "str s23, [x7, x19]\n"
-    "add x16, x16, #4\n"
-    "str s24, [x7, x20]\n"
-    "add x17, x17, #4\n"
-    "add x7, x7, #4\n"
-    "7:\n"
-    : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
-    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
-  );
-}
-
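For orientation before the next removed definition: this `execute_tile<ActivationFunction::ReLU>` specialisation reads a 6x6 grid of per-position input pointers and fills a 4x4 grid of output pointers, i.e. a 3x3 depthwise kernel at unit stride, with the ReLU applied by the `fmax`-against-zero sequences before each store. A minimal scalar sketch of the equivalent computation follows; it is not part of this patch, and it assumes the per-channel weight/bias packing visible in the kernel's scalar tail (one bias followed by nine weights per channel, advancing `wbptr` by 40 bytes), with the tap ordering of the nine weights chosen purely for illustration.

// Hypothetical reference implementation, for illustration only.
static void reference_tile_relu_sketch(int n_channels,
                                       const void *weight_bias_ptr,
                                       const float *inptrs[6][6],
                                       float *outptrs[4][4])
{
  const float *wb = static_cast<const float *>(weight_bias_ptr);
  for (int c = 0; c < n_channels; c++)
  {
    // Assumed packing: [bias, w0..w8] per channel, as in the scalar tail.
    const float bias = wb[10 * c];
    const float *w = wb + 10 * c + 1;
    for (int oi = 0; oi < 4; oi++)
    {
      for (int oj = 0; oj < 4; oj++)
      {
        float acc = bias;
        // 3x3 window over the 6x6 input tile, unit stride.
        for (int ki = 0; ki < 3; ki++)
          for (int kj = 0; kj < 3; kj++)
            acc += inptrs[oi + ki][oj + kj][c] * w[3 * ki + kj];
        // ReLU, matching the fmax-with-zero in the assembly.
        outptrs[oi][oj][c] = (acc > 0.0f) ? acc : 0.0f;
      }
    }
  }
}

The assembly below performs the same accumulations with NEON `fmla`, processing four channels per vector iteration (`lsr x26, %[n_channels], #2`) and handling the remaining channels (`and x19, %[n_channels], #3`) in a scalar tail loop.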
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *inptrs[6][6],
-  float *outptrs[4][4]
-)
-{
-  __asm __volatile(
-    "mov x27, xzr\n"
-    "mov x28, xzr\n"
-    "and x19, %[n_channels], #3\n"
-    "lsr x26, %[n_channels], #2\n"
-    "cbz x26, 4f\n"
-    "1:\n"
-    "ldr q25, [%[wbptr]]\n"
-    "ldr x25, [%[inptrs], 0]\n"
-    "mov v2.16b, v25.16b\n"
-    "ldr q22, [%[wbptr], #16]\n"
-    "mov v16.16b, v25.16b\n"
-    "ldr q9, [%[wbptr], #32]\n"
-    "mov v18.16b, v25.16b\n"
-    "ldr q8, [%[wbptr], #48]\n"
-    "mov v13.16b, v25.16b\n"
-    "ldr q19, [%[wbptr], #64]\n"
-    "mov v0.16b, v25.16b\n"
-    "ldr q7, [%[wbptr], #80]\n"
-    "mov v17.16b, v25.16b\n"
-    "ldr q6, [%[wbptr], #96]\n"
-    "mov v14.16b, v25.16b\n"
-    "ldr q5, [%[wbptr], #112]\n"
-    "mov v12.16b, v25.16b\n"
-    "ldr q4, [%[wbptr], #128]\n"
-    "mov v15.16b, v25.16b\n"
-    "ldr q3, [%[wbptr], #144]\n"
-    "ldr q27, [x25, x27]\n"
-    "ldr x17, [%[inptrs], 48]\n"
-    "fmla v2.4s, v27.4s, v22.4s\n"
-    "ldr x25, [%[inptrs], 8]\n"
-    "ldr q26, [x17, x27]\n"
-    "ldr x24, [%[inptrs], 96]\n"
-    "fmla v16.4s, v26.4s, v22.4s\n"
-    "ldr q31, [x25, x27]\n"
-    "ldr q28, [x24, x27]\n"
-    "ldr x17, [%[inptrs], 56]\n"
-    "fmla v2.4s, v26.4s, v19.4s\n"
-    "ldr x25, [%[inptrs], 16]\n"
-    "ldr q29, [x17, x27]\n"
-    "ldr x7, [%[inptrs], 144]\n"
-    "ldr x24, [%[inptrs], 104]\n"
-    "subs x26, x26, #1\n"
-    "ldr q30, [x25, x27]\n"
-    "ldr q27, [x7, x27]\n"
-    "ldr q21, [x24, x27]\n"
-    "fmla v2.4s, v31.4s, v9.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "mov v1.16b, v25.16b\n"
-    "ldr x17, [%[inptrs], 64]\n"
-    "mov v10.16b, v25.16b\n"
-    "ldr x25, [%[inptrs], 24]\n"
-    "fmla v18.4s, v31.4s, v22.4s\n"
-    "ldr q23, [x17, x27]\n"
-    "fmla v2.4s, v28.4s, v5.4s\n"
-    "ldr x15, [%[inptrs], 192]\n"
-    "fmla v16.4s, v28.4s, v19.4s\n"
-    "ldr x7, [%[inptrs], 152]\n"
-    "fmla v13.4s, v28.4s, v22.4s\n"
-    "ldr q26, [x25, x27]\n"
-    "fmla v18.4s, v29.4s, v19.4s\n"
-    "ldr x24, [%[inptrs], 112]\n"
-    "fmla v2.4s, v29.4s, v7.4s\n"
-    "ldr x17, [%[inptrs], 72]\n"
-    "fmla v16.4s, v29.4s, v9.4s\n"
-    "ldr x25, [%[inptrs], 32]\n"
-    "fmla v0.4s, v29.4s, v22.4s\n"
-    "ldr q28, [x15, x27]\n"
-    "fmla v18.4s, v30.4s, v9.4s\n"
-    "ldr x16, [%[inptrs], 240]\n"
-    "fmla v2.4s, v30.4s, v8.4s\n"
-    "ldr x15, [%[inptrs], 200]\n"
-    "fmla v17.4s, v30.4s, v22.4s\n"
-    "ldr q29, [x7, x27]\n"
-    "fmla v16.4s, v27.4s, v5.4s\n"
-    "ldr x7, [%[inptrs], 160]\n"
-    "fmla v13.4s, v27.4s, v19.4s\n"
-    "ldr x20, [%[outptrs], 0]\n"
-    "fmla v14.4s, v27.4s, v22.4s\n"
-    "ldr q20, [x24, x27]\n"
-    "fmla v2.4s, v21.4s, v4.4s\n"
-    "ldr x24, [%[inptrs], 120]\n"
-    "fmla v16.4s, v21.4s, v7.4s\n"
-    "ldr x21, [%[outptrs], 32]\n"
-    "fmla v18.4s, v21.4s, v5.4s\n"
-    "ldr x22, [%[outptrs], 64]\n"
-    "fmla v13.4s, v21.4s, v9.4s\n"
-    "ldr x23, [%[outptrs], 96]\n"
-    "fmla v0.4s, v21.4s, v19.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v12.4s, v21.4s, v22.4s\n"
-    "ldr q24, [x17, x27]\n"
-    "fmla v2.4s, v23.4s, v6.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v16.4s, v23.4s, v8.4s\n"
-    "ldr x17, [%[inptrs], 80]\n"
-    "fmla v18.4s, v23.4s, v7.4s\n"
-    "subs x26, x26, #1\n"
-    "fmla v0.4s, v23.4s, v9.4s\n"
-    "fmla v17.4s, v23.4s, v19.4s\n"
-    "fmla v15.4s, v23.4s, v22.4s\n"
-    "ldr q23, [x25, x27]\n"
-    "fmla v1.4s, v26.4s, v22.4s\n"
-    "ldr x25, [%[inptrs], 40]\n"
-    "fmla v18.4s, v26.4s, v8.4s\n"
-    "fmla v13.4s, v28.4s, v5.4s\n"
-    "fmla v17.4s, v26.4s, v9.4s\n"
-    "ldr q30, [x16, x27]\n"
-    "fmla v14.4s, v28.4s, v19.4s\n"
-    "ldr q26, [x15, x27]\n"
-    "fmla v16.4s, v29.4s, v4.4s\n"
-    "ldr x16, [%[inptrs], 248]\n"
-    "fmla v13.4s, v29.4s, v7.4s\n"
-    "ldr x15, [%[inptrs], 208]\n"
-    "fmla v0.4s, v29.4s, v5.4s\n"
-    "fmla v12.4s, v29.4s, v19.4s\n"
-    "fmla v14.4s, v29.4s, v9.4s\n"
-    "fmla v10.4s, v29.4s, v22.4s\n"
-    "mov v11.16b, v25.16b\n"
-    "fmla v2.4s, v20.4s, v3.4s\n"
-    "fmla v16.4s, v20.4s, v6.4s\n"
-    "fmla v18.4s, v20.4s, v4.4s\n"
-    "fmla v13.4s, v20.4s, v8.4s\n"
-    "fmla v0.4s, v20.4s, v7.4s\n"
-    "fmla v17.4s, v20.4s, v5.4s\n"
-    "fmla v12.4s, v20.4s, v9.4s\n"
-    "fmla v15.4s, v20.4s, v19.4s\n"
-    "fmla v11.4s, v20.4s, v22.4s\n"
-    "mov v21.16b, v25.16b\n"
-    "fmla v18.4s, v24.4s, v6.4s\n"
-    "fmla v0.4s, v24.4s, v8.4s\n"
-    "fmla v1.4s, v24.4s, v19.4s\n"
-    "fmla v17.4s, v24.4s, v7.4s\n"
-    "fmla v14.4s, v30.4s, v5.4s\n"
-    "mov v20.16b, v25.16b\n"
-    "fmla v15.4s, v24.4s, v9.4s\n"
-    "fmla v21.4s, v24.4s, v22.4s\n"
-    "ldr q27, [x7, x27]\n"
-    "fmla v1.4s, v23.4s, v9.4s\n"
-    "ldr x7, [%[inptrs], 168]\n"
-    "fmla v17.4s, v23.4s, v8.4s\n"
-    "ldr q30, [x24, x27]\n"
-    "fmla v13.4s, v26.4s, v4.4s\n"
-    "ldr x24, [%[inptrs], 128]\n"
-    "fmla v14.4s, v26.4s, v7.4s\n"
-    "fmla v12.4s, v26.4s, v5.4s\n"
-    "fmla v10.4s, v26.4s, v19.4s\n"
-    "ldr q31, [x17, x27]\n"
-    "fmla v16.4s, v27.4s, v3.4s\n"
-    "ldr x17, [%[inptrs], 88]\n"
-    "fmla v13.4s, v27.4s, v6.4s\n"
-    "fmla v0.4s, v27.4s, v4.4s\n"
-    "fmla v14.4s, v27.4s, v8.4s\n"
-    "fmla v12.4s, v27.4s, v7.4s\n"
-    "fmla v15.4s, v27.4s, v5.4s\n"
-    "fmla v10.4s, v27.4s, v9.4s\n"
-    "fmla v11.4s, v27.4s, v19.4s\n"
-    "fmla v20.4s, v27.4s, v22.4s\n"
-    "mov v24.16b, v25.16b\n"
-    "mov v23.16b, v25.16b\n"
-    "fmla v18.4s, v30.4s, v3.4s\n"
-    "fmla v0.4s, v30.4s, v6.4s\n"
-    "fmla v17.4s, v30.4s, v4.4s\n"
-    "fmla v12.4s, v30.4s, v8.4s\n"
-    "fmla v15.4s, v30.4s, v7.4s\n"
-    "fmla v1.4s, v30.4s, v5.4s\n"
-    "fmla v11.4s, v30.4s, v9.4s\n"
-    "fmla v21.4s, v30.4s, v19.4s\n"
-    "fmla v24.4s, v30.4s, v22.4s\n"
-    "ldr q25, [x25, x27]\n"
-    "fmla v17.4s, v31.4s, v6.4s\n"
-    "ldr x25, [%[inptrs], 0]\n"
-    "fmla v15.4s, v31.4s, v8.4s\n"
-    "fmla v1.4s, v31.4s, v7.4s\n"
-    "fmla v21.4s, v31.4s, v9.4s\n"
-    "ldr q26, [x16, x27]\n"
-    "fmla v14.4s, v26.4s, v4.4s\n"
-    "ldr x16, [%[inptrs], 256]\n"
-    "fmla v10.4s, v26.4s, v5.4s\n"
-    "ldr q31, [x15, x27]\n"
-    "fmla v1.4s, v25.4s, v8.4s\n"
-    "ldr q29, [x7, x27]\n"
-    "fmla v13.4s, v31.4s, v3.4s\n"
-    "ldr x15, [%[inptrs], 216]\n"
-    "fmla v14.4s, v31.4s, v6.4s\n"
-    "ldr x7, [%[inptrs], 176]\n"
-    "fmla v12.4s, v31.4s, v4.4s\n"
-    "fmla v10.4s, v31.4s, v7.4s\n"
-    "fmla v11.4s, v31.4s, v5.4s\n"
-    "fmla v20.4s, v31.4s, v19.4s\n"
-    "fmla v0.4s, v29.4s, v3.4s\n"
-    "ldr q28, [x24, x27]\n"
-    "fmla v15.4s, v29.4s, v4.4s\n"
-    "ldr x24, [%[inptrs], 136]\n"
-    "fmla v12.4s, v29.4s, v6.4s\n"
-    "fmla v10.4s, v29.4s, v8.4s\n"
-    "fmla v11.4s, v29.4s, v7.4s\n"
-    "fmla v21.4s, v29.4s, v5.4s\n"
-    "fmla v20.4s, v29.4s, v9.4s\n"
-    "fmla v24.4s, v29.4s, v19.4s\n"
-    "fmla v23.4s, v29.4s, v22.4s\n"
-    "ldr q25, [x17, x27]\n"
-    "fmla v17.4s, v28.4s, v3.4s\n"
-    "ldr q29, [x16, x27]\n"
-    "fmla v15.4s, v28.4s, v6.4s\n"
-    "ldr x16, [%[inptrs], 264]\n"
-    "fmla v1.4s, v28.4s, v4.4s\n"
-    "ldr x17, [%[inptrs], 48]\n"
-    "fmla v11.4s, v28.4s, v8.4s\n"
-    "fmla v21.4s, v28.4s, v7.4s\n"
-    "fmla v24.4s, v28.4s, v9.4s\n"
-    "ldr q22, [x15, x27]\n"
-    "fmla v14.4s, v29.4s, v3.4s\n"
-    "ldr x15, [%[inptrs], 224]\n"
-    "fmla v1.4s, v25.4s, v6.4s\n"
-    "fmla v10.4s, v29.4s, v4.4s\n"
-    "fmla v21.4s, v25.4s, v8.4s\n"
-    "ldr q27, [x7, x27]\n"
-    "fmla v20.4s, v29.4s, v5.4s\n"
-    "ldr q26, [x24, x27]\n"
-    "fmla v12.4s, v22.4s, v3.4s\n"
-    "ldr x7, [%[inptrs], 184]\n"
-    "fmla v10.4s, v22.4s, v6.4s\n"
-    "ldr x24, [%[inptrs], 96]\n"
-    "fmla v11.4s, v22.4s, v4.4s\n"
-    "fmla v24.4s, v22.4s, v5.4s\n"
-    "fmla v20.4s, v22.4s, v7.4s\n"
-    "fmla v23.4s, v22.4s, v19.4s\n"
-    "fmla v15.4s, v27.4s, v3.4s\n"
-    "ldr q25, [x16, x27]\n"
-    "fmla v21.4s, v27.4s, v4.4s\n"
-    "ldr q31, [x15, x27]\n"
-    "fmla v11.4s, v27.4s, v6.4s\n"
-    "ldr x16, [%[inptrs], 272]\n"
-    "fmla v20.4s, v27.4s, v8.4s\n"
-    "ldr x15, [%[inptrs], 232]\n"
-    "fmla v24.4s, v27.4s, v7.4s\n"
-    "fmla v23.4s, v27.4s, v9.4s\n"
-    "fmla v1.4s, v26.4s, v3.4s\n"
-    "ldr q22, [x7, x27]\n"
-    "fmla v21.4s, v26.4s, v6.4s\n"
-    "ldr q19, [x16, x27]\n"
-    "fmla v10.4s, v25.4s, v3.4s\n"
-    "ldr x16, [%[inptrs], 280]\n"
-    "fmla v24.4s, v26.4s, v8.4s\n"
-    "ldr q28, [x15, x27]\n"
-    "fmla v20.4s, v25.4s, v4.4s\n"
-    "ldr x7, [%[inptrs], 144]\n"
-    "fmla v23.4s, v25.4s, v5.4s\n"
-    "ldr q30, [x16, x27]\n"
-    "fmla v11.4s, v31.4s, v3.4s\n"
-    "add x27, x27, #16\n"
-    "fmla v24.4s, v31.4s, v4.4s\n"
-    "ldr q27, [x25, x27]\n"
-    "fmla v20.4s, v31.4s, v6.4s\n"
-    "ldr x25, [%[inptrs], 8]\n"
-    "fmla v23.4s, v31.4s, v7.4s\n"
-    "movi v29.16b, #0\n"
-    "fmla v21.4s, v22.4s, v3.4s\n"
-    "ldr q26, [x17, x27]\n"
-    "fmla v24.4s, v22.4s, v6.4s\n"
-    "ldr x17, [%[inptrs], 56]\n"
-    "fmla v20.4s, v19.4s, v3.4s\n"
-    "fmax v2.4s, v2.4s, v29.4s\n"
-    "fmla v23.4s, v22.4s, v8.4s\n"
-    "ldr q25, [%[wbptr]]\n"
-    "fmax v18.4s, v18.4s, v29.4s\n"
-    "ldr q22, [%[wbptr], #16]\n"
-    "str q2, [x20, x28]\n"
-    "fmla v24.4s, v28.4s, v3.4s\n"
-    "fmax v17.4s, v17.4s, v29.4s\n"
-    "ldr q9, [%[wbptr], #32]\n"
-    "fmla v23.4s, v19.4s, v4.4s\n"
-    "ldr q8, [%[wbptr], #48]\n"
-    "fmax v1.4s, v1.4s, v29.4s\n"
-    "ldr q19, [%[wbptr], #64]\n"
-    "fmax v16.4s, v16.4s, v29.4s\n"
-    "ldr x20, [%[outptrs], 8]\n"
-    "fmax v0.4s, v0.4s, v29.4s\n"
-    "fmax v15.4s, v15.4s, v29.4s\n"
-    "str q18, [x20, x28]\n"
-    "fmla v23.4s, v28.4s, v6.4s\n"
-    "str q16, [x21, x28]\n"
-    "fmax v21.4s, v21.4s, v29.4s\n"
-    "fmax v13.4s, v13.4s, v29.4s\n"
-    "ldr q7, [%[wbptr], #80]\n"
-    "fmax v12.4s, v12.4s, v29.4s\n"
-    "ldr q5, [%[wbptr], #112]\n"
-    "fmla v23.4s, v30.4s, v3.4s\n"
-    "ldr q6, [%[wbptr], #96]\n"
-    "str q13, [x22, x28]\n"
-    "fmax v11.4s, v11.4s, v29.4s\n"
-    "fmax v24.4s, v24.4s, v29.4s\n"
-    "ldr q4, [%[wbptr], #128]\n"
-    "fmax v14.4s, v14.4s, v29.4s\n"
-    "ldr q31, [x25, x27]\n"
-    "fmax v10.4s, v10.4s, v29.4s\n"
-    "ldr q3, [%[wbptr], #144]\n"
-    "fmax v20.4s, v20.4s, v29.4s\n"
-    "ldr q28, [x24, x27]\n"
-    "str q14, [x23, x28]\n"
-    "fmax v23.4s, v23.4s, v29.4s\n"
-    "mov v2.16b, v25.16b\n"
-    "ldr q29, [x17, x27]\n"
-    "ldr x20, [%[outptrs], 16]\n"
-    "ldr x21, [%[outptrs], 40]\n"
-    "ldr x22, [%[outptrs], 72]\n"
-    "ldr x23, [%[outptrs], 104]\n"
-    "ldr x25, [%[inptrs], 16]\n"
-    "ldr x24, [%[inptrs], 104]\n"
-    "str q17, [x20, x28]\n"
-    "mov v16.16b, v25.16b\n"
-    "str q0, [x21, x28]\n"
-    "mov v18.16b, v25.16b\n"
-    "str q12, [x22, x28]\n"
-    "mov v13.16b, v25.16b\n"
-    "str q10, [x23, x28]\n"
-    "mov v0.16b, v25.16b\n"
-    "fmla v2.4s, v27.4s, v22.4s\n"
-    "ldr q30, [x25, x27]\n"
-    "fmla v16.4s, v26.4s, v22.4s\n"
-    "ldr x20, [%[outptrs], 24]\n"
-    "mov v17.16b, v25.16b\n"
-    "ldr x21, [%[outptrs], 48]\n"
-    "str q1, [x20, x28]\n"
-    "mov v14.16b, v25.16b\n"
-    "str q15, [x21, x28]\n"
-    "mov v12.16b, v25.16b\n"
-    "mov v15.16b, v25.16b\n"
-    "ldr x21, [%[outptrs], 56]\n"
-    "fmla v2.4s, v26.4s, v19.4s\n"
-    "ldr q27, [x7, x27]\n"
-    "str q21, [x21, x28]\n"
-    "ldr x22, [%[outptrs], 80]\n"
-    "ldr q21, [x24, x27]\n"
-    "ldr x23, [%[outptrs], 112]\n"
-    "str q11, [x22, x28]\n"
-    "fmla v2.4s, v31.4s, v9.4s\n"
-    "str q20, [x23, x28]\n"
-    "ldr x22, [%[outptrs], 88]\n"
-    "ldr x23, [%[outptrs], 120]\n"
-    "str q24, [x22, x28]\n"
-    "str q23, [x23, x28]\n"
-    "add x28, x28, #16\n"
-    "bne 2b\n"
-    "3:\n"
-    "mov v1.16b, v25.16b\n"
-    "ldr x17, [%[inptrs], 64]\n"
-    "mov v10.16b, v25.16b\n"
-    "ldr x25, [%[inptrs], 24]\n"
-    "mov v11.16b, v25.16b\n"
-    "ldr x15, [%[inptrs], 192]\n"
-    "fmla v18.4s, v31.4s, v22.4s\n"
-    "ldr q23, [x17, x27]\n"
-    "fmla v2.4s, v28.4s, v5.4s\n"
-    "ldr x7, [%[inptrs], 152]\n"
-    "fmla v16.4s, v28.4s, v19.4s\n"
-    "ldr x24, [%[inptrs], 112]\n"
-    "fmla v13.4s, v28.4s, v22.4s\n"
-    "ldr q26, [x25, x27]\n"
-    "fmla v18.4s, v29.4s, v19.4s\n"
-    "ldr x17, [%[inptrs], 72]\n"
-    "fmla v2.4s, v29.4s, v7.4s\n"
-    "ldr x25, [%[inptrs], 32]\n"
-    "fmla v16.4s, v29.4s, v9.4s\n"
-    "ldr x16, [%[inptrs], 240]\n"
-    "fmla v0.4s, v29.4s, v22.4s\n"
-    "ldr q28, [x15, x27]\n"
-    "fmla v18.4s, v30.4s, v9.4s\n"
-    "ldr x15, [%[inptrs], 200]\n"
-    "fmla v2.4s, v30.4s, v8.4s\n"
-    "ldr x20, [%[outptrs], 0]\n"
-    "fmla v17.4s, v30.4s, v22.4s\n"
-    "ldr q29, [x7, x27]\n"
-    "fmla v16.4s, v27.4s, v5.4s\n"
-    "ldr x7, [%[inptrs], 160]\n"
-    "fmla v13.4s, v27.4s, v19.4s\n"
-    "ldr x21, [%[outptrs], 32]\n"
-    "fmla v14.4s, v27.4s, v22.4s\n"
-    "ldr q20, [x24, x27]\n"
-    "fmla v2.4s, v21.4s, v4.4s\n"
-    "ldr x24, [%[inptrs], 120]\n"
-    "fmla v16.4s, v21.4s, v7.4s\n"
-    "ldr x22, [%[outptrs], 64]\n"
-    "fmla v18.4s, v21.4s, v5.4s\n"
-    "ldr x23, [%[outptrs], 96]\n"
-    "fmla v13.4s, v21.4s, v9.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v0.4s, v21.4s, v19.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v12.4s, v21.4s, v22.4s\n"
-    "ldr q24, [x17, x27]\n"
-    "fmla v2.4s, v23.4s, v6.4s\n"
-    "ldr x17, [%[inptrs], 80]\n"
-    "fmla v16.4s, v23.4s, v8.4s\n"
-    "fmla v18.4s, v23.4s, v7.4s\n"
-    "fmla v0.4s, v23.4s, v9.4s\n"
-    "fmla v17.4s, v23.4s, v19.4s\n"
-    "fmla v15.4s, v23.4s, v22.4s\n"
-    "ldr q23, [x25, x27]\n"
-    "fmla v1.4s, v26.4s, v22.4s\n"
-    "ldr x25, [%[inptrs], 40]\n"
-    "fmla v18.4s, v26.4s, v8.4s\n"
-    "fmla v13.4s, v28.4s, v5.4s\n"
-    "fmla v17.4s, v26.4s, v9.4s\n"
-    "ldr q30, [x16, x27]\n"
-    "fmla v14.4s, v28.4s, v19.4s\n"
-    "ldr q26, [x15, x27]\n"
-    "fmla v16.4s, v29.4s, v4.4s\n"
-    "ldr x16, [%[inptrs], 248]\n"
-    "fmla v13.4s, v29.4s, v7.4s\n"
-    "ldr x15, [%[inptrs], 208]\n"
-    "fmla v0.4s, v29.4s, v5.4s\n"
-    "fmla v12.4s, v29.4s, v19.4s\n"
-    "fmla v14.4s, v29.4s, v9.4s\n"
-    "fmla v10.4s, v29.4s, v22.4s\n"
-    "mov v21.16b, v25.16b\n"
-    "fmla v2.4s, v20.4s, v3.4s\n"
-    "fmla v16.4s, v20.4s, v6.4s\n"
-    "fmla v18.4s, v20.4s, v4.4s\n"
-    "fmla v13.4s, v20.4s, v8.4s\n"
-    "fmla v0.4s, v20.4s, v7.4s\n"
-    "fmla v17.4s, v20.4s, v5.4s\n"
-    "fmla v12.4s, v20.4s, v9.4s\n"
-    "fmla v15.4s, v20.4s, v19.4s\n"
-    "fmla v11.4s, v20.4s, v22.4s\n"
-    "mov v20.16b, v25.16b\n"
-    "fmla v18.4s, v24.4s, v6.4s\n"
-    "fmla v0.4s, v24.4s, v8.4s\n"
-    "fmla v1.4s, v24.4s, v19.4s\n"
-    "fmla v17.4s, v24.4s, v7.4s\n"
-    "fmla v21.4s, v24.4s, v22.4s\n"
-    "fmla v15.4s, v24.4s, v9.4s\n"
-    "ldr q27, [x7, x27]\n"
-    "fmla v14.4s, v30.4s, v5.4s\n"
-    "ldr q30, [x24, x27]\n"
-    "fmla v1.4s, v23.4s, v9.4s\n"
-    "ldr x7, [%[inptrs], 168]\n"
-    "fmla v17.4s, v23.4s, v8.4s\n"
-    "ldr q31, [x17, x27]\n"
-    "fmla v13.4s, v26.4s, v4.4s\n"
-    "ldr x24, [%[inptrs], 128]\n"
-    "fmla v14.4s, v26.4s, v7.4s\n"
-    "ldr x17, [%[inptrs], 88]\n"
-    "fmla v12.4s, v26.4s, v5.4s\n"
-    "fmla v10.4s, v26.4s, v19.4s\n"
-    "mov v24.16b, v25.16b\n"
-    "mov v23.16b, v25.16b\n"
-    "fmla v16.4s, v27.4s, v3.4s\n"
-    "fmla v13.4s, v27.4s, v6.4s\n"
-    "fmla v0.4s, v27.4s, v4.4s\n"
-    "fmla v14.4s, v27.4s, v8.4s\n"
-    "fmla v12.4s, v27.4s, v7.4s\n"
-    "fmla v15.4s, v27.4s, v5.4s\n"
-    "fmla v10.4s, v27.4s, v9.4s\n"
-    "fmla v11.4s, v27.4s, v19.4s\n"
-    "fmla v20.4s, v27.4s, v22.4s\n"
-    "ldr q25, [x25, x27]\n"
-    "fmla v18.4s, v30.4s, v3.4s\n"
-    "fmla v0.4s, v30.4s, v6.4s\n"
-    "fmla v17.4s, v30.4s, v4.4s\n"
-    "fmla v12.4s, v30.4s, v8.4s\n"
-    "fmla v15.4s, v30.4s, v7.4s\n"
-    "fmla v1.4s, v30.4s, v5.4s\n"
-    "fmla v11.4s, v30.4s, v9.4s\n"
-    "fmla v21.4s, v30.4s, v19.4s\n"
-    "fmla v24.4s, v30.4s, v22.4s\n"
-    "ldr q26, [x16, x27]\n"
-    "fmla v17.4s, v31.4s, v6.4s\n"
-    "ldr x16, [%[inptrs], 256]\n"
-    "fmla v15.4s, v31.4s, v8.4s\n"
-    "fmla v1.4s, v31.4s, v7.4s\n"
-    "fmla v21.4s, v31.4s, v9.4s\n"
-    "ldr q31, [x15, x27]\n"
-    "fmla v14.4s, v26.4s, v4.4s\n"
-    "ldr x15, [%[inptrs], 216]\n"
-    "fmla v10.4s, v26.4s, v5.4s\n"
-    "ldr q29, [x7, x27]\n"
-    "fmla v1.4s, v25.4s, v8.4s\n"
-    "ldr q28, [x24, x27]\n"
-    "fmla v13.4s, v31.4s, v3.4s\n"
-    "ldr x7, [%[inptrs], 176]\n"
-    "fmla v14.4s, v31.4s, v6.4s\n"
-    "ldr x24, [%[inptrs], 136]\n"
-    "fmla v12.4s, v31.4s, v4.4s\n"
-    "fmla v10.4s, v31.4s, v7.4s\n"
-    "fmla v11.4s, v31.4s, v5.4s\n"
-    "fmla v20.4s, v31.4s, v19.4s\n"
-    "fmla v0.4s, v29.4s, v3.4s\n"
-    "ldr q25, [x17, x27]\n"
-    "fmla v15.4s, v29.4s, v4.4s\n"
-    "fmla v21.4s, v29.4s, v5.4s\n"
-    "fmla v12.4s, v29.4s, v6.4s\n"
-    "fmla v10.4s, v29.4s, v8.4s\n"
-    "fmla v11.4s, v29.4s, v7.4s\n"
-    "fmla v20.4s, v29.4s, v9.4s\n"
-    "fmla v24.4s, v29.4s, v19.4s\n"
-    "fmla v23.4s, v29.4s, v22.4s\n"
-    "fmla v17.4s, v28.4s, v3.4s\n"
-    "ldr q29, [x16, x27]\n"
-    "fmla v15.4s, v28.4s, v6.4s\n"
-    "ldr q22, [x15, x27]\n"
-    "fmla v1.4s, v28.4s, v4.4s\n"
-    "ldr x16, [%[inptrs], 264]\n"
-    "fmla v11.4s, v28.4s, v8.4s\n"
-    "ldr x15, [%[inptrs], 224]\n"
-    "fmla v21.4s, v28.4s, v7.4s\n"
-    "fmla v24.4s, v28.4s, v9.4s\n"
-    "fmla v14.4s, v29.4s, v3.4s\n"
-    "ldr q27, [x7, x27]\n"
-    "fmla v1.4s, v25.4s, v6.4s\n"
-    "ldr x7, [%[inptrs], 184]\n"
-    "fmla v10.4s, v29.4s, v4.4s\n"
-    "fmla v20.4s, v29.4s, v5.4s\n"
-    "fmla v21.4s, v25.4s, v8.4s\n"
-    "ldr q26, [x24, x27]\n"
-    "fmla v12.4s, v22.4s, v3.4s\n"
-    "ldr q25, [x16, x27]\n"
-    "fmla v11.4s, v22.4s, v4.4s\n"
-    "ldr x16, [%[inptrs], 272]\n"
-    "fmla v10.4s, v22.4s, v6.4s\n"
-    "fmla v20.4s, v22.4s, v7.4s\n"
-    "fmla v24.4s, v22.4s, v5.4s\n"
-    "fmla v23.4s, v22.4s, v19.4s\n"
-    "fmla v15.4s, v27.4s, v3.4s\n"
-    "ldr q31, [x15, x27]\n"
-    "fmla v11.4s, v27.4s, v6.4s\n"
-    "ldr q22, [x7, x27]\n"
-    "fmla v21.4s, v27.4s, v4.4s\n"
-    "ldr x15, [%[inptrs], 232]\n"
-    "fmla v20.4s, v27.4s, v8.4s\n"
-    "fmla v24.4s, v27.4s, v7.4s\n"
-    "fmla v23.4s, v27.4s, v9.4s\n"
-    "ldr q19, [x16, x27]\n"
-    "fmla v1.4s, v26.4s, v3.4s\n"
-    "ldr q28, [x15, x27]\n"
-    "fmla v21.4s, v26.4s, v6.4s\n"
-    "ldr x16, [%[inptrs], 280]\n"
-    "fmla v24.4s, v26.4s, v8.4s\n"
-    "fmla v10.4s, v25.4s, v3.4s\n"
-    "fmla v20.4s, v25.4s, v4.4s\n"
-    "ldr q30, [x16, x27]\n"
-    "fmla v23.4s, v25.4s, v5.4s\n"
-    "add x27, x27, #16\n"
-    "fmla v11.4s, v31.4s, v3.4s\n"
-    "fmla v21.4s, v22.4s, v3.4s\n"
-    "fmla v24.4s, v31.4s, v4.4s\n"
-    "movi v29.16b, #0\n"
-    "fmla v20.4s, v31.4s, v6.4s\n"
-    "fmla v23.4s, v31.4s, v7.4s\n"
-    "fmax v2.4s, v2.4s, v29.4s\n"
-    "fmax v18.4s, v18.4s, v29.4s\n"
-    "fmla v24.4s, v22.4s, v6.4s\n"
-    "fmax v17.4s, v17.4s, v29.4s\n"
-    "fmla v20.4s, v19.4s, v3.4s\n"
-    "fmax v1.4s, v1.4s, v29.4s\n"
-    "str q2, [x20, x28]\n"
-    "fmla v23.4s, v22.4s, v8.4s\n"
-    "fmax v16.4s, v16.4s, v29.4s\n"
-    "ldr x20, [%[outptrs], 8]\n"
-    "fmla v24.4s, v28.4s, v3.4s\n"
-    "fmax v0.4s, v0.4s, v29.4s\n"
-    "str q18, [x20, x28]\n"
-    "fmax v15.4s, v15.4s, v29.4s\n"
-    "str q16, [x21, x28]\n"
-    "fmla v23.4s, v19.4s, v4.4s\n"
-    "fmax v21.4s, v21.4s, v29.4s\n"
-    "ldr x20, [%[outptrs], 16]\n"
-    "fmax v13.4s, v13.4s, v29.4s\n"
-    "ldr x21, [%[outptrs], 40]\n"
-    "str q17, [x20, x28]\n"
-    "fmax v12.4s, v12.4s, v29.4s\n"
-    "str q0, [x21, x28]\n"
-    "fmla v23.4s, v28.4s, v6.4s\n"
-    "str q13, [x22, x28]\n"
-    "fmax v11.4s, v11.4s, v29.4s\n"
-    "fmax v24.4s, v24.4s, v29.4s\n"
-    "ldr x20, [%[outptrs], 24]\n"
-    "fmax v14.4s, v14.4s, v29.4s\n"
-    "ldr x21, [%[outptrs], 48]\n"
-    "str q1, [x20, x28]\n"
-    "fmla v23.4s, v30.4s, v3.4s\n"
-    "str q15, [x21, x28]\n"
-    "fmax v10.4s, v10.4s, v29.4s\n"
-    "str q14, [x23, x28]\n"
-    "fmax v20.4s, v20.4s, v29.4s\n"
-    "ldr x21, [%[outptrs], 56]\n"
-    "ldr x22, [%[outptrs], 72]\n"
-    "ldr x23, [%[outptrs], 104]\n"
-    "fmax v23.4s, v23.4s, v29.4s\n"
-    "str q21, [x21, x28]\n"
-    "str q12, [x22, x28]\n"
-    "str q10, [x23, x28]\n"
-    "ldr x22, [%[outptrs], 80]\n"
-    "ldr x23, [%[outptrs], 112]\n"
-    "str q11, [x22, x28]\n"
-    "str q20, [x23, x28]\n"
-    "ldr x22, [%[outptrs], 88]\n"
-    "ldr x23, [%[outptrs], 120]\n"
-    "str q24, [x22, x28]\n"
-    "str q23, [x23, x28]\n"
-    "add x28, x28, #16\n"
-    "4:\n"
-    "cbz x19, 7f\n"
-    "ldr s25, [%[wbptr]]\n"
-    "mov v2.16b, v25.16b\n"
-    "ldr s22, [%[wbptr], #4]\n"
-    "mov v16.16b, v25.16b\n"
-    "ldr s9, [%[wbptr], #8]\n"
-    "mov v18.16b, v25.16b\n"
-    "ldr s8, [%[wbptr], #12]\n"
-    "mov v13.16b, v25.16b\n"
-    "ldr s19, [%[wbptr], #16]\n"
-    "mov v0.16b, v25.16b\n"
-    "ldr s7, [%[wbptr], #20]\n"
-    "mov v17.16b, v25.16b\n"
-    "ldr s6, [%[wbptr], #24]\n"
-    "mov v14.16b, v25.16b\n"
-    "ldr s5, [%[wbptr], #28]\n"
-    "mov v12.16b, v25.16b\n"
-    "ldr s4, [%[wbptr], #32]\n"
-    "mov v15.16b, v25.16b\n"
-    "ldr s3, [%[wbptr], #36]\n"
-    "ldr x25, [%[inptrs], 0]\n"
-    "ldr x17, [%[inptrs], 48]\n"
-    "ldr x24, [%[inptrs], 96]\n"
-    "ldr x7, [%[inptrs], 144]\n"
-    "subs x19, x19, #1\n"
-    "ldr s27, [x25, x27]\n"
-    "fmla v2.4s, v27.4s, v22.4s\n"
-    "ldr s26, [x17, x27]\n"
-    "fmla v16.4s, v26.4s, v22.4s\n"
-    "ldr s28, [x24, x27]\n"
-    "ldr s27, [x7, x27]\n"
-    "ldr x25, [%[inptrs], 8]\n"
-    "ldr x17, [%[inptrs], 56]\n"
-    "ldr x24, [%[inptrs], 104]\n"
-    "ldr s31, [x25, x27]\n"
-    "fmla v2.4s, v26.4s, v19.4s\n"
-    "ldr s29, [x17, x27]\n"
-    "ldr s21, [x24, x27]\n"
-    "ldr x25, [%[inptrs], 16]\n"
-    "ldr s30, [x25, x27]\n"
-    "fmla v2.4s, v31.4s, v9.4s\n"
-    "beq 6f\n"
-    "5:\n"
-    "mov v1.16b, v25.16b\n"
-    "ldr x17, [%[inptrs], 64]\n"
-    "mov v10.16b, v25.16b\n"
-    "ldr x25, [%[inptrs], 24]\n"
-    "fmla v18.4s, v31.4s, v22.4s\n"
-    "ldr s23, [x17, x27]\n"
-    "fmla v2.4s, v28.4s, v5.4s\n"
-    "ldr x15, [%[inptrs], 192]\n"
-    "fmla v16.4s, v28.4s, v19.4s\n"
-    "ldr x7, [%[inptrs], 152]\n"
-    "fmla v13.4s, v28.4s, v22.4s\n"
-    "ldr s26, [x25, x27]\n"
-    "fmla v18.4s, v29.4s, v19.4s\n"
-    "ldr x24, [%[inptrs], 112]\n"
-    "fmla v2.4s, v29.4s, v7.4s\n"
-    "ldr x17, [%[inptrs], 72]\n"
-    "fmla v16.4s, v29.4s, v9.4s\n"
-    "ldr x25, [%[inptrs], 32]\n"
-    "fmla v0.4s, v29.4s, v22.4s\n"
-    "ldr s28, [x15, x27]\n"
-    "fmla v18.4s, v30.4s, v9.4s\n"
-    "ldr x16, [%[inptrs], 240]\n"
-    "fmla v2.4s, v30.4s, v8.4s\n"
-    "ldr x15, [%[inptrs], 200]\n"
-    "fmla v17.4s, v30.4s, v22.4s\n"
-    "ldr s29, [x7, x27]\n"
-    "fmla v16.4s, v27.4s, v5.4s\n"
-    "ldr x7, [%[inptrs], 160]\n"
-    "fmla v13.4s, v27.4s, v19.4s\n"
-    "ldr x20, [%[outptrs], 0]\n"
-    "fmla v14.4s, v27.4s, v22.4s\n"
-    "ldr s20, [x24, x27]\n"
-    "fmla v2.4s, v21.4s, v4.4s\n"
-    "ldr x24, [%[inptrs], 120]\n"
-    "fmla v16.4s, v21.4s, v7.4s\n"
-    "ldr x21, [%[outptrs], 32]\n"
-    "fmla v18.4s, v21.4s, v5.4s\n"
-    "ldr x22, [%[outptrs], 64]\n"
-    "fmla v13.4s, v21.4s, v9.4s\n"
-    "ldr x23, [%[outptrs], 96]\n"
-    "fmla v0.4s, v21.4s, v19.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v12.4s, v21.4s, v22.4s\n"
-    "ldr s24, [x17, x27]\n"
-    "fmla v2.4s, v23.4s, v6.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v16.4s, v23.4s, v8.4s\n"
-    "ldr x17, [%[inptrs], 80]\n"
-    "fmla v18.4s, v23.4s, v7.4s\n"
-    "subs x19, x19, #1\n"
-    "fmla v0.4s, v23.4s, v9.4s\n"
-    "fmla v17.4s, v23.4s, v19.4s\n"
-    "fmla v15.4s, v23.4s, v22.4s\n"
-    "ldr s23, [x25, x27]\n"
-    "fmla v1.4s, v26.4s, v22.4s\n"
-    "ldr x25, [%[inptrs], 40]\n"
-    "fmla v18.4s, v26.4s, v8.4s\n"
-    "fmla v13.4s, v28.4s, v5.4s\n"
-    "fmla v17.4s, v26.4s, v9.4s\n"
-    "ldr s30, [x16, x27]\n"
-    "fmla v14.4s, v28.4s, v19.4s\n"
-    "ldr s26, [x15, x27]\n"
-    "fmla v16.4s, v29.4s, v4.4s\n"
-    "ldr x16, [%[inptrs], 248]\n"
-    "fmla v13.4s, v29.4s, v7.4s\n"
-    "ldr x15, [%[inptrs], 208]\n"
-    "fmla v0.4s, v29.4s, v5.4s\n"
-    "fmla v12.4s, v29.4s, v19.4s\n"
-    "fmla v14.4s, v29.4s, v9.4s\n"
-    "fmla v10.4s, v29.4s, v22.4s\n"
-    "mov v11.16b, v25.16b\n"
-    "fmla v2.4s, v20.4s, v3.4s\n"
-    "fmla v16.4s, v20.4s, v6.4s\n"
-    "fmla v18.4s, v20.4s, v4.4s\n"
-    "fmla v13.4s, v20.4s, v8.4s\n"
-    "fmla v0.4s, v20.4s, v7.4s\n"
-    "fmla v17.4s, v20.4s, v5.4s\n"
-    "fmla v12.4s, v20.4s, v9.4s\n"
-    "fmla v15.4s, v20.4s, v19.4s\n"
-    "fmla v11.4s, v20.4s, v22.4s\n"
-    "mov v21.16b, v25.16b\n"
-    "fmla v18.4s, v24.4s, v6.4s\n"
-    "fmla v0.4s, v24.4s, v8.4s\n"
-    "fmla v1.4s, v24.4s, v19.4s\n"
-    "fmla v17.4s, v24.4s, v7.4s\n"
-    "fmla v14.4s, v30.4s, v5.4s\n"
-    "mov v20.16b, v25.16b\n"
-    "fmla v15.4s, v24.4s, v9.4s\n"
-    "fmla v21.4s, v24.4s, v22.4s\n"
-    "ldr s27, [x7, x27]\n"
-    "fmla v1.4s, v23.4s, v9.4s\n"
-    "ldr x7, [%[inptrs], 168]\n"
-    "fmla v17.4s, v23.4s, v8.4s\n"
-    "ldr s30, [x24, x27]\n"
-    "fmla v13.4s, v26.4s, v4.4s\n"
-    "ldr x24, [%[inptrs], 128]\n"
-    "fmla v14.4s, v26.4s, v7.4s\n"
-    "fmla v12.4s, v26.4s, v5.4s\n"
-    "fmla v10.4s, v26.4s, v19.4s\n"
-    "ldr s31, [x17, x27]\n"
-    "fmla v16.4s, v27.4s, v3.4s\n"
-    "ldr x17, [%[inptrs], 88]\n"
-    "fmla v13.4s, v27.4s, v6.4s\n"
-    "fmla v0.4s, v27.4s, v4.4s\n"
-    "fmla v14.4s, v27.4s, v8.4s\n"
-    "fmla v12.4s, v27.4s, v7.4s\n"
-    "fmla v15.4s, v27.4s, v5.4s\n"
-    "fmla v10.4s, v27.4s, v9.4s\n"
-    "fmla v11.4s, v27.4s, v19.4s\n"
-    "fmla v20.4s, v27.4s, v22.4s\n"
-    "mov v24.16b, v25.16b\n"
-    "mov v23.16b, v25.16b\n"
-    "fmla v18.4s, v30.4s, v3.4s\n"
-    "fmla v0.4s, v30.4s, v6.4s\n"
-    "fmla v17.4s, v30.4s, v4.4s\n"
-    "fmla v12.4s, v30.4s, v8.4s\n"
-    "fmla v15.4s, v30.4s, v7.4s\n"
-    "fmla v1.4s, v30.4s, v5.4s\n"
-    "fmla v11.4s, v30.4s, v9.4s\n"
-    "fmla v21.4s, v30.4s, v19.4s\n"
-    "fmla v24.4s, v30.4s, v22.4s\n"
-    "ldr s25, [x25, x27]\n"
-    "fmla v17.4s, v31.4s, v6.4s\n"
-    "ldr x25, [%[inptrs], 0]\n"
-    "fmla v15.4s, v31.4s, v8.4s\n"
-    "fmla v1.4s, v31.4s, v7.4s\n"
-    "fmla v21.4s, v31.4s, v9.4s\n"
-    "ldr s26, [x16, x27]\n"
-    "fmla v14.4s, v26.4s, v4.4s\n"
-    "ldr x16, [%[inptrs], 256]\n"
-    "fmla v10.4s, v26.4s, v5.4s\n"
-    "ldr s31, [x15, x27]\n"
-    "fmla v1.4s, v25.4s, v8.4s\n"
-    "ldr s29, [x7, x27]\n"
-    "fmla v13.4s, v31.4s, v3.4s\n"
-    "ldr x15, [%[inptrs], 216]\n"
-    "fmla v14.4s, v31.4s, v6.4s\n"
-    "ldr x7, [%[inptrs], 176]\n"
-    "fmla v12.4s, v31.4s, v4.4s\n"
-    "fmla v10.4s, v31.4s, v7.4s\n"
-    "fmla v11.4s, v31.4s, v5.4s\n"
-    "fmla v20.4s, v31.4s, v19.4s\n"
-    "fmla v0.4s, v29.4s, v3.4s\n"
-    "ldr s28, [x24, x27]\n"
-    "fmla v15.4s, v29.4s, v4.4s\n"
-    "ldr x24, [%[inptrs], 136]\n"
-    "fmla v12.4s, v29.4s, v6.4s\n"
-    "fmla v10.4s, v29.4s, v8.4s\n"
-    "fmla v11.4s, v29.4s, v7.4s\n"
-    "fmla v21.4s, v29.4s, v5.4s\n"
-    "fmla v20.4s, v29.4s, v9.4s\n"
-    "fmla v24.4s, v29.4s, v19.4s\n"
-    "fmla v23.4s, v29.4s, v22.4s\n"
-    "ldr s25, [x17, x27]\n"
-    "fmla v17.4s, v28.4s, v3.4s\n"
-    "ldr s29, [x16, x27]\n"
-    "fmla v15.4s, v28.4s, v6.4s\n"
-    "ldr x16, [%[inptrs], 264]\n"
-    "fmla v1.4s, v28.4s, v4.4s\n"
-    "ldr x17, [%[inptrs], 48]\n"
-    "fmla v11.4s, v28.4s, v8.4s\n"
-    "fmla v21.4s, v28.4s, v7.4s\n"
-    "fmla v24.4s, v28.4s, v9.4s\n"
-    "ldr s22, [x15, x27]\n"
-    "fmla v14.4s, v29.4s, v3.4s\n"
-    "ldr x15, [%[inptrs], 224]\n"
-    "fmla v1.4s, v25.4s, v6.4s\n"
-    "fmla v10.4s, v29.4s, v4.4s\n"
-    "fmla v21.4s, v25.4s, v8.4s\n"
-    "ldr s27, [x7, x27]\n"
-    "fmla v20.4s, v29.4s, v5.4s\n"
-    "ldr s26, [x24, x27]\n"
-    "fmla v12.4s, v22.4s, v3.4s\n"
-    "ldr x7, [%[inptrs], 184]\n"
-    "fmla v10.4s, v22.4s, v6.4s\n"
-    "ldr x24, [%[inptrs], 96]\n"
-    "fmla v11.4s, v22.4s, v4.4s\n"
-    "fmla v24.4s, v22.4s, v5.4s\n"
-    "fmla v20.4s, v22.4s, v7.4s\n"
-    "fmla v23.4s, v22.4s, v19.4s\n"
-    "fmla v15.4s, v27.4s, v3.4s\n"
-    "ldr s25, [x16, x27]\n"
-    "fmla v21.4s, v27.4s, v4.4s\n"
-    "ldr s31, [x15, x27]\n"
-    "fmla v11.4s, v27.4s, v6.4s\n"
-    "ldr x16, [%[inptrs], 272]\n"
-    "fmla v20.4s, v27.4s, v8.4s\n"
-    "ldr x15, [%[inptrs], 232]\n"
-    "fmla v24.4s, v27.4s, v7.4s\n"
-    "fmla v23.4s, v27.4s, v9.4s\n"
-    "fmla v1.4s, v26.4s, v3.4s\n"
-    "ldr s22, [x7, x27]\n"
-    "fmla v21.4s, v26.4s, v6.4s\n"
-    "ldr s19, [x16, x27]\n"
-    "fmla v10.4s, v25.4s, v3.4s\n"
-    "ldr x16, [%[inptrs], 280]\n"
-    "fmla v24.4s, v26.4s, v8.4s\n"
-    "ldr s28, [x15, x27]\n"
-    "fmla v20.4s, v25.4s, v4.4s\n"
-    "ldr x7, [%[inptrs], 144]\n"
-    "fmla v23.4s, v25.4s, v5.4s\n"
-    "ldr s30, [x16, x27]\n"
-    "fmla v11.4s, v31.4s, v3.4s\n"
-    "add x27, x27, #4\n"
-    "fmla v24.4s, v31.4s, v4.4s\n"
-    "ldr s27, [x25, x27]\n"
-    "fmla v20.4s, v31.4s, v6.4s\n"
-    "ldr x25, [%[inptrs], 8]\n"
-    "fmla v23.4s, v31.4s, v7.4s\n"
-    "movi v29.16b, #0\n"
-    "fmla v21.4s, v22.4s, v3.4s\n"
-    "ldr s26, [x17, x27]\n"
-    "fmla v24.4s, v22.4s, v6.4s\n"
-    "ldr x17, [%[inptrs], 56]\n"
-    "fmla v20.4s, v19.4s, v3.4s\n"
-    "fmax v2.4s, v2.4s, v29.4s\n"
-    "fmla v23.4s, v22.4s, v8.4s\n"
-    "ldr s25, [%[wbptr]]\n"
-    "fmax v18.4s, v18.4s, v29.4s\n"
-    "ldr s22, [%[wbptr], #4]\n"
-    "str s2, [x20, x28]\n"
-    "fmla v24.4s, v28.4s, v3.4s\n"
-    "fmax v17.4s, v17.4s, v29.4s\n"
-    "ldr s9, [%[wbptr], #8]\n"
-    "fmla v23.4s, v19.4s, v4.4s\n"
-    "ldr s8, [%[wbptr], #12]\n"
-    "fmax v1.4s, v1.4s, v29.4s\n"
-    "ldr s19, [%[wbptr], #16]\n"
-    "fmax v16.4s, v16.4s, v29.4s\n"
-    "ldr x20, [%[outptrs], 8]\n"
-    "fmax v0.4s, v0.4s, v29.4s\n"
-    "fmax v15.4s, v15.4s, v29.4s\n"
-    "str s18, [x20, x28]\n"
-    "fmla v23.4s, v28.4s, v6.4s\n"
-    "str s16, [x21, x28]\n"
-    "fmax v21.4s, v21.4s, v29.4s\n"
-    "fmax v13.4s, v13.4s, v29.4s\n"
-    "ldr s7, [%[wbptr], #20]\n"
-    "fmax v12.4s, v12.4s, v29.4s\n"
-    "ldr s5, [%[wbptr], #28]\n"
-    "fmla v23.4s, v30.4s, v3.4s\n"
-    "ldr s6, [%[wbptr], #24]\n"
-    "str s13, [x22, x28]\n"
-    "fmax v11.4s, v11.4s, v29.4s\n"
-    "fmax v24.4s, v24.4s, v29.4s\n"
-    "ldr s4, [%[wbptr], #32]\n"
-    "fmax v14.4s, v14.4s, v29.4s\n"
-    "ldr s31, [x25, x27]\n"
-    "fmax v10.4s, v10.4s, v29.4s\n"
-    "ldr s3, [%[wbptr], #36]\n"
-    "fmax v20.4s, v20.4s, v29.4s\n"
-    "ldr s28, [x24, x27]\n"
-    "str s14, [x23, x28]\n"
-    "fmax v23.4s, v23.4s, v29.4s\n"
-    "mov v2.16b, v25.16b\n"
-    "ldr s29, [x17, x27]\n"
-    "ldr x20, [%[outptrs], 16]\n"
-    "ldr x21, [%[outptrs], 40]\n"
-    "ldr x22, [%[outptrs], 72]\n"
-    "ldr x23, [%[outptrs], 104]\n"
-    "ldr x25, [%[inptrs], 16]\n"
-    "ldr x24, [%[inptrs], 104]\n"
-    "str s17, [x20, x28]\n"
-    "mov v16.16b, v25.16b\n"
-    "str s0, [x21, x28]\n"
-    "mov v18.16b, v25.16b\n"
-    "str s12, [x22, x28]\n"
-    "mov v13.16b, v25.16b\n"
-    "str s10, [x23, x28]\n"
-    "mov v0.16b, v25.16b\n"
-    "fmla v2.4s, v27.4s, v22.4s\n"
-    "ldr s30, [x25, x27]\n"
-    "fmla v16.4s, v26.4s, v22.4s\n"
-    "ldr x20, [%[outptrs], 24]\n"
-    "mov v17.16b, v25.16b\n"
-    "ldr x21, [%[outptrs], 48]\n"
-    "str s1, [x20, x28]\n"
-    "mov v14.16b, v25.16b\n"
-    "str s15, [x21, x28]\n"
-    "mov v12.16b, v25.16b\n"
-    "mov v15.16b, v25.16b\n"
-    "ldr x21, [%[outptrs], 56]\n"
-    "fmla v2.4s, v26.4s, v19.4s\n"
-    "ldr s27, [x7, x27]\n"
-    "str s21, [x21, x28]\n"
-    "ldr x22, [%[outptrs], 80]\n"
-    "ldr s21, [x24, x27]\n"
-    "ldr x23, [%[outptrs], 112]\n"
-    "str s11, [x22, x28]\n"
-    "fmla v2.4s, v31.4s, v9.4s\n"
-    "str s20, [x23, x28]\n"
-    "ldr x22, [%[outptrs], 88]\n"
-    "ldr x23, [%[outptrs], 120]\n"
-    "str s24, [x22, x28]\n"
-    "str s23, [x23, x28]\n"
-    "add x28, x28, #4\n"
-    "bne 5b\n"
-    "6:\n"
-    "mov v1.16b, v25.16b\n"
-    "ldr x17, [%[inptrs], 64]\n"
-    "mov v10.16b, v25.16b\n"
-    "ldr x25, [%[inptrs], 24]\n"
-    "mov v11.16b, v25.16b\n"
-    "ldr x15, [%[inptrs], 192]\n"
-    "fmla v18.4s, v31.4s, v22.4s\n"
-    "ldr s23, [x17, x27]\n"
-    "fmla v2.4s, v28.4s, v5.4s\n"
-    "ldr x7, [%[inptrs], 152]\n"
-    "fmla v16.4s, v28.4s, v19.4s\n"
-    "ldr x24, [%[inptrs], 112]\n"
-    "fmla v13.4s, v28.4s, v22.4s\n"
-    "ldr s26, [x25, x27]\n"
-    "fmla v18.4s, v29.4s, v19.4s\n"
-    "ldr x17, [%[inptrs], 72]\n"
-    "fmla v2.4s, v29.4s, v7.4s\n"
-    "ldr x25, [%[inptrs], 32]\n"
-    "fmla v16.4s, v29.4s, v9.4s\n"
-    "ldr x16, [%[inptrs], 240]\n"
-    "fmla v0.4s, v29.4s, v22.4s\n"
-    "ldr s28, [x15, x27]\n"
-    "fmla v18.4s, v30.4s, v9.4s\n"
-    "ldr x15, [%[inptrs], 200]\n"
-    "fmla v2.4s, v30.4s, v8.4s\n"
-    "ldr x20, [%[outptrs], 0]\n"
-    "fmla v17.4s, v30.4s, v22.4s\n"
-    "ldr s29, [x7, x27]\n"
-    "fmla v16.4s, v27.4s, v5.4s\n"
-    "ldr x7, [%[inptrs], 160]\n"
-    "fmla v13.4s, v27.4s, v19.4s\n"
-    "ldr x21, [%[outptrs], 32]\n"
-    "fmla v14.4s, v27.4s, v22.4s\n"
-    "ldr s20, [x24, x27]\n"
-    "fmla v2.4s, v21.4s, v4.4s\n"
-    "ldr x24, [%[inptrs], 120]\n"
-    "fmla v16.4s, v21.4s, v7.4s\n"
-    "ldr x22, [%[outptrs], 64]\n"
-    "fmla v18.4s, v21.4s, v5.4s\n"
-    "ldr x23, [%[outptrs], 96]\n"
-    "fmla v13.4s, v21.4s, v9.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v0.4s, v21.4s, v19.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v12.4s, v21.4s, v22.4s\n"
-    "ldr s24, [x17, x27]\n"
-    "fmla v2.4s, v23.4s, v6.4s\n"
-    "ldr x17, [%[inptrs], 80]\n"
-    "fmla v16.4s, v23.4s, v8.4s\n"
-    "fmla v18.4s, v23.4s, v7.4s\n"
-    "fmla v0.4s, v23.4s, v9.4s\n"
-    "fmla v17.4s, v23.4s, v19.4s\n"
-    "fmla v15.4s, v23.4s, v22.4s\n"
-    "ldr s23, [x25, x27]\n"
-    "fmla v1.4s, v26.4s, v22.4s\n"
-    "ldr x25, [%[inptrs], 40]\n"
-    "fmla v18.4s, v26.4s, v8.4s\n"
-    "fmla v13.4s, v28.4s, v5.4s\n"
-    "fmla v17.4s, v26.4s, v9.4s\n"
-    "ldr s30, [x16, x27]\n"
-    "fmla v14.4s, v28.4s, v19.4s\n"
-    "ldr s26, [x15, x27]\n"
-    "fmla v16.4s, v29.4s, v4.4s\n"
-    "ldr x16, [%[inptrs], 248]\n"
-    "fmla v13.4s, v29.4s, v7.4s\n"
-    "ldr x15, [%[inptrs], 208]\n"
-    "fmla v0.4s, v29.4s, v5.4s\n"
-    "fmla v12.4s, v29.4s, v19.4s\n"
-    "fmla v14.4s, v29.4s, v9.4s\n"
-    "fmla v10.4s, v29.4s, v22.4s\n"
-    "mov v21.16b, v25.16b\n"
-    "fmla v2.4s, v20.4s, v3.4s\n"
-    "fmla v16.4s, v20.4s, v6.4s\n"
-    "fmla v18.4s, v20.4s, v4.4s\n"
-    "fmla v13.4s, v20.4s, v8.4s\n"
-    "fmla v0.4s, v20.4s, v7.4s\n"
-    "fmla v17.4s, v20.4s, v5.4s\n"
-    "fmla v12.4s, v20.4s, v9.4s\n"
-    "fmla v15.4s, v20.4s, v19.4s\n"
-    "fmla v11.4s, v20.4s, v22.4s\n"
-    "mov v20.16b, v25.16b\n"
-    "fmla v18.4s, v24.4s, v6.4s\n"
-    "fmla v0.4s, v24.4s, v8.4s\n"
-    "fmla v1.4s, v24.4s, v19.4s\n"
-    "fmla v17.4s, v24.4s, v7.4s\n"
-    "fmla v21.4s, v24.4s, v22.4s\n"
-    "fmla v15.4s, v24.4s, v9.4s\n"
-    "ldr s27, [x7, x27]\n"
-    "fmla v14.4s, v30.4s, v5.4s\n"
-    "ldr s30, [x24, x27]\n"
-    "fmla v1.4s, v23.4s, v9.4s\n"
-    "ldr x7, [%[inptrs], 168]\n"
-    "fmla v17.4s, v23.4s, v8.4s\n"
-    "ldr s31, [x17, x27]\n"
-    "fmla v13.4s, v26.4s, v4.4s\n"
-    "ldr x24, [%[inptrs], 128]\n"
-    "fmla v14.4s, v26.4s, v7.4s\n"
-    "ldr x17, [%[inptrs], 88]\n"
-    "fmla v12.4s, v26.4s, v5.4s\n"
-    "fmla v10.4s, v26.4s, v19.4s\n"
-    "mov v24.16b, v25.16b\n"
-    "mov v23.16b, v25.16b\n"
-    "fmla v16.4s, v27.4s, v3.4s\n"
-    "fmla v13.4s, v27.4s, v6.4s\n"
-    "fmla v0.4s, v27.4s, v4.4s\n"
-    "fmla v14.4s, v27.4s, v8.4s\n"
-    "fmla v12.4s, v27.4s, v7.4s\n"
-    "fmla v15.4s, v27.4s, v5.4s\n"
-    "fmla v10.4s, v27.4s, v9.4s\n"
-    "fmla v11.4s, v27.4s, v19.4s\n"
-    "fmla v20.4s, v27.4s, v22.4s\n"
-    "ldr s25, [x25, x27]\n"
-    "fmla v18.4s, v30.4s, v3.4s\n"
-    "fmla v0.4s, v30.4s, v6.4s\n"
-    "fmla v17.4s, v30.4s, v4.4s\n"
-    "fmla v12.4s, v30.4s, v8.4s\n"
-    "fmla v15.4s, v30.4s, v7.4s\n"
-    "fmla v1.4s, v30.4s, v5.4s\n"
-    "fmla v11.4s, v30.4s, v9.4s\n"
-    "fmla v21.4s, v30.4s, v19.4s\n"
-    "fmla v24.4s, v30.4s, v22.4s\n"
-    "ldr s26, [x16, x27]\n"
-    "fmla v17.4s, v31.4s, v6.4s\n"
-    "ldr x16, [%[inptrs], 256]\n"
-    "fmla v15.4s, v31.4s, v8.4s\n"
-    "fmla v1.4s, v31.4s, v7.4s\n"
-    "fmla v21.4s, v31.4s, v9.4s\n"
-    "ldr s31, [x15, x27]\n"
-    "fmla v14.4s, v26.4s, v4.4s\n"
-    "ldr x15, [%[inptrs], 216]\n"
-    "fmla v10.4s, v26.4s, v5.4s\n"
-    "ldr s29, [x7, x27]\n"
-    "fmla v1.4s, v25.4s, v8.4s\n"
-    "ldr s28, [x24, x27]\n"
-    "fmla v13.4s, v31.4s, v3.4s\n"
-    "ldr x7, [%[inptrs], 176]\n"
-    "fmla v14.4s, v31.4s, v6.4s\n"
-    "ldr x24, [%[inptrs], 136]\n"
-    "fmla v12.4s, v31.4s, v4.4s\n"
-    "fmla v10.4s, v31.4s, v7.4s\n"
-    "fmla v11.4s, v31.4s, v5.4s\n"
-    "fmla v20.4s, v31.4s, v19.4s\n"
-    "fmla v0.4s, v29.4s, v3.4s\n"
-    "ldr s25, [x17, x27]\n"
-    "fmla v15.4s, v29.4s, v4.4s\n"
-    "fmla v21.4s, v29.4s, v5.4s\n"
-    "fmla v12.4s, v29.4s, v6.4s\n"
-    "fmla v10.4s, v29.4s, v8.4s\n"
-    "fmla v11.4s, v29.4s, v7.4s\n"
-    "fmla v20.4s, v29.4s, v9.4s\n"
-    "fmla v24.4s, v29.4s, v19.4s\n"
-    "fmla v23.4s, v29.4s, v22.4s\n"
-    "fmla v17.4s, v28.4s, v3.4s\n"
-    "ldr s29, [x16, x27]\n"
-    "fmla v15.4s, v28.4s, v6.4s\n"
-    "ldr s22, [x15, x27]\n"
-    "fmla v1.4s, v28.4s, v4.4s\n"
-    "ldr x16, [%[inptrs], 264]\n"
-    "fmla v11.4s, v28.4s, v8.4s\n"
-    "ldr x15, [%[inptrs], 224]\n"
-    "fmla v21.4s, v28.4s, v7.4s\n"
-    "fmla v24.4s, v28.4s, v9.4s\n"
-    "fmla v14.4s, v29.4s, v3.4s\n"
-    "ldr s27, [x7, x27]\n"
-    "fmla v1.4s, v25.4s, v6.4s\n"
-    "ldr x7, [%[inptrs], 184]\n"
-    "fmla v10.4s, v29.4s, v4.4s\n"
-    "fmla v20.4s, v29.4s, v5.4s\n"
-    "fmla v21.4s, v25.4s, v8.4s\n"
-    "ldr s26, [x24, x27]\n"
-    "fmla v12.4s, v22.4s, v3.4s\n"
-    "ldr s25, [x16, x27]\n"
-    "fmla v11.4s, v22.4s, v4.4s\n"
-    "ldr x16, [%[inptrs], 272]\n"
-    "fmla v10.4s, v22.4s, v6.4s\n"
-    "fmla v20.4s, v22.4s, v7.4s\n"
-    "fmla v24.4s, v22.4s, v5.4s\n"
-    "fmla v23.4s, v22.4s, v19.4s\n"
-    "fmla v15.4s, v27.4s, v3.4s\n"
-    "ldr s31, [x15, x27]\n"
-    "fmla v11.4s, v27.4s, v6.4s\n"
-    "ldr s22, [x7, x27]\n"
-    "fmla v21.4s, v27.4s, v4.4s\n"
-    "ldr x15, [%[inptrs], 232]\n"
-    "fmla v20.4s, v27.4s, v8.4s\n"
-    "fmla v24.4s, v27.4s, v7.4s\n"
-    "fmla v23.4s, v27.4s, v9.4s\n"
-    "ldr s19, [x16, x27]\n"
-    "fmla v1.4s, v26.4s, v3.4s\n"
-    "ldr s28, [x15, x27]\n"
-    "fmla v21.4s, v26.4s, v6.4s\n"
-    "ldr x16, [%[inptrs], 280]\n"
-    "fmla v24.4s, v26.4s, v8.4s\n"
-    "fmla v10.4s, v25.4s, v3.4s\n"
-    "fmla v20.4s, v25.4s, v4.4s\n"
-    "ldr s30, [x16, x27]\n"
-    "fmla v23.4s, v25.4s, v5.4s\n"
-    "add x27, x27, #4\n"
-    "fmla v11.4s, v31.4s, v3.4s\n"
-    "fmla v21.4s, v22.4s, v3.4s\n"
-    "fmla v24.4s, v31.4s, v4.4s\n"
-    "movi v29.16b, #0\n"
-    "fmla v20.4s, v31.4s, v6.4s\n"
-    "fmla v23.4s, v31.4s, v7.4s\n"
-    "fmax v2.4s, v2.4s, v29.4s\n"
-    "fmax v18.4s, v18.4s, v29.4s\n"
-    "fmla v24.4s, v22.4s, v6.4s\n"
-    "fmax v17.4s, v17.4s, v29.4s\n"
-    "fmla v20.4s, v19.4s, v3.4s\n"
-    "fmax v1.4s, v1.4s, v29.4s\n"
-    "str s2, [x20, x28]\n"
-    "fmla v23.4s, v22.4s, v8.4s\n"
-    "fmax v16.4s, v16.4s, v29.4s\n"
-    "ldr x20, [%[outptrs], 8]\n"
-    "fmla v24.4s, v28.4s, v3.4s\n"
-    "fmax v0.4s, v0.4s, v29.4s\n"
-    "str s18, [x20, x28]\n"
-    "fmax v15.4s, v15.4s, v29.4s\n"
-    "str s16, [x21, x28]\n"
-    "fmla v23.4s, v19.4s, v4.4s\n"
-    "fmax v21.4s, v21.4s, v29.4s\n"
-    "ldr x20, [%[outptrs], 16]\n"
-    "fmax v13.4s, v13.4s, v29.4s\n"
-    "ldr x21, [%[outptrs], 40]\n"
-    "str s17, [x20, x28]\n"
-    "fmax v12.4s, v12.4s, v29.4s\n"
-    "str s0, [x21, x28]\n"
-    "fmla v23.4s, v28.4s, v6.4s\n"
-    "str s13, [x22, x28]\n"
-    "fmax v11.4s, v11.4s, v29.4s\n"
-    "fmax v24.4s, v24.4s, v29.4s\n"
-    "ldr x20, [%[outptrs], 24]\n"
-    "fmax v14.4s, v14.4s, v29.4s\n"
-    "ldr x21, [%[outptrs], 48]\n"
-    "str s1, [x20, x28]\n"
-    "fmla v23.4s, v30.4s, v3.4s\n"
-    "str s15, [x21, x28]\n"
-    "fmax v10.4s, v10.4s, v29.4s\n"
-    "str s14, [x23, x28]\n"
-    "fmax v20.4s, v20.4s, v29.4s\n"
-    "ldr x21, [%[outptrs], 56]\n"
-    "ldr x22, [%[outptrs], 72]\n"
-    "ldr x23, [%[outptrs], 104]\n"
-    "fmax v23.4s, v23.4s, v29.4s\n"
-    "str s21, [x21, x28]\n"
-    "str s12, [x22, x28]\n"
-    "str s10, [x23, x28]\n"
-    "ldr x22, [%[outptrs], 80]\n"
-    "ldr x23, [%[outptrs], 112]\n"
-    "str s11, [x22, x28]\n"
-    "str s20, [x23, x28]\n"
-    "ldr x22, [%[outptrs], 88]\n"
-    "ldr x23, [%[outptrs], 120]\n"
-    "str s24, [x22, x28]\n"
-    "str s23, [x23, x28]\n"
-    "add x28, x28, #4\n"
-    "7:\n"
-    : [wbptr] "+r" (weight_bias_ptr)
-    : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
-template <>
-template <>
-void Conv::execute_tile<ActivationFunction::ReLU6>(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x24, %[inptr0], %[input_row_stride]\n"
-    "add x13, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x8, %[outptr0], %[output_row_stride]\n"
-    "add x9, x24, %[input_row_stride]\n"
-    "add x10, x13, #64\n"
-    "add x19, x13, %[input_col_stride1]\n"
-    "add x20, x9, %[input_row_stride]\n"
-    "add x21, x19, #64\n"
-    "add x17, x19, %[input_col_stride1]\n"
-    "add x22, x20, %[input_row_stride]\n"
-    "add x7, x17, #64\n"
-    "add x11, x17, %[input_col_stride1]\n"
-    "add x23, x22, %[input_row_stride]\n"
-    "add x12, x11, #64\n"
-    "add x25, x8, %[output_row_stride]\n"
-    "add x26, x25, %[output_row_stride]\n"
-    "add x27, %[output_col_stride1], %[output_col_stride1]\n"
-    "and x14, %[n_channels], #3\n"
-    "add x28, x27, %[output_col_stride1]\n"
-    "lsr x15, %[n_channels], #2\n"
-    "cbz x15, 4f\n"
-    "1:\n"
-    "ldr q23, [%[wbptr]]\n"
-    "subs x15, x15, #1\n"
-    "mov v12.16b, v23.16b\n"
-    "ldr q20, [%[wbptr], #16]\n"
-    "mov v8.16b, v23.16b\n"
-    "ldr q6, [%[wbptr], #32]\n"
-    "mov v11.16b, v23.16b\n"
-    "ldr q5, [%[wbptr], #48]\n"
-    "mov v16.16b, v23.16b\n"
-    "ldr q19, [%[wbptr], #64]\n"
-    "mov v7.16b, v23.16b\n"
-    "ldr q4, [%[wbptr], #80]\n"
-    "mov v10.16b, v23.16b\n"
-    "ldr q3, [%[wbptr], #96]\n"
-    "mov v14.16b, v23.16b\n"
-    "ldr q2, [%[wbptr], #112]\n"
-    "mov v15.16b, v23.16b\n"
-    "ldr q1, [%[wbptr], #128]\n"
-    "mov v17.16b, v23.16b\n"
-    "ldr q0, [%[wbptr], #144]\n"
-    "mov v9.16b, v23.16b\n"
-    "ldr q28, [%[inptr0]]\n"
-    "fmla v12.4s, v28.4s, v20.4s\n"
-    "ldr q25, [x24]\n"
-    "fmla v8.4s, v25.4s, v20.4s\n"
-    "ldr q18, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v11.4s, v18.4s, v20.4s\n"
-    "ldr q30, [x9]\n"
-    "fmla v12.4s, v25.4s, v19.4s\n"
-    "ldr q29, [x24, %[input_col_stride1]]\n"
-    "fmla v8.4s, v30.4s, v19.4s\n"
-    "ldr q24, [%[inptr0], x13]\n"
-    "fmla v16.4s, v30.4s, v20.4s\n"
-    "ldr q27, [x20]\n"
-    "fmla v12.4s, v18.4s, v6.4s\n"
-    "ldr q22, [x9, %[input_col_stride1]]\n"
-    "fmla v8.4s, v29.4s, v6.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "prfm pldl1keep, [x24, #64]\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v12.4s, v30.4s, v2.4s\n"
-    "prfm pldl1keep, [x9, #64]\n"
-    "prfm pldl1keep, [x24, x16]\n"
-    "prfm pldl1keep, [%[inptr0], x10]\n"
-    "prfm pldl1keep, [x20, #64]\n"
-    "prfm pldl1keep, [x9, x16]\n"
-    "fmla v12.4s, v29.4s, v4.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "mov v13.16b, v23.16b\n"
-    "ldr q21, [x24, x13]\n"
-    "mov v18.16b, v23.16b\n"
-    "prfm pldl1keep, [x24, x10]\n"
-    "fmla v11.4s, v29.4s, v19.4s\n"
-    "prfm pldl1keep, [%[inptr0], x21]\n"
-    "fmla v7.4s, v29.4s, v20.4s\n"
-    "ldr q25, [%[inptr0], x19]\n"
-    "fmla v12.4s, v24.4s, v5.4s\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "fmla v11.4s, v24.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, x16]\n"
-    "fmla v10.4s, v24.4s, v20.4s\n"
-    "ldr q24, [x22]\n"
-    "fmla v8.4s, v27.4s, v2.4s\n"
-    "prfm pldl1keep, [x9, x10]\n"
-    "fmla v16.4s, v27.4s, v19.4s\n"
-    "prfm pldl1keep, [x24, x21]\n"
-    "fmla v14.4s, v27.4s, v20.4s\n"
-    "ldr q26, [x20, %[input_col_stride1]]\n"
-    "fmla v12.4s, v22.4s, v1.4s\n"
-    "prfm pldl1keep, [%[inptr0], x7]\n"
-    "fmla v8.4s, v22.4s, v4.4s\n"
-    "prfm pldl1keep, [x23, #64]\n"
-    "fmla v11.4s, v22.4s, v2.4s\n"
-    "prfm pldl1keep, [x22, x16]\n"
-    "fmla v16.4s, v22.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, x10]\n"
-    "fmla v7.4s, v22.4s, v19.4s\n"
-    "prfm pldl1keep, [x9, x21]\n"
-    "fmla v15.4s, v22.4s, v20.4s\n"
-    "ldr q30, [x9, x13]\n"
-    "fmla v12.4s, v21.4s, v3.4s\n"
-    "prfm pldl1keep, [x24, x7]\n"
-    "fmla v8.4s, v21.4s, v5.4s\n"
-    "prfm pldl1keep, [%[inptr0], x12]\n"
-    "fmla v11.4s, v21.4s, v4.4s\n"
-    "prfm pldl1keep, [x23, x16]\n"
-    "fmla v7.4s, v21.4s, v6.4s\n"
-    "prfm pldl1keep, [x22, x10]\n"
-    "fmla v10.4s, v21.4s, v19.4s\n"
-    "prfm pldl1keep, [x20, x21]\n"
-    "fmla v17.4s, v21.4s, v20.4s\n"
-    "ldr q22, [x24, x19]\n"
-    "fmla v11.4s, v25.4s, v5.4s\n"
-    "prfm pldl1keep, [x9, x7]\n"
-    "fmla v10.4s, v25.4s, v6.4s\n"
-    "prfm pldl1keep, [x24, x12]\n"
-    "fmla v9.4s, v25.4s, v20.4s\n"
-    "ldr q21, [%[inptr0], x17]\n"
-    "fmla v16.4s, v24.4s, v2.4s\n"
-    "prfm pldl1keep, [x23, x10]\n"
-    "fmla v14.4s, v24.4s, v19.4s\n"
-    "ldr q24, [x23]\n"
-    "fmla v8.4s, v26.4s, v1.4s\n"
-    "prfm pldl1keep, [x22, x21]\n"
-    "fmla v16.4s, v26.4s, v4.4s\n"
-    "prfm pldl1keep, [x20, x7]\n"
-    "fmla v7.4s, v26.4s, v2.4s\n"
-    "prfm pldl1keep, [x9, x12]\n"
-    "fmla v14.4s, v26.4s, v6.4s\n"
-    "prfm pldl1keep, [x23, x21]\n"
-    "fmla v15.4s, v26.4s, v19.4s\n"
-    "prfm pldl1keep, [x22, x7]\n"
-    "fmla v13.4s, v26.4s, v20.4s\n"
-    "ldr q26, [x22, %[input_col_stride1]]\n"
-    "fmla v12.4s, v30.4s, v0.4s\n"
-    "prfm pldl1keep, [x20, x12]\n"
-    "fmla v8.4s, v30.4s, v3.4s\n"
-    "prfm pldl1keep, [x23, x7]\n"
-    "fmla v11.4s, v30.4s, v1.4s\n"
-    "prfm pldl1keep, [x22, x12]\n"
-    "fmla v16.4s, v30.4s, v5.4s\n"
-    "prfm pldl1keep, [x23, x12]\n"
-    "fmla v7.4s, v30.4s, v4.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v10.4s, v30.4s, v2.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v15.4s, v30.4s, v6.4s\n"
-    "subs x15, x15, #1\n"
-    "fmla v17.4s, v30.4s, v19.4s\n"
-    "fmla v18.4s, v30.4s, v20.4s\n"
-    "mov v25.16b, v23.16b\n"
-    "fmla v11.4s, v22.4s, v3.4s\n"
-    "fmla v7.4s, v22.4s, v5.4s\n"
-    "fmla v10.4s, v22.4s, v4.4s\n"
-    "fmla v17.4s, v22.4s, v6.4s\n"
-    "fmla v9.4s, v22.4s, v19.4s\n"
-    "fmla v25.4s, v22.4s, v20.4s\n"
-    "ldr q27, [x20, x13]\n"
-    "fmla v10.4s, v21.4s, v5.4s\n"
-    "fmla v14.4s, v24.4s, v2.4s\n"
-    "mov v22.16b, v23.16b\n"
-    "fmla v9.4s, v21.4s, v6.4s\n"
-    "mov v24.16b, v23.16b\n"
-    "mov v21.16b, v23.16b\n"
-    "fmla v16.4s, v26.4s, v1.4s\n"
-    "fmla v14.4s, v26.4s, v4.4s\n"
-    "fmla v15.4s, v26.4s, v2.4s\n"
-    "fmla v13.4s, v26.4s, v19.4s\n"
-    "fmla v8.4s, v27.4s, v0.4s\n"
-    "ldr q28, [x9, x19]\n"
-    "fmla v16.4s, v27.4s, v3.4s\n"
-    "fmla v7.4s, v27.4s, v1.4s\n"
-    "fmla v14.4s, v27.4s, v5.4s\n"
-    "fmla v15.4s, v27.4s, v4.4s\n"
-    "fmla v17.4s, v27.4s, v2.4s\n"
-    "fmla v13.4s, v27.4s, v6.4s\n"
-    "fmla v18.4s, v27.4s, v19.4s\n"
-    "fmla v22.4s, v27.4s, v20.4s\n"
-    "fmla v11.4s, v28.4s, v0.4s\n"
-    "ldr q29, [x24, x17]\n"
-    "fmla v7.4s, v28.4s, v3.4s\n"
-    "fmla v10.4s, v28.4s, v1.4s\n"
-    "fmla v15.4s, v28.4s, v5.4s\n"
-    "fmla v17.4s, v28.4s, v4.4s\n"
-    "fmla v9.4s, v28.4s, v2.4s\n"
-    "fmla v18.4s, v28.4s, v6.4s\n"
-    "fmla v25.4s, v28.4s, v19.4s\n"
-    "fmla v24.4s, v28.4s, v20.4s\n"
-    "fmla v10.4s, v29.4s, v3.4s\n"
-    "ldr q23, [%[inptr0], x11]\n"
-    "fmla v17.4s, v29.4s, v5.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v9.4s, v29.4s, v4.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "fmla v25.4s, v29.4s, v6.4s\n"
-    "ldr q30, [x23, %[input_col_stride1]]\n"
-    "fmla v14.4s, v30.4s, v1.4s\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v9.4s, v23.4s, v5.4s\n"
-    "ldr q23, [x22, x13]\n"
-    "fmla v13.4s, v30.4s, v2.4s\n"
-    "ldr q29, [x20, x19]\n"
-    "fmla v16.4s, v23.4s, v0.4s\n"
-    "prfm pldl1keep, [%[inptr0], x10]\n"
-    "fmla v14.4s, v23.4s, v3.4s\n"
-    "fmla v15.4s, v23.4s, v1.4s\n"
-    "fmla v13.4s, v23.4s, v4.4s\n"
-    "fmla v18.4s, v23.4s, v2.4s\n"
-    "fmla v22.4s, v23.4s, v19.4s\n"
-    "ldr q23, [x9, x17]\n"
-    "fmla v7.4s, v29.4s, v0.4s\n"
-    "fmla v15.4s, v29.4s, v3.4s\n"
-    "fmla v17.4s, v29.4s, v1.4s\n"
-    "fmla v13.4s, v29.4s, v5.4s\n"
-    "fmla v18.4s, v29.4s, v4.4s\n"
-    "fmla v25.4s, v29.4s, v2.4s\n"
-    "fmla v22.4s, v29.4s, v6.4s\n"
-    "fmla v24.4s, v29.4s, v19.4s\n"
-    "fmla v21.4s, v29.4s, v20.4s\n"
-    "ldr q26, [x24, x11]\n"
-    "fmla v10.4s, v23.4s, v0.4s\n"
-    "ldr q28, [x23, x13]\n"
-    "fmla v17.4s, v23.4s, v3.4s\n"
-    "add x24, x24, #16\n"
-    "fmla v9.4s, v23.4s, v1.4s\n"
-    "prfm pldl1keep, [x24, #64]\n"
-    "fmla v18.4s, v23.4s, v5.4s\n"
-    "prfm pldl1keep, [x24, x16]\n"
-    "fmla v25.4s, v23.4s, v4.4s\n"
-    "fmla v24.4s, v23.4s, v6.4s\n"
-    "fmla v9.4s, v26.4s, v3.4s\n"
-    "ldr q20, [x22, x19]\n"
-    "fmla v14.4s, v28.4s, v0.4s\n"
-    "fmla v13.4s, v28.4s, v1.4s\n"
-    "fmla v25.4s, v26.4s, v5.4s\n"
-    "ldr q26, [x20, x17]\n"
-    "fmla v22.4s, v28.4s, v2.4s\n"
-    "ldr q23, [x9, x11]\n"
-    "fmla v15.4s, v20.4s, v0.4s\n"
-    "add x9, x9, #16\n"
-    "fmla v13.4s, v20.4s, v3.4s\n"
-    "prfm pldl1keep, [x9, #64]\n"
-    "fmla v18.4s, v20.4s, v1.4s\n"
-    "prfm pldl1keep, [x9, x16]\n"
-    "fmla v22.4s, v20.4s, v4.4s\n"
-    "fmla v24.4s, v20.4s, v2.4s\n"
-    "fmla v21.4s, v20.4s, v19.4s\n"
-    "ldr q27, [x23, x19]\n"
-    "fmla v17.4s, v26.4s, v0.4s\n"
-    "ldr q20, [x22, x17]\n"
-    "fmla v18.4s, v26.4s, v3.4s\n"
-    "fmla v25.4s, v26.4s, v1.4s\n"
-    "fmla v22.4s, v26.4s, v5.4s\n"
-    "fmla v24.4s, v26.4s, v4.4s\n"
-    "fmla v21.4s, v26.4s, v6.4s\n"
-    "ldr q19, [x20, x11]\n"
-    "fmla v9.4s, v23.4s, v0.4s\n"
-    "ldr q28, [x23, x17]\n"
-    "fmla v25.4s, v23.4s, v3.4s\n"
-    "add x20, x20, #16\n"
-    "fmla v24.4s, v23.4s, v5.4s\n"
-    "ldr q29, [x22, x11]\n"
-    "fmla v13.4s, v27.4s, v0.4s\n"
-    "prfm pldl1keep, [x20, #64]\n"
-    "fmla v22.4s, v27.4s, v1.4s\n"
-    "add x22, x22, #16\n"
-    "fmla v21.4s, v27.4s, v2.4s\n"
-    "ldr q30, [x23, x11]\n"
-    "fmla v18.4s, v20.4s, v0.4s\n"
-    "ldr q23, [%[wbptr]]\n"
-    "fmla v22.4s, v20.4s, v3.4s\n"
-    "add x23, x23, #16\n"
-    "fmla v24.4s, v20.4s, v1.4s\n"
-    "fmla v21.4s, v20.4s, v4.4s\n"
-    "fmla v25.4s, v19.4s, v0.4s\n"
-    "ldr q20, [%[wbptr], #16]\n"
-    "fmla v22.4s, v28.4s, v0.4s\n"
-    "ldr q6, [%[wbptr], #32]\n"
-    "fmla v21.4s, v19.4s, v5.4s\n"
-    "movi v26.16b, #0\n"
-    "fmla v24.4s, v19.4s, v3.4s\n"
-    "ldr q19, [%[wbptr], #64]\n"
-    "fmax v12.4s, v12.4s, v26.4s\n"
-    "fmax v11.4s, v11.4s, v26.4s\n"
-    "fmla v21.4s, v28.4s, v1.4s\n"
-    "ldr q5, [%[wbptr], #48]\n"
-    "fmla v24.4s, v29.4s, v0.4s\n"
-    "ldr q4, [%[wbptr], #80]\n"
-    "fmax v10.4s, v10.4s, v26.4s\n"
-    "fmax v9.4s, v9.4s, v26.4s\n"
-    "fmla v21.4s, v29.4s, v3.4s\n"
-    "ldr q2, [%[wbptr], #112]\n"
-    "fmov v27.4s, #6.0\n"
-    "fmax v8.4s, v8.4s, v26.4s\n"
-    "fmax v7.4s, v7.4s, v26.4s\n"
-    "fmax v17.4s, v17.4s, v26.4s\n"
-    "fmla v21.4s, v30.4s, v0.4s\n"
-    "ldr q3, [%[wbptr], #96]\n"
-    "fmin v12.4s, v12.4s, v27.4s\n"
-    "ldr q1, [%[wbptr], #128]\n"
-    "fmin v11.4s, v11.4s, v27.4s\n"
-    "fmin v10.4s, v10.4s, v27.4s\n"
-    "str q12, [%[outptr0]]\n"
-    "fmin v9.4s, v9.4s, v27.4s\n"
-    "str q11, [%[outptr0], %[output_col_stride1]]\n"
-    "fmin v8.4s, v8.4s, v27.4s\n"
-    "str q10, [%[outptr0], x27]\n"
-    "fmin v7.4s, v7.4s, v27.4s\n"
-    "str q9, [%[outptr0], x28]\n"
-    "fmin v17.4s, v17.4s, v27.4s\n"
-    "str q8, [x8]\n"
-    "fmax v25.4s, v25.4s, v26.4s\n"
-    "str q7, [x8, %[output_col_stride1]]\n"
-    "fmax v16.4s, v16.4s, v26.4s\n"
-    "str q17, [x8, x27]\n"
-    "fmin v25.4s, v25.4s, v27.4s\n"
-    "fmin v16.4s, v16.4s, v27.4s\n"
-    "ldr q0, [%[wbptr], #144]\n"
-    "str q25, [x8, x28]\n"
-    "fmax v15.4s, v15.4s, v26.4s\n"
-    "str q16, [x25]\n"
-    "fmax v18.4s, v18.4s, v26.4s\n"
-    "fmin v15.4s, v15.4s, v27.4s\n"
-    "ldr q28, [%[inptr0]]\n"
-    "fmin v18.4s, v18.4s, v27.4s\n"
-    "ldr q25, [x24]\n"
-    "str q15, [x25, %[output_col_stride1]]\n"
-    "fmax v24.4s, v24.4s, v26.4s\n"
-    "str q18, [x25, x27]\n"
-    "fmax v14.4s, v14.4s, v26.4s\n"
-    "fmin v24.4s, v24.4s, v27.4s\n"
-    "ldr q18, [%[inptr0], %[input_col_stride1]]\n"
-    "fmin v14.4s, v14.4s, v27.4s\n"
-    "ldr q30, [x9]\n"
-    "str q24, [x25, x28]\n"
-    "fmax v13.4s, v13.4s, v26.4s\n"
-    "str q14, [x26]\n"
-    "fmax v22.4s, v22.4s, v26.4s\n"
-    "fmin v13.4s, v13.4s, v27.4s\n"
-    "ldr q29, [x24, %[input_col_stride1]]\n"
-    "fmin v22.4s, v22.4s, v27.4s\n"
-    "ldr q24, [%[inptr0], x13]\n"
-    "str q13, [x26, %[output_col_stride1]]\n"
-    "fmax v21.4s, v21.4s, v26.4s\n"
-    "str q22, [x26, x27]\n"
-    "mov v12.16b, v23.16b\n"
-    "fmin v21.4s, v21.4s, v27.4s\n"
-    "ldr q27, [x20]\n"
-    "mov v8.16b, v23.16b\n"
-    "ldr q22, [x9, %[input_col_stride1]]\n"
-    "str q21, [x26, x28]\n"
-    "mov v11.16b, v23.16b\n"
-    "mov v16.16b, v23.16b\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "mov v7.16b, v23.16b\n"
-    "add x8, x8, #16\n"
-    "mov v10.16b, v23.16b\n"
-    "add x25, x25, #16\n"
-    "mov v14.16b, v23.16b\n"
-    "add x26, x26, #16\n"
-    "mov v15.16b, v23.16b\n"
-    "mov v17.16b, v23.16b\n"
-    "mov v9.16b, v23.16b\n"
-    "fmla v12.4s, v28.4s, v20.4s\n"
-    "fmla v8.4s, v25.4s, v20.4s\n"
-    "fmla v11.4s, v18.4s, v20.4s\n"
-    "fmla v16.4s, v30.4s, v20.4s\n"
-    "fmla v12.4s, v25.4s, v19.4s\n"
-    "fmla v8.4s, v30.4s, v19.4s\n"
-    "fmla v12.4s, v18.4s, v6.4s\n"
-    "fmla v8.4s, v29.4s, v6.4s\n"
-    "fmla v12.4s, v30.4s, v2.4s\n"
-    "fmla v12.4s, v29.4s, v4.4s\n"
-    "bne 2b\n"
-    "3:\n"
-    "mov v13.16b, v23.16b\n"
-    "ldr q21, [x24, x13]\n"
-    "mov v18.16b, v23.16b\n"
-    "prfm pldl1keep, [x24, x10]\n"
-    "fmla v11.4s, v29.4s, v19.4s\n"
-    "prfm pldl1keep, [%[inptr0], x21]\n"
-    "fmla v7.4s, v29.4s, v20.4s\n"
-    "ldr q25, [%[inptr0], x19]\n"
-    "fmla v12.4s, v24.4s, v5.4s\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "fmla v11.4s, v24.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, x16]\n"
-    "fmla v10.4s, v24.4s, v20.4s\n"
-    "ldr q24, [x22]\n"
-    "fmla v8.4s, v27.4s, v2.4s\n"
-    "prfm pldl1keep, [x9, x10]\n"
-    "fmla v16.4s, v27.4s, v19.4s\n"
-    "prfm pldl1keep, [x24, x21]\n"
-    "fmla v14.4s, v27.4s, v20.4s\n"
-    "ldr q26, [x20, %[input_col_stride1]]\n"
-    "fmla v12.4s, v22.4s, v1.4s\n"
-    "prfm pldl1keep, [%[inptr0], x7]\n"
-    "fmla v8.4s, v22.4s, v4.4s\n"
-    "prfm pldl1keep, [x23, #64]\n"
-    "fmla v11.4s, v22.4s, v2.4s\n"
-    "prfm pldl1keep, [x22, x16]\n"
-    "fmla v16.4s, v22.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, x10]\n"
-    "fmla v7.4s, v22.4s, v19.4s\n"
-    "prfm pldl1keep, [x9, x21]\n"
-    "fmla v15.4s, v22.4s, v20.4s\n"
-    "ldr q30, [x9, x13]\n"
-    "fmla v12.4s, v21.4s, v3.4s\n"
-    "prfm pldl1keep, [x24, x7]\n"
-    "fmla v8.4s, v21.4s, v5.4s\n"
-    "prfm pldl1keep, [%[inptr0], x12]\n"
-    "fmla v11.4s, v21.4s, v4.4s\n"
-    "prfm pldl1keep, [x23, x16]\n"
-    "fmla v7.4s, v21.4s, v6.4s\n"
-    "prfm pldl1keep, [x22, x10]\n"
-    "fmla v10.4s, v21.4s, v19.4s\n"
-    "prfm pldl1keep, [x20, x21]\n"
-    "fmla v17.4s, v21.4s, v20.4s\n"
-    "ldr q22, [x24, x19]\n"
-    "fmla v11.4s, v25.4s, v5.4s\n"
-    "prfm pldl1keep, [x9, x7]\n"
-    "fmla v10.4s, v25.4s, v6.4s\n"
-    "prfm pldl1keep, [x24, x12]\n"
-    "fmla v9.4s, v25.4s, v20.4s\n"
-    "ldr q21, [%[inptr0], x17]\n"
-    "fmla v16.4s, v24.4s, v2.4s\n"
-    "prfm pldl1keep, [x23, x10]\n"
-    "fmla v14.4s, v24.4s, v19.4s\n"
-    "ldr q24, [x23]\n"
-    "fmla v8.4s, v26.4s, v1.4s\n"
-    "prfm pldl1keep, [x22, x21]\n"
-    "fmla v16.4s, v26.4s, v4.4s\n"
-    "prfm pldl1keep, [x20, x7]\n"
-    "fmla v7.4s, v26.4s, v2.4s\n"
-    "prfm pldl1keep, [x9, x12]\n"
-    "fmla v14.4s, v26.4s, v6.4s\n"
-    "prfm pldl1keep, [x23, x21]\n"
-    "fmla v15.4s, v26.4s, v19.4s\n"
-    "prfm pldl1keep, [x22, x7]\n"
-    "fmla v13.4s, v26.4s, v20.4s\n"
-    "ldr q26, [x22, %[input_col_stride1]]\n"
-    "fmla v12.4s, v30.4s, v0.4s\n"
-    "prfm pldl1keep, [x20, x12]\n"
-    "fmla v8.4s, v30.4s, v3.4s\n"
-    "prfm pldl1keep, [x23, x7]\n"
-    "fmla v11.4s, v30.4s, v1.4s\n"
-    "prfm pldl1keep, [x22, x12]\n"
-    "fmla v16.4s, v30.4s, v5.4s\n"
-    "prfm pldl1keep, [x23, x12]\n"
-    "fmla v7.4s, v30.4s, v4.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v10.4s, v30.4s, v2.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v15.4s, v30.4s, v6.4s\n"
-    "fmla v17.4s, v30.4s, v19.4s\n"
-    "fmla v18.4s, v30.4s, v20.4s\n"
-    "ldr q27, [x20, x13]\n"
-    "fmla v11.4s, v22.4s, v3.4s\n"
-    "fmla v7.4s, v22.4s, v5.4s\n"
-    "fmla v10.4s, v22.4s, v4.4s\n"
-    "fmla v17.4s, v22.4s, v6.4s\n"
-    "fmla v9.4s, v22.4s, v19.4s\n"
-    "fmla v14.4s, v24.4s, v2.4s\n"
-    "mov v25.16b, v23.16b\n"
-    "fmla v16.4s, v26.4s, v1.4s\n"
-    "fmla v10.4s, v21.4s, v5.4s\n"
-    "fmla v15.4s, v26.4s, v2.4s\n"
-    "fmla v25.4s, v22.4s, v20.4s\n"
-    "ldr q28, [x9, x19]\n"
-    "fmla v9.4s, v21.4s, v6.4s\n"
-    "ldr q29, [x24, x17]\n"
-    "fmla v14.4s, v26.4s, v4.4s\n"
-    "fmla v13.4s, v26.4s, v19.4s\n"
-    "mov v22.16b, v23.16b\n"
-    "fmla v8.4s, v27.4s, v0.4s\n"
-    "fmla v16.4s, v27.4s, v3.4s\n"
-    "fmla v7.4s, v27.4s, v1.4s\n"
-    "fmla v14.4s, v27.4s, v5.4s\n"
-    "fmla v15.4s, v27.4s, v4.4s\n"
-    "fmla v17.4s, v27.4s, v2.4s\n"
-    "fmla v13.4s, v27.4s, v6.4s\n"
-    "fmla v18.4s, v27.4s, v19.4s\n"
-    "fmla v22.4s, v27.4s, v20.4s\n"
-    "mov v24.16b, v23.16b\n"
-    "mov v21.16b, v23.16b\n"
-    "fmla v11.4s, v28.4s, v0.4s\n"
-    "fmla v7.4s, v28.4s, v3.4s\n"
-    "fmla v10.4s, v28.4s, v1.4s\n"
-    "fmla v15.4s, v28.4s, v5.4s\n"
-    "fmla v17.4s, v28.4s, v4.4s\n"
-    "fmla v9.4s, v28.4s, v2.4s\n"
-    "fmla v18.4s, v28.4s, v6.4s\n"
-    "fmla v25.4s, v28.4s, v19.4s\n"
-    "fmla v24.4s, v28.4s, v20.4s\n"
-    "ldr q23, [%[inptr0], x11]\n"
-    "fmla v10.4s, v29.4s, v3.4s\n"
-    "add %[inptr0], %[inptr0], #16\n"
-    "fmla v17.4s, v29.4s, v5.4s\n"
-    "fmla v9.4s, v29.4s, v4.4s\n"
-    "fmla v25.4s, v29.4s, v6.4s\n"
-    "ldr q30, [x23, %[input_col_stride1]]\n"
-    "fmla v14.4s, v30.4s, v1.4s\n"
-    "fmla v13.4s, v30.4s, v2.4s\n"
-    "fmla v9.4s, v23.4s, v5.4s\n"
-    "ldr q23, [x22, x13]\n"
-    "fmla v16.4s, v23.4s, v0.4s\n"
-    "ldr q29, [x20, x19]\n"
-    "fmla v14.4s, v23.4s, v3.4s\n"
-    "fmla v15.4s, v23.4s, v1.4s\n"
-    "fmla v13.4s, v23.4s, v4.4s\n"
-    "fmla v18.4s, v23.4s, v2.4s\n"
-    "fmla v22.4s, v23.4s, v19.4s\n"
-    "ldr q23, [x9, x17]\n"
-    "fmla v7.4s, v29.4s, v0.4s\n"
-    "fmla v15.4s, v29.4s, v3.4s\n"
-    "fmla v17.4s, v29.4s, v1.4s\n"
-    "fmla v13.4s, v29.4s, v5.4s\n"
-    "fmla v18.4s, v29.4s, v4.4s\n"
-    "fmla v25.4s, v29.4s, v2.4s\n"
-    "fmla v22.4s, v29.4s, v6.4s\n"
-    "fmla v24.4s, v29.4s, v19.4s\n"
-    "fmla v21.4s, v29.4s, v20.4s\n"
-    "ldr q26, [x24, x11]\n"
-    "fmla v10.4s, v23.4s, v0.4s\n"
-    "ldr q28, [x23, x13]\n"
-    "fmla v17.4s, v23.4s, v3.4s\n"
-    "add x24, x24, #16\n"
-    "fmla v9.4s, v23.4s, v1.4s\n"
-    "fmla v18.4s, v23.4s, v5.4s\n"
-    "fmla v25.4s, v23.4s, v4.4s\n"
-    "fmla v24.4s, v23.4s, v6.4s\n"
-    "fmla v14.4s, v28.4s, v0.4s\n"
-    "ldr q20, [x22, x19]\n"
-    "fmla v9.4s, v26.4s, v3.4s\n"
-    "fmla v13.4s, v28.4s, v1.4s\n"
-    "fmla v25.4s, v26.4s, v5.4s\n"
-    "ldr q26, [x20, x17]\n"
-    "fmla v22.4s, v28.4s, v2.4s\n"
-    "ldr q23, [x9, x11]\n"
-    "fmla v15.4s, v20.4s, v0.4s\n"
-    "add x9, x9, #16\n"
-    "fmla v13.4s, v20.4s, v3.4s\n"
-    "fmla v18.4s, v20.4s, v1.4s\n"
-    "fmla v22.4s, v20.4s, v4.4s\n"
-    "fmla v24.4s, v20.4s, v2.4s\n"
-    "fmla v21.4s, v20.4s, v19.4s\n"
-    "ldr q27, [x23, x19]\n"
-    "fmla v17.4s, v26.4s, v0.4s\n"
-    "ldr q20, [x22, x17]\n"
-    "fmla v18.4s, v26.4s, v3.4s\n"
-    "fmla v25.4s, v26.4s, v1.4s\n"
-    "fmla v22.4s, v26.4s, v5.4s\n"
-    "fmla v24.4s, v26.4s, v4.4s\n"
-    "fmla v21.4s, v26.4s, v6.4s\n"
-    "ldr q19, [x20, x11]\n"
-    "fmla v9.4s, v23.4s, v0.4s\n"
-    "ldr q28, [x23, x17]\n"
-    "fmla v25.4s, v23.4s, v3.4s\n"
-    "add x20, x20, #16\n"
-    "fmla v24.4s, v23.4s, v5.4s\n"
-    "ldr q29, [x22, x11]\n"
-    "fmla v13.4s, v27.4s, v0.4s\n"
-    "add x22, x22, #16\n"
-    "fmla v22.4s, v27.4s, v1.4s\n"
-    "fmla v21.4s, v27.4s, v2.4s\n"
-    "fmla v18.4s, v20.4s, v0.4s\n"
-    "ldr q30, [x23, x11]\n"
-    "fmla v24.4s, v20.4s, v1.4s\n"
-    "add x23, x23, #16\n"
-    "fmla v22.4s, v20.4s, v3.4s\n"
-    "fmla v21.4s, v20.4s, v4.4s\n"
-    "fmla v25.4s, v19.4s, v0.4s\n"
-    "movi v26.16b, #0\n"
-    "fmla v24.4s, v19.4s, v3.4s\n"
-    "fmov v27.4s, #6.0\n"
-    "fmla v21.4s, v19.4s, v5.4s\n"
-    "fmla v22.4s, v28.4s, v0.4s\n"
-    "fmax v12.4s, v12.4s, v26.4s\n"
-    "fmax v11.4s, v11.4s, v26.4s\n"
-    "fmla v24.4s, v29.4s, v0.4s\n"
-    "fmax v10.4s, v10.4s, v26.4s\n"
-    "fmla v21.4s, v28.4s, v1.4s\n"
-    "fmin v12.4s, v12.4s, v27.4s\n"
-    "fmin v11.4s, v11.4s, v27.4s\n"
-    "fmin v10.4s, v10.4s, v27.4s\n"
-    "str q12, [%[outptr0]]\n"
-    "fmax v9.4s, v9.4s, v26.4s\n"
-    "str q11, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v21.4s, v29.4s, v3.4s\n"
-    "str q10, [%[outptr0], x27]\n"
-    "fmin v9.4s, v9.4s, v27.4s\n"
-    "fmax v8.4s, v8.4s, v26.4s\n"
-    "fmax v7.4s, v7.4s, v26.4s\n"
-    "str q9, [%[outptr0], x28]\n"
-    "fmla v21.4s, v30.4s, v0.4s\n"
-    "fmin v8.4s, v8.4s, v27.4s\n"
-    "add %[outptr0], %[outptr0], #16\n"
-    "fmin v7.4s, v7.4s, v27.4s\n"
-    "fmax v17.4s, v17.4s, v26.4s\n"
-    "str q8, [x8]\n"
-    "fmax v25.4s, v25.4s, v26.4s\n"
-    "str q7, [x8, %[output_col_stride1]]\n"
-    "fmin v17.4s, v17.4s, v27.4s\n"
-    "fmin v25.4s, v25.4s, v27.4s\n"
-    "fmax v16.4s, v16.4s, v26.4s\n"
-    "str q17, [x8, x27]\n"
-    "fmax v15.4s, v15.4s, v26.4s\n"
-    "str q25, [x8, x28]\n"
-    "fmin v16.4s, v16.4s, v27.4s\n"
-    "fmin v15.4s, v15.4s, v27.4s\n"
-    "add x8, x8, #16\n"
-    "str q16, [x25]\n"
-    "fmax v18.4s, v18.4s, v26.4s\n"
-    "str q15, [x25, %[output_col_stride1]]\n"
-    "fmax v24.4s, v24.4s, v26.4s\n"
-    "fmin v18.4s, v18.4s, v27.4s\n"
-    "fmax v14.4s, v14.4s, v26.4s\n"
-    "fmin v24.4s, v24.4s, v27.4s\n"
-    "fmax v13.4s, v13.4s, v26.4s\n"
-    "str q18, [x25, x27]\n"
-    "fmin v14.4s, v14.4s, v27.4s\n"
-    "str q24, [x25, x28]\n"
-    "fmin v13.4s, v13.4s, v27.4s\n"
-    "str q14, [x26]\n"
-    "fmax v22.4s, v22.4s, v26.4s\n"
-    "str q13, [x26, %[output_col_stride1]]\n"
-    "fmax v21.4s, v21.4s, v26.4s\n"
-    "fmin v22.4s, v22.4s, v27.4s\n"
-    "add x25, x25, #16\n"
-    "fmin v21.4s, v21.4s, v27.4s\n"
-    "str q22, [x26, x27]\n"
-    "str q21, [x26, x28]\n"
-    "add x26, x26, #16\n"
-    "4:\n"
-    "cbz x14, 7f\n"
-    "ldr s23, [%[wbptr]]\n"
-    "mov v12.16b, v23.16b\n"
-    "ldr s20, [%[wbptr], #4]\n"
-    "mov v8.16b, v23.16b\n"
-    "ldr s6, [%[wbptr], #8]\n"
-    "mov v11.16b, v23.16b\n"
-    "ldr s5, [%[wbptr], #12]\n"
-    "mov v16.16b, v23.16b\n"
-    "ldr s19, [%[wbptr], #16]\n"
-    "mov v7.16b, v23.16b\n"
-    "ldr s4, [%[wbptr], #20]\n"
-    "mov v10.16b, v23.16b\n"
-    "ldr s3, [%[wbptr], #24]\n"
-    "mov v14.16b, v23.16b\n"
-    "ldr s2, [%[wbptr], #28]\n"
-    "mov v15.16b, v23.16b\n"
-    "ldr s1, [%[wbptr], #32]\n"
-    "mov v17.16b, v23.16b\n"
-    "ldr s0, [%[wbptr], #36]\n"
-    "mov v9.16b, v23.16b\n"
-    "ldr s28, [%[inptr0]]\n"
-    "fmla v12.4s, v28.4s, v20.4s\n"
-    "ldr s25, [x24]\n"
-    "fmla v8.4s, v25.4s, v20.4s\n"
-    "ldr s18, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v11.4s, v18.4s, v20.4s\n"
-    "ldr s30, [x9]\n"
-    "fmla v12.4s, v25.4s, v19.4s\n"
-    "ldr s29, [x24, %[input_col_stride1]]\n"
-    "fmla v8.4s, v30.4s, v19.4s\n"
-    "ldr s24, [%[inptr0], x13]\n"
-    "fmla v16.4s, v30.4s, v20.4s\n"
-    "ldr s27, [x20]\n"
-    "fmla v12.4s, v18.4s, v6.4s\n"
-    "ldr s22, [x9, %[input_col_stride1]]\n"
-    "fmla v8.4s, v29.4s, v6.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "prfm pldl1keep, [x24, #64]\n"
-    "subs x14, x14, #1\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "prfm pldl1keep, [x9, #64]\n"
-    "fmla v12.4s, v30.4s, v2.4s\n"
-    "prfm pldl1keep, [x24, x16]\n"
-    "prfm pldl1keep, [%[inptr0], x10]\n"
-    "prfm pldl1keep, [x20, #64]\n"
-    "prfm pldl1keep, [x9, x16]\n"
-    "fmla v12.4s, v29.4s, v4.4s\n"
-    "beq 6f\n"
-    "5:\n"
-    "mov v13.16b, v23.16b\n"
-    "ldr s21, [x24, x13]\n"
-    "mov v18.16b, v23.16b\n"
-    "prfm pldl1keep, [x24, x10]\n"
-    "fmla v11.4s, v29.4s, v19.4s\n"
-    "prfm pldl1keep, [%[inptr0], x21]\n"
-    "fmla v7.4s, v29.4s, v20.4s\n"
-    "ldr s25, [%[inptr0], x19]\n"
-    "fmla v12.4s, v24.4s, v5.4s\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "fmla v11.4s, v24.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, x16]\n"
-    "fmla v10.4s, v24.4s, v20.4s\n"
-    "ldr s24, [x22]\n"
-    "fmla v8.4s, v27.4s, v2.4s\n"
-    "prfm pldl1keep, [x9, x10]\n"
-    "fmla v16.4s, v27.4s, v19.4s\n"
-    "prfm pldl1keep, [x24, x21]\n"
-    "fmla v14.4s, v27.4s, v20.4s\n"
-    "ldr s26, [x20, %[input_col_stride1]]\n"
-    "fmla v12.4s, v22.4s, v1.4s\n"
-    "prfm pldl1keep, [%[inptr0], x7]\n"
-    "fmla v8.4s, v22.4s, v4.4s\n"
-    "prfm pldl1keep, [x23, #64]\n"
-    "fmla v11.4s, v22.4s, v2.4s\n"
-    "prfm pldl1keep, [x22, x16]\n"
-    "fmla v16.4s, v22.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, x10]\n"
-    "fmla v7.4s, v22.4s, v19.4s\n"
-    "prfm pldl1keep, [x9, x21]\n"
-    "fmla v15.4s, v22.4s, v20.4s\n"
-    "ldr s30, [x9, x13]\n"
-    "fmla v12.4s, v21.4s, v3.4s\n"
-    "prfm pldl1keep, [x24, x7]\n"
-    "fmla v8.4s, v21.4s, v5.4s\n"
-    "prfm pldl1keep, [%[inptr0], x12]\n"
-    "fmla v11.4s, v21.4s, v4.4s\n"
-    "prfm pldl1keep, [x23, x16]\n"
-    "fmla v7.4s, v21.4s, v6.4s\n"
-    "prfm pldl1keep, [x22, x10]\n"
-    "fmla v10.4s, v21.4s, v19.4s\n"
-    "prfm pldl1keep, [x20, x21]\n"
-    "fmla v17.4s, v21.4s, v20.4s\n"
-    "ldr s22, [x24, x19]\n"
-    "fmla v11.4s, v25.4s, v5.4s\n"
-    "prfm pldl1keep, [x9, x7]\n"
-    "fmla v10.4s, v25.4s, v6.4s\n"
-    "prfm pldl1keep, [x24, x12]\n"
-    "fmla v9.4s, v25.4s, v20.4s\n"
-    "ldr s21, [%[inptr0], x17]\n"
-    "fmla v16.4s, v24.4s, v2.4s\n"
-    "prfm pldl1keep, [x23, x10]\n"
-    "fmla v14.4s, v24.4s, v19.4s\n"
-    "ldr s24, [x23]\n"
-    "fmla v8.4s, v26.4s, v1.4s\n"
-    "prfm pldl1keep, [x22, x21]\n"
-    "fmla v16.4s, v26.4s, v4.4s\n"
-    "prfm pldl1keep, [x20, x7]\n"
-    "fmla v7.4s, v26.4s, v2.4s\n"
-    "prfm pldl1keep, [x9, x12]\n"
-    "fmla v14.4s, v26.4s, v6.4s\n"
-    "prfm pldl1keep, [x23, x21]\n"
-    "fmla v15.4s, v26.4s, v19.4s\n"
-    "prfm pldl1keep, [x22, x7]\n"
-    "fmla v13.4s, v26.4s, v20.4s\n"
-    "ldr s26, [x22, %[input_col_stride1]]\n"
-    "fmla v12.4s, v30.4s, v0.4s\n"
-    "prfm pldl1keep, [x20, x12]\n"
-    "fmla v8.4s, v30.4s, v3.4s\n"
-    "prfm pldl1keep, [x23, x7]\n"
-    "fmla v11.4s, v30.4s, v1.4s\n"
-    "prfm pldl1keep, [x22, x12]\n"
-    "fmla v16.4s, v30.4s, v5.4s\n"
-    "prfm pldl1keep, [x23, x12]\n"
-    "fmla v7.4s, v30.4s, v4.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v10.4s, v30.4s, v2.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v15.4s, v30.4s, v6.4s\n"
-    "subs x14, x14, #1\n"
-    "fmla v17.4s, v30.4s, v19.4s\n"
-    "fmla v18.4s, v30.4s, v20.4s\n"
-    "mov v25.16b, v23.16b\n"
-    "fmla v11.4s, v22.4s, v3.4s\n"
-    "fmla v7.4s, v22.4s, v5.4s\n"
-    "fmla v10.4s, v22.4s, v4.4s\n"
-    "fmla v17.4s, v22.4s, v6.4s\n"
-    "fmla v9.4s, v22.4s, v19.4s\n"
-    "fmla v25.4s, v22.4s, v20.4s\n"
-    "ldr s27, [x20, x13]\n"
-    "fmla v10.4s, v21.4s, v5.4s\n"
-    "fmla v14.4s, v24.4s, v2.4s\n"
-    "mov v22.16b, v23.16b\n"
-    "fmla v9.4s, v21.4s, v6.4s\n"
-    "mov v24.16b, v23.16b\n"
-    "mov v21.16b, v23.16b\n"
-    "fmla v16.4s, v26.4s, v1.4s\n"
-    "fmla v14.4s, v26.4s, v4.4s\n"
-    "fmla v15.4s, v26.4s, v2.4s\n"
-    "fmla v13.4s, v26.4s, v19.4s\n"
-    "fmla v8.4s, v27.4s, v0.4s\n"
-    "ldr s28, [x9, x19]\n"
-    "fmla v16.4s, v27.4s, v3.4s\n"
-    "fmla v7.4s, v27.4s, v1.4s\n"
-    "fmla v14.4s, v27.4s, v5.4s\n"
-    "fmla v15.4s, v27.4s, v4.4s\n"
-    "fmla v17.4s, v27.4s, v2.4s\n"
-    "fmla v13.4s, v27.4s, v6.4s\n"
-    "fmla v18.4s, v27.4s, v19.4s\n"
-    "fmla v22.4s, v27.4s, v20.4s\n"
-    "fmla v11.4s, v28.4s, v0.4s\n"
-    "ldr s29, [x24, x17]\n"
-    "fmla v7.4s, v28.4s, v3.4s\n"
-    "fmla v10.4s, v28.4s, v1.4s\n"
-    "fmla v15.4s, v28.4s, v5.4s\n"
-    "fmla v17.4s, v28.4s, v4.4s\n"
-    "fmla v9.4s, v28.4s, v2.4s\n"
-    "fmla v18.4s, v28.4s, v6.4s\n"
-    "fmla v25.4s, v28.4s, v19.4s\n"
-    "fmla v24.4s, v28.4s, v20.4s\n"
-    "fmla v10.4s, v29.4s, v3.4s\n"
-    "ldr s23, [%[inptr0], x11]\n"
-    "fmla v17.4s, v29.4s, v5.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v9.4s, v29.4s, v4.4s\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "fmla v25.4s, v29.4s, v6.4s\n"
-    "ldr s30, [x23, %[input_col_stride1]]\n"
-    "fmla v14.4s, v30.4s, v1.4s\n"
-    "prfm pldl1keep, [%[inptr0], x16]\n"
-    "fmla v9.4s, v23.4s, v5.4s\n"
-    "ldr s23, [x22, x13]\n"
-    "fmla v13.4s, v30.4s, v2.4s\n"
-    "ldr s29, [x20, x19]\n"
-    "fmla v16.4s, v23.4s, v0.4s\n"
-    "prfm pldl1keep, [%[inptr0], x10]\n"
-    "fmla v14.4s, v23.4s, v3.4s\n"
-    "fmla v15.4s, v23.4s, v1.4s\n"
-    "fmla v13.4s, v23.4s, v4.4s\n"
-    "fmla v18.4s, v23.4s, v2.4s\n"
-    "fmla v22.4s, v23.4s, v19.4s\n"
-    "ldr s23, [x9, x17]\n"
-    "fmla v7.4s, v29.4s, v0.4s\n"
-    "fmla v15.4s, v29.4s, v3.4s\n"
-    "fmla v17.4s, v29.4s, v1.4s\n"
-    "fmla v13.4s, v29.4s, v5.4s\n"
-    "fmla v18.4s, v29.4s, v4.4s\n"
-    "fmla v25.4s, v29.4s, v2.4s\n"
-    "fmla v22.4s, v29.4s, v6.4s\n"
-    "fmla v24.4s, v29.4s, v19.4s\n"
-    "fmla v21.4s, v29.4s, v20.4s\n"
-    "ldr s26, [x24, x11]\n"
-    "fmla v10.4s, v23.4s, v0.4s\n"
-    "ldr s28, [x23, x13]\n"
-    "fmla v17.4s, v23.4s, v3.4s\n"
-    "add x24, x24, #4\n"
-    "fmla v9.4s, v23.4s, v1.4s\n"
-    "prfm pldl1keep, [x24, #64]\n"
-    "fmla v18.4s, v23.4s, v5.4s\n"
-    "prfm pldl1keep, [x24, x16]\n"
-    "fmla v25.4s, v23.4s, v4.4s\n"
-    "fmla v24.4s, v23.4s, v6.4s\n"
-    "fmla v9.4s, v26.4s, v3.4s\n"
-    "ldr s20, [x22, x19]\n"
-    "fmla v14.4s, v28.4s, v0.4s\n"
-    "fmla v13.4s, v28.4s, v1.4s\n"
-    "fmla v25.4s, v26.4s, v5.4s\n"
-    "ldr s26, [x20, x17]\n"
-    "fmla v22.4s, v28.4s, v2.4s\n"
-    "ldr s23, [x9, x11]\n"
-    "fmla v15.4s, v20.4s, v0.4s\n"
-    "add x9, x9, #4\n"
-    "fmla v13.4s, v20.4s, v3.4s\n"
-    "prfm pldl1keep, [x9, #64]\n"
-    "fmla v18.4s, v20.4s, v1.4s\n"
-    "prfm pldl1keep, [x9, x16]\n"
-    "fmla v22.4s, v20.4s, v4.4s\n"
-    "fmla v24.4s, v20.4s, v2.4s\n"
-    "fmla v21.4s, v20.4s, v19.4s\n"
-    "ldr s27, [x23, x19]\n"
-    "fmla v17.4s, v26.4s, v0.4s\n"
-    "ldr s20, [x22, x17]\n"
-    "fmla v18.4s, v26.4s, v3.4s\n"
-    "fmla v25.4s, v26.4s, v1.4s\n"
-    "fmla v22.4s, v26.4s, v5.4s\n"
-    "fmla v24.4s, v26.4s, v4.4s\n"
-    "fmla v21.4s, v26.4s, v6.4s\n"
-    "ldr s19, [x20, x11]\n"
-    "fmla v9.4s, v23.4s, v0.4s\n"
-    "ldr s28, [x23, x17]\n"
-    "fmla v25.4s, v23.4s, v3.4s\n"
-    "add x20, x20, #4\n"
-    "fmla v24.4s, v23.4s, v5.4s\n"
-    "ldr s29, [x22, x11]\n"
-    "fmla v13.4s, v27.4s, v0.4s\n"
-    "prfm pldl1keep, [x20, #64]\n"
-    "fmla v22.4s, v27.4s, v1.4s\n"
-    "add x22, x22, #4\n"
-    "fmla v21.4s, v27.4s, v2.4s\n"
-    "ldr s30, [x23, x11]\n"
-    "fmla v18.4s, v20.4s, v0.4s\n"
-    "ldr s23, [%[wbptr]]\n"
-    "fmla v22.4s, v20.4s, v3.4s\n"
-    "add x23, x23, #4\n"
-    "fmla v24.4s, v20.4s, v1.4s\n"
-    "fmla v21.4s, v20.4s, v4.4s\n"
-    "fmla v25.4s, v19.4s, v0.4s\n"
-    "ldr s20, [%[wbptr], #4]\n"
-    "fmla v22.4s, v28.4s, v0.4s\n"
-    "ldr s6, [%[wbptr], #8]\n"
-    "fmla v21.4s, v19.4s, v5.4s\n"
-    "movi v26.16b, #0\n"
-    "fmla v24.4s, v19.4s, v3.4s\n"
-    "ldr s19, [%[wbptr], #16]\n"
-    "fmax v12.4s, v12.4s, v26.4s\n"
-    "fmax v11.4s, v11.4s, v26.4s\n"
-    "fmla v21.4s, v28.4s, v1.4s\n"
-    "ldr s5, [%[wbptr], #12]\n"
-    "fmla v24.4s, v29.4s, v0.4s\n"
-    "ldr s4, [%[wbptr], #20]\n"
-    "fmax v10.4s, v10.4s, v26.4s\n"
-    "fmax v9.4s, v9.4s, v26.4s\n"
-    "fmla v21.4s, v29.4s, v3.4s\n"
-    "ldr s2, [%[wbptr], #28]\n"
-    "fmov v27.4s, #6.0\n"
-    "fmax v8.4s, v8.4s, v26.4s\n"
-    "fmax v7.4s, v7.4s, v26.4s\n"
-    "fmax v17.4s, v17.4s, v26.4s\n"
-    "fmla v21.4s, v30.4s, v0.4s\n"
-    "ldr s3, [%[wbptr], #24]\n"
-    "fmin v12.4s, v12.4s, v27.4s\n"
-    "ldr s1, [%[wbptr], #32]\n"
-    "fmin v11.4s, v11.4s, v27.4s\n"
-    "fmin v10.4s, v10.4s, v27.4s\n"
-    "str s12, [%[outptr0]]\n"
-    "fmin v9.4s, v9.4s, v27.4s\n"
-    "str s11, [%[outptr0], %[output_col_stride1]]\n"
-    "fmin v8.4s, v8.4s, v27.4s\n"
-    "str s10, [%[outptr0], x27]\n"
-    "fmin v7.4s, v7.4s, v27.4s\n"
-    "str s9, [%[outptr0], x28]\n"
-    "fmin v17.4s, v17.4s, v27.4s\n"
-    "str s8, [x8]\n"
-    "fmax v25.4s, v25.4s, v26.4s\n"
-    "str s7, [x8, %[output_col_stride1]]\n"
-    "fmax v16.4s, v16.4s, v26.4s\n"
-    "str s17, [x8, x27]\n"
-    "fmin v25.4s, v25.4s, v27.4s\n"
-    "fmin v16.4s, v16.4s, v27.4s\n"
-    "ldr s0, [%[wbptr], #36]\n"
-    "str s25, [x8, x28]\n"
-    "fmax v15.4s, v15.4s, v26.4s\n"
-    "str s16, [x25]\n"
-    "fmax v18.4s, v18.4s, v26.4s\n"
-    "fmin v15.4s, v15.4s, v27.4s\n"
-    "ldr s28, [%[inptr0]]\n"
-    "fmin v18.4s, v18.4s, v27.4s\n"
-    "ldr s25, [x24]\n"
-    "str s15, [x25, %[output_col_stride1]]\n"
-    "fmax v24.4s, v24.4s, v26.4s\n"
-    "str s18, [x25, x27]\n"
-    "fmax v14.4s, v14.4s, v26.4s\n"
-    "fmin v24.4s, v24.4s, v27.4s\n"
-    "ldr s18, [%[inptr0], %[input_col_stride1]]\n"
-    "fmin v14.4s, v14.4s, v27.4s\n"
-    "ldr s30, [x9]\n"
-    "str s24, [x25, x28]\n"
-    "fmax v13.4s, v13.4s, v26.4s\n"
-    "str s14, [x26]\n"
-    "fmax v22.4s, v22.4s, v26.4s\n"
-    "fmin v13.4s, v13.4s, v27.4s\n"
-    "ldr s29, [x24, %[input_col_stride1]]\n"
-    "fmin v22.4s, v22.4s, v27.4s\n"
-    "ldr s24, [%[inptr0], x13]\n"
-    "str s13, [x26, %[output_col_stride1]]\n"
-    "fmax v21.4s, v21.4s, v26.4s\n"
-    "str s22, [x26, x27]\n"
-    "mov v12.16b, v23.16b\n"
-    "fmin v21.4s, v21.4s, v27.4s\n"
-    "ldr s27, [x20]\n"
-    "mov v8.16b, v23.16b\n"
-    "ldr s22, [x9, %[input_col_stride1]]\n"
-    "str s21, [x26, x28]\n"
-    "mov v11.16b, v23.16b\n"
-    "mov v16.16b, v23.16b\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "mov v7.16b, v23.16b\n"
-    "add x8, x8, #4\n"
-    "mov v10.16b, v23.16b\n"
-    "add x25, x25, #4\n"
-    "mov v14.16b, v23.16b\n"
-    "add x26, x26, #4\n"
-    "mov v15.16b, v23.16b\n"
-    "mov v17.16b, v23.16b\n"
-    "mov v9.16b, v23.16b\n"
-    "fmla v12.4s, v28.4s, v20.4s\n"
-    "fmla v8.4s, v25.4s, v20.4s\n"
-    "fmla v11.4s, v18.4s, v20.4s\n"
-    "fmla v16.4s, v30.4s, v20.4s\n"
-    "fmla v12.4s, v25.4s, v19.4s\n"
-    "fmla v8.4s, v30.4s, v19.4s\n"
-    "fmla v12.4s, v18.4s, v6.4s\n"
-    "fmla v8.4s, v29.4s, v6.4s\n"
-    "fmla v12.4s, v30.4s, v2.4s\n"
-    "fmla v12.4s, v29.4s, v4.4s\n"
-    "bne 5b\n"
-    "6:\n"
-    "mov v13.16b, v23.16b\n"
-    "ldr s21, [x24, x13]\n"
-    "mov v18.16b, v23.16b\n"
-    "prfm pldl1keep, [x24, x10]\n"
-    "fmla v11.4s, v29.4s, v19.4s\n"
-    "prfm pldl1keep, [%[inptr0], x21]\n"
-    "fmla v7.4s, v29.4s, v20.4s\n"
-    "ldr s25, [%[inptr0], x19]\n"
-    "fmla v12.4s, v24.4s, v5.4s\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "fmla v11.4s, v24.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, x16]\n"
-    "fmla v10.4s, v24.4s, v20.4s\n"
-    "ldr s24, [x22]\n"
-    "fmla v8.4s, v27.4s, v2.4s\n"
-    "prfm pldl1keep, [x9, x10]\n"
-    "fmla v16.4s, v27.4s, v19.4s\n"
-    "prfm pldl1keep, [x24, x21]\n"
-    "fmla v14.4s, v27.4s, v20.4s\n"
-    "ldr s26, [x20, %[input_col_stride1]]\n"
-    "fmla v12.4s, v22.4s, v1.4s\n"
-    "prfm pldl1keep, [%[inptr0], x7]\n"
-    "fmla v8.4s, v22.4s, v4.4s\n"
-    "prfm pldl1keep, [x23, #64]\n"
-    "fmla v11.4s, v22.4s, v2.4s\n"
-    "prfm pldl1keep, [x22, x16]\n"
-    "fmla v16.4s, v22.4s, v6.4s\n"
-    "prfm pldl1keep, [x20, x10]\n"
-    "fmla v7.4s, v22.4s, v19.4s\n"
-    "prfm pldl1keep, [x9, x21]\n"
-    "fmla v15.4s, v22.4s, v20.4s\n"
-    "ldr s30, [x9, x13]\n"
-    "fmla v12.4s, v21.4s, v3.4s\n"
-    "prfm pldl1keep, [x24, x7]\n"
-    "fmla v8.4s, v21.4s, v5.4s\n"
-    "prfm pldl1keep, [%[inptr0], x12]\n"
-    "fmla v11.4s, v21.4s, v4.4s\n"
-    "prfm pldl1keep, [x23, x16]\n"
-    "fmla v7.4s, v21.4s, v6.4s\n"
-    "prfm pldl1keep, [x22, x10]\n"
-    "fmla v10.4s, v21.4s, v19.4s\n"
-    "prfm pldl1keep, [x20, x21]\n"
-    "fmla v17.4s, v21.4s, v20.4s\n"
-    "ldr s22, [x24, x19]\n"
-    "fmla v11.4s, v25.4s, v5.4s\n"
-    "prfm pldl1keep, [x9, x7]\n"
-    "fmla v10.4s, v25.4s, v6.4s\n"
-    "prfm pldl1keep, [x24, x12]\n"
-    "fmla v9.4s, v25.4s, v20.4s\n"
-    "ldr s21, [%[inptr0], x17]\n"
-    "fmla v16.4s, v24.4s, v2.4s\n"
-    "prfm pldl1keep, [x23, x10]\n"
-    "fmla v14.4s, v24.4s, v19.4s\n"
-    "ldr s24, [x23]\n"
-    "fmla v8.4s, v26.4s, v1.4s\n"
-    "prfm pldl1keep, [x22, x21]\n"
-    "fmla v16.4s, v26.4s, v4.4s\n"
-    "prfm pldl1keep, [x20, x7]\n"
-    "fmla v7.4s, v26.4s, v2.4s\n"
-    "prfm pldl1keep, [x9, x12]\n"
-    "fmla v14.4s, v26.4s, v6.4s\n"
-    "prfm pldl1keep, [x23, x21]\n"
-    "fmla v15.4s, v26.4s, v19.4s\n"
-    "prfm pldl1keep, [x22, x7]\n"
-    "fmla v13.4s, v26.4s, v20.4s\n"
-    "ldr s26, [x22, %[input_col_stride1]]\n"
-    "fmla v12.4s, v30.4s, v0.4s\n"
-    "prfm pldl1keep, [x20, x12]\n"
-    "fmla v8.4s, v30.4s, v3.4s\n"
-    "prfm pldl1keep, [x23, x7]\n"
-    "fmla v11.4s, v30.4s, v1.4s\n"
-    "prfm pldl1keep, [x22, x12]\n"
-    "fmla v16.4s, v30.4s, v5.4s\n"
-    "prfm pldl1keep, [x23, x12]\n"
-    "fmla v7.4s, v30.4s, v4.4s\n"
-    "add %[wbptr], %[wbptr], #40\n"
-    "fmla v10.4s, v30.4s, v2.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "fmla v15.4s, v30.4s, v6.4s\n"
-    "fmla v17.4s, v30.4s, v19.4s\n"
-    "fmla v18.4s, v30.4s, v20.4s\n"
-    "ldr s27, [x20, x13]\n"
-    "fmla v11.4s, v22.4s, v3.4s\n"
-    "fmla v7.4s, v22.4s, v5.4s\n"
-    "fmla v10.4s, v22.4s, v4.4s\n"
-    "fmla v17.4s, v22.4s, v6.4s\n"
-    "fmla v9.4s, v22.4s, v19.4s\n"
-    "fmla v14.4s, v24.4s, v2.4s\n"
-    "mov v25.16b, v23.16b\n"
-    "fmla v16.4s, v26.4s, v1.4s\n"
-    "fmla v10.4s, v21.4s, v5.4s\n"
-    "fmla v15.4s, v26.4s, v2.4s\n"
-    "fmla v25.4s, v22.4s, v20.4s\n"
-    "ldr s28, [x9, x19]\n"
-    "fmla v9.4s, v21.4s, v6.4s\n"
-    "ldr s29, [x24, x17]\n"
-    "fmla v14.4s, v26.4s, v4.4s\n"
-    "fmla v13.4s, v26.4s, v19.4s\n"
-    "mov v22.16b, v23.16b\n"
-    "fmla v8.4s, v27.4s, v0.4s\n"
-    "fmla v16.4s, v27.4s, v3.4s\n"
-    "fmla v7.4s, v27.4s, v1.4s\n"
-    "fmla v14.4s, v27.4s, v5.4s\n"
-    "fmla v15.4s, v27.4s, v4.4s\n"
-    "fmla v17.4s, v27.4s, v2.4s\n"
-    "fmla v13.4s, v27.4s, v6.4s\n"
-    "fmla v18.4s, v27.4s, v19.4s\n"
-    "fmla v22.4s, v27.4s, v20.4s\n"
-    "mov v24.16b, v23.16b\n"
-    "mov v21.16b, v23.16b\n"
-    "fmla v11.4s, v28.4s, v0.4s\n"
-    "fmla v7.4s, v28.4s, v3.4s\n"
-    "fmla v10.4s, v28.4s, v1.4s\n"
-    "fmla v15.4s, v28.4s, v5.4s\n"
-    "fmla v17.4s, v28.4s, v4.4s\n"
-    "fmla v9.4s, v28.4s, v2.4s\n"
-    "fmla v18.4s, v28.4s, v6.4s\n"
-    "fmla v25.4s, v28.4s, v19.4s\n"
-    "fmla v24.4s, v28.4s, v20.4s\n"
-    "ldr s23, [%[inptr0], x11]\n"
-    "fmla v10.4s, v29.4s, v3.4s\n"
-    "add %[inptr0], %[inptr0], #4\n"
-    "fmla v17.4s, v29.4s, v5.4s\n"
-    "fmla v9.4s, v29.4s, v4.4s\n"
-    "fmla v25.4s, v29.4s, v6.4s\n"
-    "ldr s30, [x23, %[input_col_stride1]]\n"
-    "fmla v14.4s, v30.4s, v1.4s\n"
-    "fmla v13.4s, v30.4s, v2.4s\n"
-    "fmla v9.4s, v23.4s, v5.4s\n"
-    "ldr s23, [x22, x13]\n"
-    "fmla v16.4s, v23.4s, v0.4s\n"
-    "ldr s29, [x20, x19]\n"
-    "fmla v14.4s, v23.4s, v3.4s\n"
-    "fmla v15.4s, v23.4s, v1.4s\n"
-    "fmla v13.4s, v23.4s, v4.4s\n"
-    "fmla v18.4s, v23.4s, v2.4s\n"
-    "fmla v22.4s, v23.4s, v19.4s\n"
-    "ldr s23, [x9, x17]\n"
-    "fmla v7.4s, v29.4s, v0.4s\n"
-    "fmla v15.4s, v29.4s, v3.4s\n"
-    "fmla v17.4s, v29.4s, v1.4s\n"
-    "fmla v13.4s, v29.4s, v5.4s\n"
-    "fmla v18.4s, v29.4s, v4.4s\n"
-    "fmla v25.4s, v29.4s, v2.4s\n"
-    "fmla v22.4s, v29.4s, v6.4s\n"
-    "fmla v24.4s, v29.4s, v19.4s\n"
-    "fmla v21.4s, v29.4s, v20.4s\n"
-    "ldr s26, [x24, x11]\n"
-    "fmla v10.4s, v23.4s, v0.4s\n"
-    "ldr s28, [x23, x13]\n"
-    "fmla v17.4s, v23.4s, v3.4s\n"
-    "add x24, x24, #4\n"
-    "fmla v9.4s, v23.4s, v1.4s\n"
-    "fmla v18.4s, v23.4s, v5.4s\n"
-    "fmla v25.4s, v23.4s, v4.4s\n"
-    "fmla v24.4s, v23.4s, v6.4s\n"
-    "fmla v14.4s, v28.4s, v0.4s\n"
-    "ldr s20, [x22, x19]\n"
-    "fmla v9.4s, v26.4s, v3.4s\n"
-    "fmla v13.4s, v28.4s, v1.4s\n"
-    "fmla v25.4s, v26.4s, v5.4s\n"
-    "ldr s26, [x20, x17]\n"
-    "fmla v22.4s, v28.4s, v2.4s\n"
-    "ldr s23, [x9, x11]\n"
-    "fmla v15.4s, v20.4s, v0.4s\n"
-    "add x9, x9, #4\n"
-    "fmla v13.4s, v20.4s, v3.4s\n"
-    "fmla v18.4s, v20.4s, v1.4s\n"
-    "fmla v22.4s, v20.4s, v4.4s\n"
-    "fmla v24.4s, v20.4s, v2.4s\n"
-    "fmla v21.4s, v20.4s, v19.4s\n"
-    "ldr s27, [x23, x19]\n"
-    "fmla v17.4s, v26.4s, v0.4s\n"
-    "ldr s20, [x22, x17]\n"
-    "fmla v18.4s, v26.4s, v3.4s\n"
-    "fmla v25.4s, v26.4s, v1.4s\n"
-    "fmla v22.4s, v26.4s, v5.4s\n"
-    "fmla v24.4s, v26.4s, v4.4s\n"
-    "fmla v21.4s, v26.4s, v6.4s\n"
-    "ldr s19, [x20, x11]\n"
-    "fmla v9.4s, v23.4s, v0.4s\n"
-    "ldr s28, [x23, x17]\n"
-    "fmla v25.4s, v23.4s, v3.4s\n"
-    "add x20, x20, #4\n"
-    "fmla v24.4s, v23.4s, v5.4s\n"
-    "ldr s29, [x22, x11]\n"
-    "fmla v13.4s, v27.4s, v0.4s\n"
-    "add x22, x22, #4\n"
-    "fmla v22.4s, v27.4s, v1.4s\n"
-    "fmla v21.4s, v27.4s, v2.4s\n"
-    "fmla v18.4s, v20.4s, v0.4s\n"
-    "ldr s30, [x23, x11]\n"
-    "fmla v24.4s, v20.4s, v1.4s\n"
-    "add x23, x23, #4\n"
-    "fmla v22.4s, v20.4s, v3.4s\n"
-    "fmla v21.4s, v20.4s, v4.4s\n"
-    "fmla v25.4s, v19.4s, v0.4s\n"
-    "movi v26.16b, #0\n"
-    "fmla v24.4s, v19.4s, v3.4s\n"
-    "fmov v27.4s, #6.0\n"
-    "fmla v21.4s, v19.4s, v5.4s\n"
-    "fmla v22.4s, v28.4s, v0.4s\n"
-    "fmax v12.4s, v12.4s, v26.4s\n"
-    "fmax v11.4s, v11.4s, v26.4s\n"
-    "fmla v24.4s, v29.4s, v0.4s\n"
-    "fmax v10.4s, v10.4s, v26.4s\n"
-    "fmla v21.4s, v28.4s, v1.4s\n"
-    "fmin v12.4s, v12.4s, v27.4s\n"
-    "fmin v11.4s, v11.4s, v27.4s\n"
-    "fmin v10.4s, v10.4s, v27.4s\n"
-    "str s12, [%[outptr0]]\n"
-    "fmax v9.4s, v9.4s, v26.4s\n"
-    "str s11, [%[outptr0], %[output_col_stride1]]\n"
-    "fmla v21.4s, v29.4s, v3.4s\n"
-    "str s10, [%[outptr0], x27]\n"
-    "fmin v9.4s, v9.4s, v27.4s\n"
-    "fmax v8.4s, v8.4s, v26.4s\n"
-    "fmax v7.4s, v7.4s, v26.4s\n"
-    "str s9, [%[outptr0], x28]\n"
-    "fmla v21.4s, v30.4s, v0.4s\n"
-    "fmin v8.4s, v8.4s, v27.4s\n"
-    "add %[outptr0], %[outptr0], #4\n"
-    "fmin v7.4s, v7.4s, v27.4s\n"
-    "fmax v17.4s, v17.4s, v26.4s\n"
-    "str s8, [x8]\n"
-    "fmax v25.4s, v25.4s, v26.4s\n"
-    "str s7, [x8, %[output_col_stride1]]\n"
-    "fmin v17.4s, v17.4s, v27.4s\n"
-    "fmin v25.4s, v25.4s, v27.4s\n"
-    "fmax v16.4s, v16.4s, v26.4s\n"
-    "str s17, [x8, x27]\n"
-    "fmax v15.4s, v15.4s, v26.4s\n"
-    "str s25, [x8, x28]\n"
-    "fmin v16.4s, v16.4s, v27.4s\n"
-    "fmin v15.4s, v15.4s, v27.4s\n"
-    "add x8, x8, #4\n"
-    "str s16, [x25]\n"
-    "fmax v18.4s, v18.4s, v26.4s\n"
-    "str s15, [x25, %[output_col_stride1]]\n"
-    "fmax v24.4s, v24.4s, v26.4s\n"
-    "fmin v18.4s, v18.4s, v27.4s\n"
-    "fmax v14.4s, v14.4s, v26.4s\n"
-    "fmin v24.4s, v24.4s, v27.4s\n"
-    "fmax v13.4s, v13.4s, v26.4s\n"
-    "str s18, [x25, x27]\n"
-    "fmin v14.4s, v14.4s, v27.4s\n"
-    "str s24, [x25, x28]\n"
-    "fmin v13.4s, v13.4s, v27.4s\n"
-    "str s14, [x26]\n"
-    "fmax v22.4s, v22.4s, v26.4s\n"
-    "str s13, [x26, %[output_col_stride1]]\n"
-    "fmax v21.4s, v21.4s, v26.4s\n"
-    "fmin v22.4s, v22.4s, v27.4s\n"
-    "add x25, x25, #4\n"
-    "fmin v21.4s, v21.4s, v27.4s\n"
-    "str s22, [x26, x27]\n"
-    "str s21, [x26, x28]\n"
-    "add x26, x26, #4\n"
-    "7:\n"
-    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
-    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
-  );
-}
-
-#endif  // __aarch64__
-
-template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
-
-}  // namespace depthwise
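For reference, the kernel deleted above (instantiated as DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>) computes 4x4 output tiles of a 3x3, stride-1 float depthwise convolution with the activation fused into the store path: for ReLU6 each accumulator is clamped by an fmax against a zeroed vector and an fmin against a vector of 6.0 before being written out. The scalar sketch below shows the per-tile arithmetic under those assumptions; the function and parameter names are hypothetical, not taken from the removed code.

#include <algorithm>
#include <cstddef>

// Scalar equivalent of one 4x4 output tile of the removed kernel:
// 3x3 depthwise convolution, stride 1, bias, fused ReLU6.
static void depthwise_3x3_tile_relu6(
    const float *input,    // top-left input element feeding this tile
    const float *weights,  // 9 per-channel weights, row-major
    float bias,
    float *output,         // 4x4 output tile for this channel
    std::size_t in_row_stride, std::size_t in_col_stride,
    std::size_t out_row_stride, std::size_t out_col_stride)
{
  for (int oi = 0; oi < 4; oi++)
  {
    for (int oj = 0; oj < 4; oj++)
    {
      float acc = bias;  // accumulators start from the loaded bias, as above
      for (int ki = 0; ki < 3; ki++)
      {
        for (int kj = 0; kj < 3; kj++)
        {
          acc += input[(oi + ki) * in_row_stride + (oj + kj) * in_col_stride]
               * weights[ki * 3 + kj];
        }
      }
      // ReLU6: mirrors the fmax(x, 0) / fmin(x, 6.0) pair in the assembly.
      output[oi * out_row_stride + oj * out_col_stride] =
          std::min(std::max(acc, 0.0f), 6.0f);
    }
  }
}

The vectorised kernel performs the same computation for four channels at a time per q-register iteration (lsr x15, n_channels, #2), while the tail loop between labels 4 and 7 handles the remaining channels one at a time with s-register loads and stores.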
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
deleted file mode 100644
index 27bfb84..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "impl_dilated.hpp"
-
-template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>;
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>;
-template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
deleted file mode 100644
index 1bae815..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <deque>
-#include <functional>
-#include <memory>
-
-#include "depthwise.hpp"
-
-namespace depthwise
-{
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename TIn, typename TBias, typename TOut
->
-class DilatedDepthwiseConvolution : public IDepthwiseConvolution
-{
-  public:
-    /** Create a new dilated depthwise convolution engine.
-     */
-    DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    /** Create a new dilated depthwise convolution engine with explicitly
-     *  specified output dimensions.
-     */
-    DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor, int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    // Cannot copy or move a DilatedDepthwiseConvolution.
-    DilatedDepthwiseConvolution(DilatedDepthwiseConvolution&) = delete;
-    DilatedDepthwiseConvolution operator=(DilatedDepthwiseConvolution&) = delete;
-
-    /* Set input tensor and stride. */
-    void set_input(const void *inptr) override;
-    void set_input(const void *inptr, int column_stride) override;
-    void set_input(const void *inptr, int row_stride, int column_stride) override;
-    void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
-
-    /* Set output tensor and stride. */
-    void set_output(void *outptr) override;
-    void set_output(void *outptr, int column_stride) override;
-    void set_output(void *outptr, int row_stride, int column_stride) override;
-    void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
-
-    static int get_output_size(
-      int dim_size,
-      unsigned int padding_before,
-      unsigned int padding_after,
-      int dilation_factor
-    );
-
-    int output_size(
-      int dim_size, unsigned int padding_before, unsigned int padding_after
-    ) const override;
-
-    /* Weights and biases are re-ordered to improve memory access patterns. Use
-     * these methods to determine the size of the re-pack buffer and to set the
-     * buffer address (which implicitly re-orders the weights and biases into
-     * that buffer).
-     */
-    size_t get_packed_params_size(void) const override;
-    void set_packed_params_buffer(void *) override;
-
-    void pack_params(const void *weights, const void *biases=nullptr) const override;
-    void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const override;
-    void pack_params(
-      void *buffer,
-      const void* weights,
-      unsigned int weight_row_stride,
-      unsigned int weight_col_stride,
-      const void *biases=nullptr
-    ) const override;
-
-    /* Working space is used to pad tensors on the fly. Before running any
-     * inference, check the amount of space required, allocate it, and provide
-     * a pointer to the convolution engine.
-     */
-    size_t get_working_space_size(unsigned int nthreads=1) const override;
-    void set_working_space(void *) override;
-
-    unsigned int get_window(void) const override;
-    void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
-  protected:
-    /** Protected constructor which also accepts a function to construct a new
-     * subconvolution
-     */
-    DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor, int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right,
-      std::function<IDepthwiseConvolution *(int, int, int, int, int, int, nck::ActivationFunction, unsigned int, unsigned int, unsigned int, unsigned int)> subconvfn
-    );
-
-    const int _dilation_factor;
-    const int _n_input_rows, _n_input_cols, _n_channels;
-    const int _padding_top, _padding_left;
-    const int _n_output_rows, _n_output_cols;
-
-    /* Dilated depthwise convolution is performed through repeated calls to
-     * non-dilated convolutions. If the dilation factor is $n$, then we perform
-     * $n^2$ depthwise convolutions.
-     */
-    using BaseDepthwise = DepthwiseConvolution<
-      OutputTileRows, OutputTileCols,
-      KernelRows, KernelCols,
-      StrideRows, StrideCols,
-      TIn, TBias, TOut
-    >;
-    std::deque<std::deque<std::unique_ptr<IDepthwiseConvolution>>> _convs;
-};
-
-}  // namespace depthwise
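
The decomposition above is worth spelling out: for unit stride, output positions whose row and column indices are congruent modulo the dilation factor d read disjoint, regularly strided sub-grids of the input, so one dilated convolution splits into d * d independent non-dilated depthwise convolutions (the engine generalises this to arbitrary stride via the per-sub-convolution offsets computed in impl_dilated.hpp). A minimal scalar reference of the arithmetic being decomposed, as a standalone hypothetical helper assuming NHWC layout, unit stride and implicit zero padding, not part of the library:

  #include <cstddef>
  #include <vector>

  // Direct (undecomposed) dilated depthwise convolution, for reference only.
  std::vector<float> dilated_depthwise_reference(
    const std::vector<float> &input,   // n_rows x n_cols x n_channels
    const std::vector<float> &weights, // k_rows x k_cols x n_channels
    int n_rows, int n_cols, int n_channels,
    int k_rows, int k_cols, int dilation,
    int pad_top, int pad_left, int n_out_rows, int n_out_cols)
  {
    std::vector<float> output(
      std::size_t(n_out_rows) * n_out_cols * n_channels, 0.0f);
    for (int r = 0; r < n_out_rows; r++)
    {
      for (int c = 0; c < n_out_cols; c++)
      {
        for (int ki = 0; ki < k_rows; ki++)
        {
          for (int kj = 0; kj < k_cols; kj++)
          {
            // Dilation spaces the kernel taps `dilation` pixels apart.
            const int ir = r + ki * dilation - pad_top;
            const int ic = c + kj * dilation - pad_left;
            if (ir < 0 || ir >= n_rows || ic < 0 || ic >= n_cols)
            {
              continue; // tap falls in the (zero) padding
            }
            for (int ch = 0; ch < n_channels; ch++)
            {
              output[(std::size_t(r) * n_out_cols + c) * n_channels + ch] +=
                weights[(std::size_t(ki) * k_cols + kj) * n_channels + ch] *
                input[(std::size_t(ir) * n_cols + ic) * n_channels + ch];
            }
          }
        }
      }
    }
    return output;
  }
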
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
deleted file mode 100644
index e56583d..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "depthwise_quantized_dilated.hpp"
-#include "impl_dilated.hpp"
-
-namespace depthwise {
-
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
-          unsigned int KernelRows, unsigned int KernelCols,
-          unsigned int StrideRows, unsigned int StrideCols>
-QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
-                                   KernelCols, StrideRows, StrideCols>::
-    QAsymm8DilatedDepthwiseConvolution(
-        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-        int dilation_factor, nck::ActivationFunction activation,
-        const qasymm8::QAsymm8Params &weight_quantisation,
-        const qasymm8::QAsymm8Params &input_quantisation,
-        const qasymm8::QAsymm8Params &output_quantisation,
-        unsigned int padding_top, unsigned int padding_left,
-        unsigned int padding_bottom, unsigned int padding_right)
-    : QAsymm8DilatedDepthwiseConvolution(
-          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
-          QAsymm8DilatedDepthwiseConvolution::get_output_size(
-              n_input_rows, padding_top, padding_bottom, dilation_factor),
-          QAsymm8DilatedDepthwiseConvolution::get_output_size(
-              n_input_cols, padding_left, padding_right, dilation_factor),
-          activation, weight_quantisation, input_quantisation,
-          output_quantisation, padding_top, padding_left, padding_bottom,
-          padding_right) {}
-
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
-          unsigned int KernelRows, unsigned int KernelCols,
-          unsigned int StrideRows, unsigned int StrideCols>
-QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
-                                   KernelCols, StrideRows, StrideCols>::
-    QAsymm8DilatedDepthwiseConvolution(
-        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-        int dilation_factor, int n_output_rows, int n_output_cols,
-        nck::ActivationFunction activation,
-        const qasymm8::QAsymm8Params &weight_quantisation,
-        const qasymm8::QAsymm8Params &input_quantisation,
-        const qasymm8::QAsymm8Params &output_quantisation,
-        unsigned int padding_top, unsigned int padding_left,
-        unsigned int padding_bottom, unsigned int padding_right)
-    : QAsymm8DilatedDepthwiseConvolution(
-          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
-          n_output_rows, n_output_cols, activation, weight_quantisation,
-          input_quantisation, output_quantisation,
-          qasymm8::QAsymm8RescaleParams::make_rescale_params(
-              weight_quantisation, input_quantisation, output_quantisation),
-          padding_top, padding_left, padding_bottom, padding_right) {}
-
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
-          unsigned int KernelRows, unsigned int KernelCols,
-          unsigned int StrideRows, unsigned int StrideCols>
-QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
-                                   KernelCols, StrideRows, StrideCols>::
-    QAsymm8DilatedDepthwiseConvolution(
-        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-        int dilation_factor, nck::ActivationFunction activation,
-        const qasymm8::QAsymm8Params &weight_quantisation,
-        const qasymm8::QAsymm8Params &input_quantisation,
-        const qasymm8::QAsymm8Params &output_quantisation,
-        const qasymm8::QAsymm8RescaleParams &rescale_parameters,
-        unsigned int padding_top, unsigned int padding_left,
-        unsigned int padding_bottom, unsigned int padding_right)
-    : QAsymm8DilatedDepthwiseConvolution(
-          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
-          QAsymm8DilatedDepthwiseConvolution::get_output_size(
-              n_input_rows, padding_top, padding_bottom, dilation_factor),
-          QAsymm8DilatedDepthwiseConvolution::get_output_size(
-              n_input_cols, padding_left, padding_right, dilation_factor),
-          activation, weight_quantisation, input_quantisation,
-          output_quantisation, rescale_parameters, padding_top, padding_left,
-          padding_bottom, padding_right) {}
-
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
-          unsigned int KernelRows, unsigned int KernelCols,
-          unsigned int StrideRows, unsigned int StrideCols>
-QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
-                                   KernelCols, StrideRows, StrideCols>::
-    QAsymm8DilatedDepthwiseConvolution(
-        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-        int dilation_factor, int n_output_rows, int n_output_cols,
-        nck::ActivationFunction activation,
-        const qasymm8::QAsymm8Params &weight_quantisation,
-        const qasymm8::QAsymm8Params &input_quantisation,
-        const qasymm8::QAsymm8Params &output_quantisation,
-        const qasymm8::QAsymm8RescaleParams &rescale_parameters,
-        unsigned int padding_top, unsigned int padding_left,
-        unsigned int padding_bottom, unsigned int padding_right)
-    : DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
-                                  KernelCols, StrideRows, StrideCols, uint8_t,
-                                  int32_t, uint8_t>(
-          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
-          n_output_rows, n_output_cols, activation, padding_top, padding_left,
-          padding_bottom, padding_right,
-          [weight_quantisation, input_quantisation, output_quantisation,
-           rescale_parameters](
-              const int n_batches, const int n_input_rows,
-              const int n_input_cols, const int n_channels,
-              const int n_output_rows, const int n_output_cols,
-              const nck::ActivationFunction activation,
-              const unsigned int padding_top, const unsigned int padding_left,
-              const unsigned int padding_bottom,
-              const unsigned int padding_right) -> IDepthwiseConvolution * {
-            return new QAsymm8DepthwiseConvolution<
-                OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                StrideRows, StrideCols>(
-                n_batches, n_input_rows, n_input_cols, n_channels,
-                n_output_rows, n_output_cols, activation, weight_quantisation,
-                input_quantisation, output_quantisation, rescale_parameters,
-                padding_top, padding_left, padding_bottom, padding_right);
-          }) {}
-
-} // namespace depthwise
-
-template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>;
-template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>;
-template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>;
-template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>;
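
The rescale parameters derived via make_rescale_params() collapse the three quantisation scales into a single fixed-point multiplier: the int32 accumulator must be scaled by real_multiplier = (input_scale * weight_scale) / output_scale before the output zero point is applied. A sketch of the usual derivation, assumed to follow the common gemmlowp-style scheme; all names here are illustrative, not the library's API:

  #include <cmath>
  #include <cstdint>

  // Split the real-valued rescale factor into a Q31 fixed-point multiplier
  // and a rounding right shift.
  void derive_rescale(float input_scale, float weight_scale, float output_scale,
                      int32_t &quantized_multiplier, int &right_shift)
  {
    const double real_multiplier =
      double(input_scale) * double(weight_scale) / double(output_scale);
    int exponent = 0;
    const double m = std::frexp(real_multiplier, &exponent); // m in [0.5, 1)
    // Q31 representation of m; the accumulator is later scaled by a
    // saturating-doubling high multiply followed by a rounding shift.
    int64_t q = std::llround(m * double(1ll << 31));
    if (q == (1ll << 31)) // mantissa rounded up to 1.0
    {
      q /= 2;
      exponent += 1;
    }
    quantized_multiplier = int32_t(q);
    right_shift = -exponent; // bits of rounding right shift after the multiply
  }
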
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp
deleted file mode 100644
index 99f0f53..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp16_fp16.hpp"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace depthwise
-{
-template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>;
-template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>;
-template class DepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>;
-template class DepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>;
-}  // namespace depthwise
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
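
The guard matters because float16_t arithmetic only exists as native instructions when the compiler targets Armv8.2-A FP16 (e.g. -march=armv8.2-a+fp16); without the feature the translation unit must compile to nothing. The same gating pattern in miniature, as an illustrative snippet rather than library code:

  #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  #include <arm_neon.h>

  // Only compiled when the target has native FP16 vector arithmetic.
  float16_t double_fp16(float16_t x)
  {
    const float16x8_t v = vdupq_n_f16(x);
    return vgetq_lane_f16(vaddq_f16(v, v), 0);
  }
  #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
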
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
deleted file mode 100644
index bddae51..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "impl_base.hpp"
-
-// TODO Move to common utilities somewhere
-template <size_t Size> struct DType { };
-template <> struct DType<1> { using scalar_type = uint8_t; };
-template <> struct DType<2> { using scalar_type = uint16_t; };
-template <> struct DType<4> { using scalar_type = uint32_t; };
-
-namespace depthwise
-{
-
-template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
-void PackParameters<KernelRows, KernelColumns, WeightSize, BiasSize>::execute(
-  unsigned int n_channels,
-  void *buffer,
-  const void *weights,
-  const unsigned int weight_row_stride,
-  const unsigned int weight_col_stride,
-  const void *biases
-)
-{
-  using TWeight = typename DType<WeightSize>::scalar_type;
-  using TBias = typename DType<BiasSize>::scalar_type;
-
-  auto buffer_ptr = static_cast<uint8_t *>(buffer);
-  auto weights_ptr = static_cast<const TWeight *>(weights);
-  auto biases_ptr = static_cast<const TBias *>(biases);
-
-  const unsigned int veclen = 16 / WeightSize;
-  for (; n_channels >= veclen; n_channels -= veclen)
-  {
-    // Copy biases
-    for (unsigned int i = 0; i < veclen; i++)
-    {
-      auto ptr = reinterpret_cast<TBias *>(buffer_ptr);
-      *ptr = (biases_ptr == nullptr) ? 0x0 : *(biases_ptr++);
-      buffer_ptr += BiasSize;
-    }
-
-    // Copy weights
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelColumns; j++)
-      {
-        for (unsigned int c = 0; c < veclen; c++)
-        {
-          *(reinterpret_cast<TWeight *>(buffer_ptr)) = weights_ptr[i*weight_row_stride + j*weight_col_stride + c];
-          buffer_ptr += WeightSize;
-        }
-      }
-    }
-    weights_ptr += veclen;
-  }
-  for (; n_channels; n_channels--)
-  {
-    // Copy bias
-    auto ptr = reinterpret_cast<TBias *>(buffer_ptr);
-    *ptr = (biases_ptr == nullptr) ? 0x0 : *(biases_ptr++);
-    buffer_ptr += BiasSize;
-
-    // Copy weights
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelColumns; j++)
-      {
-        *(reinterpret_cast<TWeight *>(buffer_ptr)) = weights_ptr[i*weight_row_stride + j*weight_col_stride];
-        buffer_ptr += WeightSize;
-      }
-    }
-    weights_ptr++;
-  }
-}
-
-template struct PackParameters<3, 3, 2ul, 2ul>;
-template struct PackParameters<3, 3, 4ul, 4ul>;
-template struct PackParameters<5, 5, 2ul, 2ul>;
-template struct PackParameters<5, 5, 4ul, 4ul>;
-}  // namespace depthwise
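
The packing routine above interleaves parameters so a kernel can walk the buffer with one linear pointer: for each block of veclen = 16 / WeightSize channels it emits the veclen biases (zeros when none are supplied) followed by the KernelRows x KernelColumns weight taps in row, column, channel order, then packs the channel tail one at a time with the same per-channel footprint. A small layout and size sketch under those assumptions (hypothetical struct, not library API):

  #include <cstddef>

  // Per channel the packed buffer costs one bias plus one weight per kernel
  // tap, regardless of the vector blocking, because the tail loop uses the
  // same per-channel footprint.
  struct PackedLayout
  {
    unsigned int kernel_rows, kernel_cols;
    std::size_t weight_size, bias_size; // bytes per element

    std::size_t bytes_per_channel() const
    {
      return bias_size + kernel_rows * kernel_cols * weight_size;
    }

    std::size_t buffer_size(unsigned int n_channels) const
    {
      return n_channels * bytes_per_channel();
    }
  };

  // Example: fp32 3x3 -> 4 + 9*4 = 40 bytes per channel, so 64 channels need
  // PackedLayout{3, 3, 4, 4}.buffer_size(64) == 2560 bytes.
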
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
deleted file mode 100644
index b09f620..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_qa8_qa8.hpp"
-
-namespace depthwise
-{
-template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>;
-template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>;
-template class QAsymm8DepthwiseConvolution<2, 2, 5, 5, 1, 1>;
-template class QAsymm8DepthwiseConvolution<2, 2, 5, 5, 2, 2>;
-}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
deleted file mode 100644
index 1ae48b9..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_qa8_qs8_per_channel.hpp"
-
-namespace depthwise {
-template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>;
-template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>;
-template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>;
-template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>;
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
deleted file mode 100644
index 4343f6a..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "depthwise.hpp"
-#include "qasymm8.hpp"
-#include "qsymm8.hpp"
-
-using namespace neon_convolution_kernels;
-using namespace qasymm8;
-
-inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b)
-{
-  return vqrdmulhq_s32(a, b);
-}
-
-inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
-{
-  return vqrdmulhq_n_s32(a, b);
-}
-
-inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
-{
-  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
-}
-
-inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift)
-{
-  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
-  const int32x4_t fixed = vqaddq_s32(x, fixup);
-  return vrshlq_s32(fixed, shift);
-}
-
-inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
-{
-  const int32x4_t shift = vdupq_n_s32(-exponent);
-  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
-  const int32x4_t fixed = vqaddq_s32(x, fixup);
-  return vrshlq_s32(fixed, shift);
-}
-
-inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
-{
-  const int32x2_t shift = vdup_n_s32(-exponent);
-  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
-  const int32x2_t fixed = vqadd_s32(x, fixup);
-  return vrshl_s32(fixed, shift);
-}
-
-inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
-{
-  const int32x2_t xs = vdup_n_s32(x);
-  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
-}
-
-namespace depthwise
-{
-
-namespace nck = neon_convolution_kernels;
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  uint8_t, int32_t, uint8_t,
-  QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
->
-{
-  using Base = DepthwiseConvolutionBase<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    uint8_t, int32_t, uint8_t,
-    QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
-  >;
-  friend Base;
-  using InputType = typename Base::InputType;
-  using OutputType = typename Base::OutputType;
-
-  public:
-    QAsymm8DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params& weight_quantisation,
-      const qasymm8::QAsymm8Params& input_quantisation,
-      const qasymm8::QAsymm8Params& output_quantisation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    QAsymm8DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params& weight_quantisation,
-      const qasymm8::QAsymm8Params& input_quantisation,
-      const qasymm8::QAsymm8Params& output_quantisation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    QAsymm8DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params& weight_quantisation,
-      const qasymm8::QAsymm8Params& input_quantisation,
-      const qasymm8::QAsymm8Params& output_quantisation,
-      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    QAsymm8DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params& weight_quantisation,
-      const qasymm8::QAsymm8Params& input_quantisation,
-      const qasymm8::QAsymm8Params& output_quantisation,
-      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-  protected:
-    uint8_t _input_padding_value(void) const;
-
-    void _pack_params(
-      void *buffer,
-      const void *weights,
-      unsigned int weight_row_stride,
-      unsigned int weight_col_stride,
-      const void *biases=nullptr
-    ) const;
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const uint8_t* inptr,
-      unsigned int in_row_stride,
-      unsigned int in_col_stride,
-      uint8_t* outptr,
-      unsigned int out_row_stride,
-      unsigned int out_col_stride
-    );
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-      uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-    );
-
-  private:
-    // Quantization parameters
-    const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant;
-    const qasymm8::QAsymm8RescaleParams rescale_parameters;
-};
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  uint8_t, int32_t, uint8_t,
-  QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
->
-{
-  using Base = DepthwiseConvolutionBase<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    uint8_t, int32_t, uint8_t,
-    QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
-  >;
-  friend Base;
-  using InputType = typename Base::InputType;
-  using OutputType = typename Base::OutputType;
-
-  public:
-  QSymm8HybridPerChannelDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      const qsymm8::QSymm8PerChannelParams& weight_quantisation,
-      const qasymm8::QAsymm8Params& input_quantisation,
-      const qasymm8::QAsymm8Params& output_quantisation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-  QSymm8HybridPerChannelDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      const qsymm8::QSymm8PerChannelParams& weight_quantisation,
-      const qasymm8::QAsymm8Params& input_quantisation,
-      const qasymm8::QAsymm8Params& output_quantisation,
-      const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-  size_t get_packed_params_size(void) const override
-  {
-      return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t));
-  }
-
-  protected:
-    uint8_t _input_padding_value(void) const;
-
-    void _pack_params(
-      void *buffer,
-      const void *weights,
-      unsigned int weight_row_stride,
-      unsigned int weight_col_stride,
-      const void *biases=nullptr
-    ) const;
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const uint8_t* inptr,
-      unsigned int in_row_stride,
-      unsigned int in_col_stride,
-      uint8_t* outptr,
-      unsigned int out_row_stride,
-      unsigned int out_col_stride
-    );
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-      uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-    );
-
-  private:
-    // Quantization parameters
-    const qsymm8::QSymm8PerChannelParams _weights_quant;
-    const qasymm8::QAsymm8Params _input_quant, _output_quant;
-    const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters;
-};
-
-}  // namespace depthwise
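
The two groups of inline helpers at the top of this header are the core of the requantisation path: an int32 accumulator is rescaled as rounding_divide_by_exp2(saturating_doubling_high_mul(acc, multiplier), shift). Scalar equivalents in the gemmlowp style, as a standalone sketch whose semantics are assumed to match vqrdmulhq_s32 plus a rounding right shift:

  #include <cstdint>
  #include <limits>

  // Scalar twin of saturating_doubling_high_mul(): the high 32 bits of the
  // doubled product, rounded; saturates on the single overflowing input pair.
  inline int32_t sat_doubling_high_mul(int32_t a, int32_t b)
  {
    if (a == std::numeric_limits<int32_t>::min() && b == a)
    {
      return std::numeric_limits<int32_t>::max();
    }
    const int64_t ab = int64_t(a) * int64_t(b);
    const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    return int32_t((ab + nudge) / (1ll << 31));
  }

  // Scalar twin of rounding_divide_by_exp2(): arithmetic shift right with
  // round-to-nearest, ties away from zero.
  inline int32_t rounding_shift_right(int32_t x, int exponent)
  {
    const int32_t mask = (int32_t(1) << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
  }

  // Requantising an accumulator then becomes:
  //   out = rounding_shift_right(sat_doubling_high_mul(acc, multiplier), shift);
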
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
deleted file mode 100644
index a11b098..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "depthwise_dilated.hpp"
-#include "depthwise_quantized.hpp"
-
-namespace depthwise {
-
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
-          unsigned int KernelRows, unsigned int KernelCols,
-          unsigned int StrideRows, unsigned int StrideCols>
-class QAsymm8DilatedDepthwiseConvolution
-    : public DilatedDepthwiseConvolution<
-          OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
-          StrideCols, uint8_t, int32_t, uint8_t> {
-public:
-  /** Create a new dilated depthwise convolution engine.
-   */
-  QAsymm8DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor, nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params &weight_quantisation,
-      const qasymm8::QAsymm8Params &input_quantisation,
-      const qasymm8::QAsymm8Params &output_quantisation,
-      unsigned int padding_top, unsigned int padding_left,
-      unsigned int padding_bottom, unsigned int padding_right);
-
-  /** Create a new dilated depthwise convolution engine.
-   */
-  QAsymm8DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor, int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params &weight_quantisation,
-      const qasymm8::QAsymm8Params &input_quantisation,
-      const qasymm8::QAsymm8Params &output_quantisation,
-      unsigned int padding_top, unsigned int padding_left,
-      unsigned int padding_bottom, unsigned int padding_right);
-
-  /** Create a new dilated depthwise convolution engine.
-   */
-  QAsymm8DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor, nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params &weight_quantisation,
-      const qasymm8::QAsymm8Params &input_quantisation,
-      const qasymm8::QAsymm8Params &output_quantisation,
-      const qasymm8::QAsymm8RescaleParams &rescale_parameters,
-      unsigned int padding_top, unsigned int padding_left,
-      unsigned int padding_bottom, unsigned int padding_right);
-
-  /** Create a new dilated depthwise convolution engine.
-   */
-  QAsymm8DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor, int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params &weight_quantisation,
-      const qasymm8::QAsymm8Params &input_quantisation,
-      const qasymm8::QAsymm8Params &output_quantisation,
-      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
-      unsigned int padding_top, unsigned int padding_left,
-      unsigned int padding_bottom, unsigned int padding_right);
-};
-
-}  // namespace depthwise
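
All four constructors above funnel through get_output_size(dim, pad_before, pad_after, dilation_factor). A K-tap kernel with dilation d spans d*(K - 1) + 1 input pixels, so the output extent follows the usual convolution formula with that effective kernel size. A hypothetical restatement of that arithmetic (whether the helper computed it exactly this way is an assumption):

  //   out = ceil((dim + pad_before + pad_after - (d*(K - 1) + 1) + 1) / S)
  int dilated_output_size(int dim, int pad_before, int pad_after,
                          int kernel, int dilation, int stride)
  {
    const int effective_kernel = dilation * (kernel - 1) + 1;
    const int span = dim + pad_before + pad_after - effective_kernel + 1;
    return (span + stride - 1) / stride; // iceildiv
  }

  // Example: 224 input, 3-tap kernel, dilation 2, stride 1, 1 pixel of
  // padding on each edge -> effective kernel 5, output 224 + 2 - 5 + 1 = 222.
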
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp
deleted file mode 100644
index 266d13d..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp
+++ /dev/null
@@ -1,505 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- *          NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include <algorithm>
-#include <cstdint>
-#include "depthwise.hpp"
-#include "padding.hpp"
-#include "utils.hpp"
-
-#pragma once
-
-#define MEMBERFN(TOUT) template <\
-  unsigned int OutputTileRows, unsigned int OutputTileColumns,\
-  unsigned int KernelRows, unsigned int KernelColumns,\
-  unsigned int StrideRows, unsigned int StrideColumns,\
-  typename TIn, typename TBias, typename TOut,\
-  typename Derived\
-> TOUT DepthwiseConvolutionBase<\
-  OutputTileRows, OutputTileColumns,\
-  KernelRows, KernelColumns,\
-  StrideRows, StrideColumns,\
-  TIn, TBias, TOut, Derived\
->
-
-using namespace neon_convolution_kernels;
-
-namespace depthwise
-{
-
-template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
-struct PackParameters
-{
-  static void execute(
-    unsigned int n_channels,
-    void *buffer,
-    const void *weights,
-    unsigned int weight_row_stride,
-    unsigned int weight_col_stride,
-    const void *biases
-  );
-};
-
-const unsigned int CHANNEL_BLOCK = 16;
-
-MEMBERFN(int)::get_output_size(
-  const int dim_size, const unsigned int padding_before, const unsigned int padding_after
-)
-{
-  return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows);
-}
-
-MEMBERFN(int)::output_size(
-  const int dim_size, const unsigned int padding_before, const unsigned int padding_after
-) const
-{
-  return get_output_size(dim_size, padding_before, padding_after);
-}
-
-MEMBERFN()::DepthwiseConvolutionBase(
-  const int n_batches,
-  const int n_input_rows,
-  const int n_input_cols,
-  const int n_channels,
-  ActivationFunction activation,
-  const unsigned int padding_top,
-  const unsigned int padding_left,
-  const unsigned int padding_bottom,
-  const unsigned int padding_right
-) : DepthwiseConvolutionBase(
-      n_batches, n_input_rows, n_input_cols, n_channels,
-      get_output_size(n_input_rows, padding_top, padding_bottom),
-      get_output_size(n_input_cols, padding_left, padding_right),
-      activation,
-      padding_top, padding_left, padding_bottom, padding_right
-    )
-{
-}
-
-MEMBERFN()::DepthwiseConvolutionBase(
-  const int n_batches,
-  const int n_input_rows,
-  const int n_input_cols,
-  const int n_channels,
-  const int n_output_rows,
-  const int n_output_cols,
-  ActivationFunction activation,
-  const unsigned int padding_top,
-  const unsigned int padding_left,
-  const unsigned int padding_bottom,
-  const unsigned int padding_right
-) : _input(nullptr), _output(nullptr),
-    _packed_parameters(nullptr),
-    _working_space(nullptr),
-    _n_batches(n_batches),
-    _n_input_rows(n_input_rows),
-    _n_input_cols(n_input_cols),
-    _n_channels(n_channels),
-    _n_output_rows(n_output_rows),
-    _n_output_cols(n_output_cols),
-    _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)),
-    _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)),
-    _padding_top(padding_top),
-    _padding_left(padding_left),
-    _padding_bottom(padding_bottom),
-    _padding_right(padding_right),
-    _activation(activation),
-    _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0),
-    _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0)
-{
-}
-
-MEMBERFN(void)::set_input(const void* const inptr)
-{
-  set_input(inptr, _n_channels);
-}
-
-MEMBERFN(void)::set_input(const void* const inptr, const int ld_col)
-{
-  set_input(inptr, _n_input_cols * ld_col, ld_col);
-}
-
-MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col)
-{
-  set_input(inptr, _n_input_rows * ld_row, ld_row, ld_col);
-}
-
-MEMBERFN(void)::set_input(const void* const inptr, const int ld_batch, const int ld_row, const int ld_col)
-{
-  _input = static_cast<const TIn *>(inptr);
-  _input_batch_stride = ld_batch;
-  _input_row_stride = ld_row;
-  _input_col_stride = ld_col;
-}
-
-MEMBERFN(void)::set_output(void* const outptr)
-{
-  set_output(outptr, _n_channels);
-}
-
-MEMBERFN(void)::set_output(void* const outptr, const int ld_col)
-{
-  set_output(outptr, _n_output_cols * ld_col, ld_col);
-}
-
-MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col)
-{
-  set_output(outptr, _n_output_rows * ld_row, ld_row, ld_col);
-}
-
-MEMBERFN(void)::set_output(void* const outptr, const int ld_batch, const int ld_row, const int ld_col)
-{
-  _output = static_cast<TOut *>(outptr);
-  _output_batch_stride = ld_batch;
-  _output_row_stride = ld_row;
-  _output_col_stride = ld_col;
-}
-
-MEMBERFN(size_t)::get_packed_params_size(void) const
-{
-  return _n_channels * (sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));
-}
-
-MEMBERFN(void)::set_packed_params_buffer(void *buffer)
-{
-  _packed_parameters = buffer;
-}
-
-MEMBERFN(void)::pack_params(const void *weights, const void *biases) const
-{
-  static_cast<const Derived *>(this)->pack_params(_packed_parameters, weights, biases);
-}
-
-MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const
-{
-  const unsigned int weight_col_stride = _n_channels;
-  const unsigned int weight_row_stride = KernelColumns * weight_col_stride;
-  static_cast<const Derived *>(this)->pack_params(
-    buffer, weights, weight_row_stride, weight_col_stride, biases
-  );
-}
-
-MEMBERFN(void)::pack_params(
-  void * const buffer,
-  const void * const weights,
-  const unsigned int weight_row_stride,
-  const unsigned int weight_col_stride,
-  const void * const biases
-) const
-{
-  static_cast<const Derived *>(this)->_pack_params(
-    buffer, weights, weight_row_stride, weight_col_stride, biases
-  );
-}
-
-MEMBERFN(void)::_pack_params(
-  void * const buffer,
-  const void * const weights,
-  const unsigned int weight_row_stride,
-  const unsigned int weight_col_stride,
-  const void * const biases
-) const
-{
-  // Default implementation
-  PackParameters<KernelRows, KernelColumns, sizeof(TIn), sizeof(TOut)>::execute(
-    _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases
-  );
-}
-
-MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
-{
-  return nthreads * (
-    _get_input_working_space_size() + _get_output_working_space_size()
-  );
-}
-
-MEMBERFN(void)::set_working_space(void *buffer)
-{
-  _working_space = buffer;
-}
-
-MEMBERFN(size_t)::_get_input_working_space_size(void) const
-{
-  return sizeof(TIn) * _n_channels;
-}
-
-MEMBERFN(size_t)::_get_output_working_space_size(void) const
-{
-  return sizeof(TOut) * _n_channels;
-}
-
-MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const
-{
-  return static_cast<uint8_t*>(_working_space) + threadid * (
-    _get_input_working_space_size() + _get_output_working_space_size()
-  );
-}
-
-MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const
-{
-  return static_cast<uint8_t*>(_get_input_working_space(threadid)) + _get_input_working_space_size();
-}
-
-MEMBERFN(unsigned int)::get_window() const
-{
-  // Parallelise over blocks of channels.
-  return iceildiv(_n_channels, CHANNEL_BLOCK);
-}
-
-MEMBERFN(void)::run(
-  const unsigned int start,
-  const unsigned int stop,
-  const unsigned int threadid
-)
-{
-  // Clear the input padding buffer
-  TIn *buf = static_cast<TIn *>(_get_input_working_space(threadid));
-  const TIn pad_value = static_cast<Derived *>(this)->_input_padding_value();
-  for (int n = 0; n < _n_channels; n++)
-  {
-    buf[n] = pad_value;
-  }
-
-  // Parallelise over blocks of channels
-  const auto start_channel = CHANNEL_BLOCK * start;
-  const auto stop_channel = std::min<unsigned int>(_n_channels, CHANNEL_BLOCK * stop);
-  const auto params_size_per_channel = this->get_packed_params_size()/_n_channels;
-
-  // Compute top and bottom padding for input and output
-  const int input_pad_top = _padding_top;
-  const int input_pad_left = _padding_left;
-  constexpr int tile_overlap = kernel_rows - stride_rows;
-
-  // Perform the convolution by calling `process_tile_row` for each tile row in
-  // each batch.
-  for (int batch = 0; batch < _n_batches; batch++)
-  {
-    const TIn* const inptr_batch = _input + batch*_input_batch_stride;
-    TOut* const outptr_batch = _output + batch*_output_batch_stride;
-
-    // Loop over rows of tiles
-    for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++)
-    {
-      // Pointer to the row
-      const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top;
-      const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride);
-      TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride;
-
-      // Input padding (top + bottom) for the row
-      const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top;
-      const int input_row_bottom = input_row_top + inner_tile_rows;
-      const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0;
-      const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows);
-
-      // Output padding (bottom) for the row
-      const int output_row_bottom = (tile_i + 1)*output_tile_rows;
-      const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);
-
-      // Get the offset into the packed parameters
-      const auto params_ptr = static_cast<const uint8_t*>(_packed_parameters) +
-        start_channel*params_size_per_channel;
-
-      // Process the row
-      process_tile_row(
-        threadid,
-        stop_channel - start_channel,
-        params_ptr,
-        inptr_row + start_channel,
-        outptr_row + start_channel,
-        input_row_pad_top, input_pad_left, input_row_pad_bottom,
-        output_row_pad_bottom,
-        _n_tile_cols, _n_input_cols, _n_output_cols
-      );
-    }
-  }
-}
-
-MEMBERFN(void)::process_tile_row(
-  const unsigned int threadid,
-  const int n_channels,
-  const void* const packed_params,
-  const TIn* const inptr,
-  TOut* const outptr,
-  const int row_pad_in_top,
-  const int row_pad_in_left,
-  const int row_pad_in_bottom,
-  const int row_pad_out_bottom,
-  const int n_tiles,
-  const int n_input_cols,
-  const int n_output_cols
-)
-{
-  constexpr int tile_overlap = kernel_cols - stride_cols;
-
-  // Loop over columns of tiles
-  for (int tile_j = 0; tile_j < n_tiles; tile_j++)
-  {
-    // Input padding (left + right) for the tile
-    const int t_pad_in_left = (tile_j == 0) ? row_pad_in_left : 0;
-    const int t_in_start = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left;
-    const int t_in_end = t_in_start + inner_tile_cols;
-    const int t_pad_in_right = std::max(0, t_in_end - n_input_cols);
-
-    // Output padding (right) for the tile
-    const int t_out_end = (tile_j + 1) * output_tile_cols;
-    const int t_pad_out_right = std::max(0, t_out_end - n_output_cols);
-
-    // Get pointers into the inputs and outputs
-    const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left;
-    const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*_input_col_stride);
-    TOut* const outptr_col = outptr + tile_j * output_tile_cols * _output_col_stride;
-
-    // Process just this tile
-    process_tile(
-      threadid, n_channels, packed_params, inptr_col, outptr_col,
-      row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,  // Input paddings
-      row_pad_out_bottom, t_pad_out_right  // Output paddings
-    );
-  }
-}
-
-MEMBERFN(TIn)::_input_padding_value(void) const
-{
-  return static_cast<TIn>(0);
-}
-
-MEMBERFN(void)::process_tile(
-  const unsigned int threadid,
-  const int n_channels,
-  const void* const packed_params,
-  const TIn* const inptr,
-  TOut* const outptr,
-  const int pad_in_top,
-  const int pad_in_left,
-  const int pad_in_bottom,
-  const int pad_in_right,
-  const int pad_out_bottom,
-  const int pad_out_right
-)
-{
-  Derived * dthis = static_cast<Derived *>(this);
-  const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right;
-  const bool pad_output = pad_out_bottom || pad_out_right;
-
-  if (!pad_input && !pad_output)
-  {
-    switch(_activation)
-    {
-      case ActivationFunction::ReLU:
-        dthis->template execute_tile<ActivationFunction::ReLU>(
-          n_channels, packed_params,
-          inptr, _input_row_stride, _input_col_stride,
-          outptr, _output_row_stride, _output_col_stride
-        );
-        break;
-      case ActivationFunction::ReLU6:
-        dthis->template execute_tile<ActivationFunction::ReLU6>(
-          n_channels, packed_params,
-          inptr, _input_row_stride, _input_col_stride,
-          outptr, _output_row_stride, _output_col_stride
-        );
-        break;
-      default:
-        dthis->template execute_tile<ActivationFunction::None>(
-          n_channels, packed_params,
-          inptr, _input_row_stride, _input_col_stride,
-          outptr, _output_row_stride, _output_col_stride
-        );
-        break;
-    }
-  }
-  else
-  {
-    // Create arrays of input and output pointers, pointing padded elements to
-    // the working space padding buffers provided.
-    const TIn *inptrs[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
-    {
-      for (int j = 0; j < inner_tile_cols; j++)
-      {
-        if (i < pad_in_top || (inner_tile_rows - pad_in_bottom) <= i ||
-            j < pad_in_left || (inner_tile_cols - pad_in_right) <= j)
-        {
-          // Padded input
-          inptrs[i][j] = static_cast<const TIn *>(_get_input_working_space(threadid));
-        }
-        else
-        {
-          inptrs[i][j] = inptr + (i - pad_in_top)*_input_row_stride + (j - pad_in_left)*_input_col_stride;
-        }
-      }
-    }
-
-    TOut *outptrs[output_tile_rows][output_tile_cols];
-    for (int i = 0; i < output_tile_rows; i++)
-    {
-      for (int j = 0; j < output_tile_cols; j++)
-      {
-        if (i < (output_tile_rows - pad_out_bottom) &&
-            j < (output_tile_cols - pad_out_right))
-        {
-          outptrs[i][j] = outptr + i*_output_row_stride + j*_output_col_stride;
-        }
-        else
-        {
-          outptrs[i][j] = static_cast<TOut *>(_get_output_working_space(threadid));
-        }
-      }
-    }
-
-    switch(_activation)
-    {
-      case ActivationFunction::ReLU:
-        dthis->template execute_tile<ActivationFunction::ReLU>(
-          n_channels, packed_params, inptrs, outptrs
-        );
-        break;
-      case ActivationFunction::ReLU6:
-        dthis->template execute_tile<ActivationFunction::ReLU6>(
-          n_channels, packed_params, inptrs, outptrs
-        );
-        break;
-      default:
-        dthis->template execute_tile<ActivationFunction::None>(
-          n_channels, packed_params, inptrs, outptrs
-        );
-        break;
-    }
-  }
-}
-
-MEMBERFN(int)::n_channels(void) const
-{
-  return _n_channels;
-}
-
-}  // namespace depthwise
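
process_tile above handles padding without branches in the hot loop: it builds per-element pointer tables in which every padded input coordinate aliases a working-space buffer pre-filled with _input_padding_value(), every out-of-range output aliases scratch space, and the tile kernel then loads and stores through the tables unconditionally. The input-side trick in miniature, as an illustrative scalar sketch:

  // Padded coordinates alias one buffer pre-filled with the padding value,
  // so the tile kernel reads through the table with no bounds checks.
  void gather_tile_pointers(
    const float *in, int in_rows, int in_cols, int ld_row, int ld_col,
    int tile_rows, int tile_cols, int origin_row, int origin_col,
    const float *pad_buffer, // pre-filled padding row in working space
    const float **ptrs)      // tile_rows * tile_cols entries, row-major
  {
    for (int i = 0; i < tile_rows; i++)
    {
      for (int j = 0; j < tile_cols; j++)
      {
        const int r = origin_row + i;
        const int c = origin_col + j;
        const bool valid = 0 <= r && r < in_rows && 0 <= c && c < in_cols;
        ptrs[i * tile_cols + j] = valid ? in + r * ld_row + c * ld_col
                                        : pad_buffer;
      }
    }
  }
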
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
deleted file mode 100644
index 4130188..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
+++ /dev/null
@@ -1,295 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "depthwise_dilated.hpp"
-#include "utils.hpp"
-
-#define MEMBERFN(TOUT)                                                         \
-  template <unsigned int OutputTileRows, unsigned int OutputTileColumns,       \
-            unsigned int KernelRows, unsigned int KernelColumns,               \
-            unsigned int StrideRows, unsigned int StrideColumns, typename TIn, \
-            typename TBias, typename TOut>                                     \
-  TOUT DilatedDepthwiseConvolution<OutputTileRows, OutputTileColumns,          \
-                                   KernelRows, KernelColumns, StrideRows,      \
-                                   StrideColumns, TIn, TBias, TOut>
-
-namespace depthwise {
-
-MEMBERFN()
-::DilatedDepthwiseConvolution(const int n_batches, const int n_input_rows,
-                              const int n_input_cols, const int n_channels,
-                              const int dilation_factor,
-                              nck::ActivationFunction activation,
-                              const unsigned int padding_top,
-                              const unsigned int padding_left,
-                              const unsigned int padding_bottom,
-                              const unsigned int padding_right)
-    : DilatedDepthwiseConvolution(
-          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
-          DilatedDepthwiseConvolution::get_output_size(
-              n_input_rows, padding_top, padding_bottom, dilation_factor),
-          DilatedDepthwiseConvolution::get_output_size(
-              n_input_cols, padding_left, padding_right, dilation_factor),
-          activation, padding_top, padding_left, padding_bottom,
-          padding_right) {}
-
-MEMBERFN()
-::DilatedDepthwiseConvolution(const int n_batches, const int n_input_rows,
-                              const int n_input_cols, const int n_channels,
-                              const int dilation_factor,
-                              const int n_output_rows, const int n_output_cols,
-                              nck::ActivationFunction activation,
-                              const unsigned int padding_top,
-                              const unsigned int padding_left,
-                              const unsigned int, // padding_bottom
-                              const unsigned int  // padding_right
-                              )
-    : DilatedDepthwiseConvolution(
-          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
-          n_output_rows, n_output_cols, activation, padding_top, padding_left,
-          0, 0,
-          // Function which creates a new (standard) depthwise convolution
-          [](const int n_batches, const int n_input_rows,
-             const int n_input_cols, const int n_channels,
-             const int n_output_rows, const int n_output_cols,
-             const nck::ActivationFunction activation,
-             const unsigned int padding_top, const unsigned int padding_left,
-             const unsigned int padding_bottom,
-             const unsigned int padding_right) -> IDepthwiseConvolution * {
-            return new DepthwiseConvolution<
-                OutputTileRows, OutputTileColumns, KernelRows, KernelColumns,
-                StrideRows, StrideColumns, TIn, TBias, TOut>(
-                n_batches, n_input_rows, n_input_cols, n_channels,
-                n_output_rows, n_output_cols, activation, padding_top,
-                padding_left, padding_bottom, padding_right);
-          }) {}
-
-MEMBERFN()
-::DilatedDepthwiseConvolution(
-    const int n_batches, const int n_input_rows, const int n_input_cols,
-    const int n_channels, const int dilation_factor, const int n_output_rows,
-    const int n_output_cols, nck::ActivationFunction activation,
-    const unsigned int padding_top, const unsigned int padding_left,
-    const unsigned int, // padding_bottom
-    const unsigned int, // padding_right
-    std::function<IDepthwiseConvolution *(
-        int, int, int, int, int, int, nck::ActivationFunction, unsigned int,
-        unsigned int, unsigned int, unsigned int)>
-        subconvfn // Function to create a new convolution
-    )
-    : _dilation_factor(dilation_factor), _n_input_rows(n_input_rows),
-      _n_input_cols(n_input_cols), _n_channels(n_channels),
-      _padding_top(static_cast<int>(padding_top)),
-      _padding_left(static_cast<int>(padding_left)),
-      _n_output_rows(n_output_rows), _n_output_cols(n_output_cols),
-      _convs(_dilation_factor) {
-  // Instantiate the base convolutions
-  for (uint32_t i = 0; i < static_cast<uint32_t>(_dilation_factor); i++) {
-    // Compute properties of this row of base convolutions
-    const int row_top =
-        i * StrideRows - _padding_top; // -ve values are in the padding
-    const int row_pad_top =
-        row_top < 0 ? iceildiv(-row_top, dilation_factor) : 0;
-
-    const int _n_input_rows = iceildiv(n_input_rows - i, dilation_factor);
-    const int _n_output_rows = iceildiv(n_output_rows - i, dilation_factor);
-
-    for (uint32_t j = 0; j < static_cast<uint32_t>(_dilation_factor); j++) {
-      // Compute properties of the base convolution
-      const int col_left =
-          j * StrideColumns - padding_left; // -ve values are in the padding
-      const int col_pad_left =
-          col_left < 0 ? iceildiv(-col_left, dilation_factor) : 0;
-
-      const int _n_input_cols = iceildiv(n_input_cols - j, dilation_factor);
-      const int _n_output_cols = iceildiv(n_output_cols - j, dilation_factor);
-
-      // Create new depthwise convolution engine and include it in the vector
-      // of engines. The new depthwise convolution engine is created by calling
-      // the delegate function we received as an argument.
-      _convs[i].emplace_back(subconvfn(
-          n_batches, _n_input_rows, _n_input_cols, n_channels, _n_output_rows,
-          _n_output_cols, activation,
-          // Note: since we have computed the output tensor size we don't need
-          // to explicitly provide bottom and right padding values to the
-          // depthwise convolution.
-          row_pad_top, col_pad_left, 0, 0));
-    }
-  }
-}
-
-MEMBERFN(void)::set_input(const void *const inptr) {
-  set_input(inptr, _n_channels);
-}
-
-MEMBERFN(void)::set_input(const void *const inptr, const int ldcol) {
-  set_input(inptr, _n_input_cols * ldcol, ldcol);
-}
-
-MEMBERFN(void)
-::set_input(const void *const inptr, const int ldrow, const int ldcol) {
-  set_input(inptr, _n_input_rows * ldrow, ldrow, ldcol);
-}
-
-MEMBERFN(void)
-::set_input(const void *const inptr, const int ldbatch, const int ldrow,
-            const int ldcol) {
-  // Compute dilated strides
-  const int ldrow_dilated = ldrow * _dilation_factor;
-  const int ldcol_dilated = ldcol * _dilation_factor;
-
-  // Pass input parameters on to base convolutions
-  for (uint32_t i = 0; i < static_cast<uint32_t>(_dilation_factor); i++) {
-    const int top_pos =
-        i * StrideRows - _padding_top +
-        ((static_cast<int>(i * StrideRows) < _padding_top)
-             ? iceildiv(_padding_top - i * StrideRows, _dilation_factor) *
-                   _dilation_factor
-             : 0);
-    const TIn *const inptr_i =
-        static_cast<const TIn *>(inptr) + top_pos * ldrow;
-
-    for (uint32_t j = 0; j < static_cast<uint32_t>(_dilation_factor); j++) {
-      int left_pos = j * StrideColumns - _padding_left;
-      while (left_pos < 0)
-        left_pos += _dilation_factor;
-
-      // Modify the pointer to point to the first element of the dilated input
-      // tensor, then set the input for this convolution engine.
-      const void *const inptr_ij = inptr_i + left_pos * ldcol;
-      _convs[i][j]->set_input(inptr_ij, ldbatch, ldrow_dilated, ldcol_dilated);
-    }
-  }
-}
-
-MEMBERFN(void)::set_output(void *const outptr) {
-  set_output(outptr, _n_channels);
-}
-
-MEMBERFN(void)::set_output(void *const outptr, const int ldcol) {
-  set_output(outptr, _n_output_cols * ldcol, ldcol);
-}
-
-MEMBERFN(void)
-::set_output(void *const outptr, const int ldrow, const int ldcol) {
-  set_output(outptr, _n_output_rows * ldrow, ldrow, ldcol);
-}
-
-MEMBERFN(void)
-::set_output(void *const outptr, const int ldbatch, const int ldrow,
-             const int ldcol) {
-  // Compute dilated strides
-  const int ldrow_dilated = ldrow * _dilation_factor;
-  const int ldcol_dilated = ldcol * _dilation_factor;
-
-  // Pass output parameters on to base convolutions
-  for (uint32_t i = 0; i < static_cast<uint32_t>(_dilation_factor); i++) {
-    for (uint32_t j = 0; j < static_cast<uint32_t>(_dilation_factor); j++) {
-      // Modify the pointer to point to the first element of the dilated output
-      // tensor, then set the output for this convolution engine.
-      void *const outptr_ij =
-          static_cast<TOut *>(outptr) + i * ldrow + j * ldcol;
-      _convs[i][j]->set_output(outptr_ij, ldbatch, ldrow_dilated,
-                               ldcol_dilated);
-    }
-  }
-}
-
-MEMBERFN(int)
-::get_output_size(const int dim_size, const unsigned int padding_before,
-                  const unsigned int padding_after, const int dilation_factor) {
-  const int input_size =
-      dim_size + static_cast<int>(padding_before + padding_after);
-  const int window_size = (KernelRows - 1) * dilation_factor + 1;
-  return iceildiv(input_size - window_size + 1, StrideRows);
-}
-
-MEMBERFN(int)
-::output_size(const int dim_size, const unsigned int padding_before,
-              const unsigned int padding_after) const {
-  return get_output_size(dim_size, padding_before, padding_after,
-                         _dilation_factor);
-}
-
-MEMBERFN(size_t)::get_packed_params_size(void) const {
-  return _convs[0][0]->get_packed_params_size();
-}
-
-MEMBERFN(void)::set_packed_params_buffer(void *buffer) {
-  // Set the buffer for all convolution engines
-  for (auto &&row : _convs) {
-    for (auto &&conv : row) {
-      conv->set_packed_params_buffer(buffer);
-    }
-  }
-}
-
-MEMBERFN(void)
-::pack_params(const void *const weights, const void *const biases) const {
-  _convs[0][0]->pack_params(weights, biases);
-}
-
-MEMBERFN(void)
-::pack_params(void *const buffer, const void *const weights,
-              const void *const biases) const {
-  _convs[0][0]->pack_params(buffer, weights, biases);
-}
-
-MEMBERFN(void)
-::pack_params(void *const buffer, const void *const weights,
-              const unsigned int ldrow, const unsigned int ldcol,
-              const void *const biases) const {
-  _convs[0][0]->pack_params(buffer, weights, ldrow, ldcol, biases);
-}
-
-MEMBERFN(size_t)::get_working_space_size(unsigned int nthreads) const {
-  return _convs[0][0]->get_working_space_size(nthreads);
-}
-
-MEMBERFN(void)::set_working_space(void *const ws) {
-  // Use the same working space for all contained depthwise engines.
-  for (auto &&row : _convs) {
-    for (auto &&conv : row) {
-      conv->set_working_space(ws);
-    }
-  }
-}
-
-MEMBERFN(unsigned int)::get_window(void) const {
-  return _convs[0][0]->get_window();
-}
-
-MEMBERFN(void)
-::run(const unsigned int start, const unsigned int stop,
-      const unsigned int threadid) {
-  // Run each contained convolution in turn
-  for (auto &&row : _convs) {
-    for (auto &&conv : row) {
-      conv->run(start, stop, threadid);
-    }
-  }
-}
-
-} // namespace depthwise
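
The removed DilatedDepthwiseConvolution decomposes a convolution with dilation
factor d into d*d independent sub-convolutions: phase (i, j) views every d-th
input row and column, and each sub-convolution is driven with the dilated
strides ldrow * d and ldcol * d computed in set_input()/set_output() above.
A minimal standalone sketch of that pointer arithmetic follows; the helper
name is hypothetical (not library API) and a row-major layout is assumed.

    #include <cstddef>

    // Return the origin of phase (i, j) of a tensor viewed with dilation d.
    // Element (r, c) of the phase lives at row i + r*d, column j + c*d, so
    // the phase itself is addressed with strides d*ldrow and d*ldcol.
    template <typename T>
    const T *phase_origin(const T *base, int i, int j,
                          std::ptrdiff_t ldrow, std::ptrdiff_t ldcol)
    {
        return base + i * ldrow + j * ldcol;
    }
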
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
deleted file mode 100644
index a00a1ef..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
+++ /dev/null
@@ -1,439 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- *          NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include "arm.hpp"
-#include "impl_base.hpp"
-
-#pragma once
-
-using namespace neon_convolution_kernels;
-
-namespace depthwise
-{
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols, StrideRows, StrideCols,
-  float16_t, float16_t, float16_t
->::DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  ActivationFunction activation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : Base(
-      n_batches, n_input_rows, n_input_cols, n_channels, activation,
-      padding_top, padding_left, padding_bottom, padding_right
-    )
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols, StrideRows, StrideCols,
-  float16_t, float16_t, float16_t
->::DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  int n_output_rows, int n_output_cols,
-  ActivationFunction activation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : Base(
-      n_batches, n_input_rows, n_input_cols, n_channels,
-      n_output_rows, n_output_cols, activation,
-      padding_top, padding_left, padding_bottom, padding_right
-    )
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols, StrideRows, StrideCols,
-  float16_t, float16_t, float16_t
->::execute_tile(
-  int n_channels,
-  const void *weights_biases_ptr,
-  const float16_t *input,
-  const unsigned int in_row_stride,
-  const unsigned int in_col_stride,
-  float16_t *output,
-  const unsigned int out_row_stride,
-  const unsigned int out_col_stride
-)
-{
-  // Instantiate pointers
-  const float16_t* __restrict__ inptr_base = input;
-  float16_t* __restrict__ outptr_base = output;
-  const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
-
-  // Perform the depthwise convolution
-  int channels_remaining = n_channels;
-  for (; channels_remaining >= 8; channels_remaining -= 8)
-  {
-    // Load input tile
-    float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
-    for (int i = 0; i < Base::inner_tile_rows; i++)
-    {
-      const float16_t* const inptr_row = inptr_base + i*in_row_stride;
-      for (int j = 0; j < Base::inner_tile_cols; j++)
-      {
-        u[i][j] = vld1q_f16(inptr_row + j*in_col_stride);
-      }
-    }
-    inptr_base += 8;
-
-    // Load weights tile
-    float16x8_t vbias = vld1q_f16(params);
-    params += 8;
-
-    float16x8_t w[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        w[i][j] = vld1q_f16(params);
-        params += 8;
-      }
-    }
-
-    // Perform the convolution
-    float16x8_t v[OutputTileRows][OutputTileCols];
-    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
-    {
-      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
-      {
-        v[out_i][out_j] = vbias;
-
-        // Base co-ordinate
-        const int base_i = out_i * StrideRows;
-        const int base_j = out_j * StrideCols;
-
-        // Fill the accumulator
-        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
-        {
-          const unsigned int i = base_i + in_i;
-          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
-          {
-            const unsigned int j = base_j + in_j;
-
-            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-            v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
-          }
-        }
-
-        // Apply the activation function
-        if (Activation == ActivationFunction::ReLU ||
-            Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
-        }
-        if (Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
-        }
-      }
-    }
-
-    // Store the output tile
-    for (unsigned int i = 0; i < OutputTileRows; i++)
-    {
-      float16_t* const outptr_row = outptr_base + i*out_row_stride;
-      for (unsigned int j = 0; j < OutputTileCols; j++)
-      {
-        vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
-      }
-    }
-    outptr_base += 8;
-  }
-  for (; channels_remaining; channels_remaining--)
-  {
-    // Load input tile
-    float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
-    for (int i = 0; i < Base::inner_tile_rows; i++)
-    {
-      const float16_t* const inptr_row = inptr_base + i*in_row_stride;
-      for (int j = 0; j < Base::inner_tile_cols; j++)
-      {
-        u[i][j] = *(inptr_row + j*in_col_stride);
-      }
-    }
-    inptr_base++;
-
-    // Load weights tile
-    float16_t bias = *(params++);
-    float16_t w[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        w[i][j] = *(params++);
-      }
-    }
-
-    // Perform the convolution
-    float16_t v[OutputTileRows][OutputTileCols];
-    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
-    {
-      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
-      {
-        // Initialise the accumulator with the bias
-        v[out_i][out_j] = bias;
-
-        // Base co-ordinate
-        const int base_i = out_i * StrideRows;
-        const int base_j = out_j * StrideCols;
-
-        // Fill the accumulator
-        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
-        {
-          const unsigned int i = base_i + in_i;
-          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
-          {
-            const int j = base_j + in_j;
-            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-          }
-        }
-
-        // Apply the activation function
-        if (Activation == ActivationFunction::ReLU ||
-            Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
-        }
-        if (Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
-        }
-      }
-    }
-
-    // Store the output tile
-    for (unsigned int i = 0; i < OutputTileRows; i++)
-    {
-      float16_t* const outptr_row = outptr_base + i*out_row_stride;
-      for (unsigned int j = 0; j < OutputTileCols; j++)
-      {
-        *(outptr_row + j*out_col_stride) = v[i][j];
-      }
-    }
-    outptr_base++;
-  }
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols, StrideRows, StrideCols,
-  float16_t, float16_t, float16_t
->::execute_tile(
-  int n_channels,
-  const void *weights_biases_ptr,
-  const float16_t * inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-  float16_t *outptrs[Base::output_tile_rows][Base::output_tile_cols]
-)
-{
-  // Instantiate pointers
-  const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
-  int n = 0;
-
-  // Perform the depthwise convolution
-  int channels_remaining = n_channels;
-  for (; channels_remaining >= 8; channels_remaining -= 8, n += 8)
-  {
-    // Load input tile
-    float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
-    for (int i = 0; i < Base::inner_tile_rows; i++)
-    {
-      for (int j = 0; j < Base::inner_tile_cols; j++)
-      {
-        u[i][j] = vld1q_f16(inptrs[i][j] + n);
-      }
-    }
-
-    // Load weights tile
-    float16x8_t vbias = vld1q_f16(params);
-    params += 8;
-
-    float16x8_t w[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        w[i][j] = vld1q_f16(params);
-        params += 8;
-      }
-    }
-
-    // Perform the convolution
-    float16x8_t v[OutputTileRows][OutputTileCols];
-    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
-    {
-      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
-      {
-        v[out_i][out_j] = vbias;
-
-        // Base co-ordinate
-        const int base_i = out_i * StrideRows;
-        const int base_j = out_j * StrideCols;
-
-        // Fill the accumulator
-        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
-        {
-          const unsigned int i = base_i + in_i;
-          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
-          {
-            const unsigned int j = base_j + in_j;
-
-            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-            v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
-          }
-        }
-
-        // Apply the activation function
-        if (Activation == ActivationFunction::ReLU ||
-            Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
-        }
-        if (Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
-        }
-      }
-    }
-
-    // Store the output tile
-    for (unsigned int i = 0; i < OutputTileRows; i++)
-    {
-      for (unsigned int j = 0; j < OutputTileCols; j++)
-      {
-        vst1q_f16(outptrs[i][j] + n, v[i][j]);
-      }
-    }
-  }
-  for (; channels_remaining; channels_remaining--, n++)
-  {
-    // Load input tile
-    float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
-    for (int i = 0; i < Base::inner_tile_rows; i++)
-    {
-      for (int j = 0; j < Base::inner_tile_cols; j++)
-      {
-        u[i][j] = *(inptrs[i][j] + n);
-      }
-    }
-
-    // Load weights tile
-    float16_t bias = *(params++);
-    float16_t w[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        w[i][j] = *(params++);
-      }
-    }
-
-    // Perform the convolution
-    float16_t v[OutputTileRows][OutputTileCols];
-    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
-    {
-      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
-      {
-        // Initialise the accumulator with the bias
-        v[out_i][out_j] = bias;
-
-        // Base co-ordinate
-        const int base_i = out_i * StrideRows;
-        const int base_j = out_j * StrideCols;
-
-        // Fill the accumulator
-        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
-        {
-          const unsigned int i = base_i + in_i;
-          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
-          {
-            const int j = base_j + in_j;
-            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-          }
-        }
-
-        // Apply the activation function
-        if (Activation == ActivationFunction::ReLU ||
-            Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
-        }
-        if (Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
-        }
-      }
-    }
-
-    // Store the output tile
-    for (unsigned int i = 0; i < OutputTileRows; i++)
-    {
-      for (unsigned int j = 0; j < OutputTileCols; j++)
-      {
-        *(outptrs[i][j] + n) = v[i][j];
-      }
-    }
-  }
-}
-
-}  // namespace depthwise
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
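
The deleted fp16 kernels all follow the same shape: a vectorised body that
consumes eight fp16 channels per iteration with NEON intrinsics, then a
scalar tail for the remainder. A minimal sketch of that structure, using a
hypothetical helper (a per-channel bias add, not a function from the
library) and guarded by the same fp16 feature macro:

    #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    #include <arm_neon.h>

    void add_bias_f16(const float16_t *bias, float16_t *out, int n_channels)
    {
        int c = 0;
        // Vector body: 8 fp16 lanes per 128-bit NEON register.
        for (; c + 8 <= n_channels; c += 8)
        {
            vst1q_f16(out + c, vaddq_f16(vld1q_f16(out + c), vld1q_f16(bias + c)));
        }
        // Scalar tail for the remaining channels.
        for (; c < n_channels; c++)
        {
            out[c] = out[c] + bias[c];
        }
    }
    #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
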
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
deleted file mode 100644
index b0d8126..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ /dev/null
@@ -1,438 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- *          NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include "arm.hpp"
-#include "impl_base.hpp"
-
-#pragma once
-
-using namespace neon_convolution_kernels;
-
-namespace depthwise
-{
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols, StrideRows, StrideCols,
-  float, float, float
->::DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  ActivationFunction activation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : Base(
-      n_batches, n_input_rows, n_input_cols, n_channels, activation,
-      padding_top, padding_left, padding_bottom, padding_right
-    )
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols, StrideRows, StrideCols,
-  float, float, float
->::DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  int n_output_rows, int n_output_cols,
-  ActivationFunction activation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : Base(
-      n_batches, n_input_rows, n_input_cols, n_channels,
-      n_output_rows, n_output_cols, activation,
-      padding_top, padding_left, padding_bottom, padding_right
-    )
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols, StrideRows, StrideCols,
-  float, float, float
->::execute_tile(
-  int n_channels,
-  const void *weights_biases_ptr,
-  const float *input,
-  const unsigned int in_row_stride,
-  const unsigned int in_col_stride,
-  float *output,
-  const unsigned int out_row_stride,
-  const unsigned int out_col_stride
-)
-{
-  // Instantiate pointers
-  const float* __restrict__ inptr_base = input;
-  float* __restrict__ outptr_base = output;
-  const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
-
-  // Perform the depthwise convolution
-  int channels_remaining = n_channels;
-  for (; channels_remaining >= 4; channels_remaining -= 4)
-  {
-    // Load input tile
-    float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
-    for (int i = 0; i < Base::inner_tile_rows; i++)
-    {
-      const float* const inptr_row = inptr_base + i*in_row_stride;
-      for (int j = 0; j < Base::inner_tile_cols; j++)
-      {
-        u[i][j] = vld1q_f32(inptr_row + j*in_col_stride);
-      }
-    }
-    inptr_base += 4;
-
-    // Load weights tile
-    float32x4_t vbias = vld1q_f32(params);
-    params += 4;
-
-    float32x4_t w[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        w[i][j] = vld1q_f32(params);
-        params += 4;
-      }
-    }
-
-    // Perform the convolution
-    float32x4_t v[OutputTileRows][OutputTileCols];
-    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
-    {
-      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
-      {
-        v[out_i][out_j] = vbias;
-
-        // Base co-ordinate
-        const int base_i = out_i * StrideRows;
-        const int base_j = out_j * StrideCols;
-
-        // Fill the accumulator
-        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
-        {
-          const unsigned int i = base_i + in_i;
-          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
-          {
-            const unsigned int j = base_j + in_j;
-
-            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-            v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
-          }
-        }
-
-        // Apply the activation function
-        if (Activation == ActivationFunction::ReLU ||
-            Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
-        }
-        if (Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
-        }
-      }
-    }
-
-    // Store the output tile
-    for (unsigned int i = 0; i < OutputTileRows; i++)
-    {
-      float* const outptr_row = outptr_base + i*out_row_stride;
-      for (unsigned int j = 0; j < OutputTileCols; j++)
-      {
-        vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
-      }
-    }
-    outptr_base += 4;
-  }
-  for (; channels_remaining; channels_remaining--)
-  {
-    // Load input tile
-    float u[Base::inner_tile_rows][Base::inner_tile_cols];
-    for (int i = 0; i < Base::inner_tile_rows; i++)
-    {
-      const float* const inptr_row = inptr_base + i*in_row_stride;
-      for (int j = 0; j < Base::inner_tile_cols; j++)
-      {
-        u[i][j] = *(inptr_row + j*in_col_stride);
-      }
-    }
-    inptr_base++;
-
-    // Load weights tile
-    float bias = *(params++);
-    float w[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        w[i][j] = *(params++);
-      }
-    }
-
-    // Perform the convolution
-    float v[OutputTileRows][OutputTileCols];
-    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
-    {
-      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
-      {
-        // Initialise the accumulator with the bias
-        v[out_i][out_j] = bias;
-
-        // Base co-ordinate
-        const int base_i = out_i * StrideRows;
-        const int base_j = out_j * StrideCols;
-
-        // Fill the accumulator
-        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
-        {
-          const unsigned int i = base_i + in_i;
-          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
-          {
-            const int j = base_j + in_j;
-            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-          }
-        }
-
-        // Apply the activation function
-        if (Activation == ActivationFunction::ReLU ||
-            Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
-        }
-        if (Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
-        }
-      }
-    }
-
-    // Store the output tile
-    for (unsigned int i = 0; i < OutputTileRows; i++)
-    {
-      float* const outptr_row = outptr_base + i*out_row_stride;
-      for (unsigned int j = 0; j < OutputTileCols; j++)
-      {
-        *(outptr_row + j*out_col_stride) = v[i][j];
-      }
-    }
-    outptr_base++;
-  }
-}
-
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols, StrideRows, StrideCols,
-  float, float, float
->::execute_tile(
-  int n_channels,
-  const void *weights_biases_ptr,
-  const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-  float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
-)
-{
-  const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
-
-  // Perform the depthwise convolution
-  int channels_remaining = n_channels;
-  int n = 0;
-  for (; channels_remaining >= 4; channels_remaining -= 4, n += 4)
-  {
-    // Load input tile
-    float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
-    for (int i = 0; i < Base::inner_tile_rows; i++)
-    {
-      for (int j = 0; j < Base::inner_tile_cols; j++)
-      {
-        u[i][j] = vld1q_f32(inptrs[i][j] + n);
-      }
-    }
-
-    // Load weights tile
-    float32x4_t vbias = vld1q_f32(params);
-    params += 4;
-
-    float32x4_t w[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        w[i][j] = vld1q_f32(params);
-        params += 4;
-      }
-    }
-
-    // Perform the convolution
-    float32x4_t v[OutputTileRows][OutputTileCols];
-    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
-    {
-      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
-      {
-        v[out_i][out_j] = vbias;
-
-        // Base co-ordinate
-        const int base_i = out_i * StrideRows;
-        const int base_j = out_j * StrideCols;
-
-        // Fill the accumulator
-        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
-        {
-          const unsigned int i = base_i + in_i;
-          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
-          {
-            const unsigned int j = base_j + in_j;
-
-            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-            v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
-          }
-        }
-
-        // Apply the activation function
-        if (Activation == ActivationFunction::ReLU ||
-            Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
-        }
-        if (Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
-        }
-      }
-    }
-
-    // Store the output tile
-    for (unsigned int i = 0; i < OutputTileRows; i++)
-    {
-      for (unsigned int j = 0; j < OutputTileCols; j++)
-      {
-        vst1q_f32(outptrs[i][j] + n, v[i][j]);
-      }
-    }
-  }
-  for (; channels_remaining; channels_remaining--, n++)
-  {
-    // Load input tile
-    float u[Base::inner_tile_rows][Base::inner_tile_cols];
-    for (int i = 0; i < Base::inner_tile_rows; i++)
-    {
-      for (int j = 0; j < Base::inner_tile_cols; j++)
-      {
-        u[i][j] = *(inptrs[i][j] + n);
-      }
-    }
-
-    // Load weights tile
-    float bias = *(params++);
-    float w[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        w[i][j] = *(params++);
-      }
-    }
-
-    // Perform the convolution
-    float v[OutputTileRows][OutputTileCols];
-    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
-    {
-      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
-      {
-        // Initialise the accumulator with the bias
-        v[out_i][out_j] = bias;
-
-        // Base co-ordinate
-        const int base_i = out_i * StrideRows;
-        const int base_j = out_j * StrideCols;
-
-        // Fill the accumulator
-        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
-        {
-          const unsigned int i = base_i + in_i;
-          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
-          {
-            const int j = base_j + in_j;
-            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-          }
-        }
-
-        // Apply the activation function
-        if (Activation == ActivationFunction::ReLU ||
-            Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
-        }
-        if (Activation == ActivationFunction::ReLU6)
-        {
-          v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
-        }
-      }
-    }
-
-    // Store the output tile
-    for (unsigned int i = 0; i < OutputTileRows; i++)
-    {
-      for (unsigned int j = 0; j < OutputTileCols; j++)
-      {
-        *(outptrs[i][j] + n) = v[i][j];
-      }
-    }
-  }
-}
-
-}  // namespace depthwise
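
Activation handling in the deleted fp32 (and fp16) kernels compiles down to
at most two vector operations, a max against zero for ReLU/ReLU6 and a min
against six for ReLU6, with the branches resolved at compile time because
Activation is a template parameter. A standalone sketch of that idiom, with
hypothetical boolean template flags standing in for the library's
ActivationFunction enum:

    #include <arm_neon.h>

    template <bool IsRelu, bool IsRelu6>
    float32x4_t apply_activation(float32x4_t v)
    {
        // Both conditions are compile-time constants, so dead branches are
        // discarded and an unactivated kernel pays nothing for this helper.
        if (IsRelu || IsRelu6)
        {
            v = vmaxq_f32(v, vdupq_n_f32(0.0f));  // clamp below at 0
        }
        if (IsRelu6)
        {
            v = vminq_f32(v, vdupq_n_f32(6.0f));  // clamp above at 6
        }
        return v;
    }
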
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
deleted file mode 100644
index e8b4c7b..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- *          NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include <limits>
-
-#include "arm.hpp"
-#include "impl_base.hpp"
-#include "depthwise_quantized.hpp"
-
-namespace depthwise
-{
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  const ActivationFunction activation,
-  const QAsymm8Params& weight_quantisation,
-  const QAsymm8Params& input_quantisation,
-  const QAsymm8Params& output_quantisation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : QAsymm8DepthwiseConvolution(
-    n_batches, n_input_rows, n_input_cols, n_channels,
-    activation, weight_quantisation, input_quantisation, output_quantisation,
-    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
-    padding_top, padding_left, padding_bottom, padding_right
-  )
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  int n_output_rows, int n_output_cols,
-  const ActivationFunction activation,
-  const QAsymm8Params& weight_quantisation,
-  const QAsymm8Params& input_quantisation,
-  const QAsymm8Params& output_quantisation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : QAsymm8DepthwiseConvolution(
-    n_batches, n_input_rows, n_input_cols, n_channels,
-    n_output_rows, n_output_cols,
-    activation, weight_quantisation, input_quantisation, output_quantisation,
-    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
-    padding_top, padding_left, padding_bottom, padding_right
-  )
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  const ActivationFunction activation,
-  const QAsymm8Params& weight_quantisation,
-  const QAsymm8Params& input_quantisation,
-  const QAsymm8Params& output_quantisation,
-  const QAsymm8RescaleParams& rescale_params,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : Base(
-    n_batches, n_input_rows, n_input_cols, n_channels, activation,
-    padding_top, padding_left, padding_bottom, padding_right
-  ),
-  _weights_quant(weight_quantisation),
-  _inputs_quant(input_quantisation),
-  _output_quant(output_quantisation),
-  rescale_parameters(rescale_params)
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  int n_output_rows, int n_output_cols,
-  const ActivationFunction activation,
-  const QAsymm8Params& weight_quantisation,
-  const QAsymm8Params& input_quantisation,
-  const QAsymm8Params& output_quantisation,
-  const QAsymm8RescaleParams& rescale_params,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : Base(
-    n_batches, n_input_rows, n_input_cols, n_channels,
-    n_output_rows, n_output_cols, activation,
-    padding_top, padding_left, padding_bottom, padding_right
-  ),
-  _weights_quant(weight_quantisation),
-  _inputs_quant(input_quantisation),
-  _output_quant(output_quantisation),
-  rescale_parameters(rescale_params)
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-uint8_t QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_input_padding_value(void) const
-{
-  return _inputs_quant.offset;
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-void QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_pack_params(
-  void * const buffer,
-  const void * const weights,
-  const unsigned int weight_row_stride,
-  const unsigned int weight_col_stride,
-  const void * const biases
-) const
-{
-  const uint8_t *wptr = static_cast<const uint8_t *>(weights);
-  const int32_t *bptr = static_cast<const int32_t *>(biases);
-  uint8_t *outptr = static_cast<uint8_t *>(buffer);
-
-  // We set the vector length to use 64-bit (double-word) registers on both
-  // AArch64 and AArch32. NOTE: for SVE set this to half the vector length.
-  unsigned int veclen = 8;
-
-  // While there are channels left to process, pack a vector length of them
-  // at a time, reducing the vector length used as the number of remaining
-  // channels decreases.
-  for (
-    unsigned int n_channels = this->n_channels(); n_channels;
-    n_channels -= veclen,
-    outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
-  )
-  {
-    // NOTE Ignore this section if using SVE, the vector length remains the
-    // same and we just don't fill a full register for the tail.
-    while (n_channels < veclen)
-    {
-      // Reduce the vector length to either 8 or 1 (scalar)
-      // TODO Support more vector lengths in `execute_tile`.
-      veclen = (veclen == 16) ? 8 : 1;
-    }
-
-    // Get pointers to bias and weight portions of the output structure.
-    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
-    uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);
-
-    // Copy a vector length of elements
-    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
-    {
-      const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
-      out_bptr[n] = bias;
-
-      for (unsigned int i = 0; i < KernelRows; i++)
-      {
-        uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
-        for (unsigned int j = 0; j < KernelCols; j++)
-        {
-          uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
-          row_outptr[j*veclen + n] = w;
-        }
-      }
-      wptr++;
-    }
-  }
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename FInput, typename FOutput
->
-static inline void tilefn(
-  int n_channels,
-  const void* packed_params,
-  FInput &get_input_ptr,
-  FOutput &get_output_ptr,
-  const int32_t clamp_max,
-  const int32_t clamp_min,
-  const uint8_t input_offset,
-  const uint8_t weight_offset,
-  const uint8_t output_offset,
-  const int32_t requant_multiplier,
-  const int32_t requant_shift
-)
-{
-  constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
-  constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
-
-  // Offset into channels
-  int channel = 0;
-
-  // Byte type pointer to weights and biases
-  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
-
-  for (; n_channels >= 8; n_channels -= 8, channel += 8)
-  {
-    const int32x4_t biases[2] = {
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
-    };
-    wbptr += 8*sizeof(int32_t);
-
-    int16x8_t weights[KernelRows][KernelCols];
-    const uint8x8_t woffset = vdup_n_u8(weight_offset);
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        const uint8x8_t w = vld1_u8(wbptr);
-        weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset));
-        wbptr += 8;
-      }
-    }
-
-    int16x8_t inputs[InnerTileRows][InnerTileCols];
-    const uint8x8_t ioffset = vdup_n_u8(input_offset);
-    for (unsigned int i = 0; i < InnerTileRows; i++)
-    {
-      for (unsigned int j = 0; j < InnerTileCols; j++)
-      {
-        const auto x = vld1_u8(get_input_ptr(i, j, channel));
-        inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
-      }
-    }
-
-    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
-    {
-      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
-      {
-        int32x4_t acc_a = biases[0], acc_b = biases[1];
-
-        for (unsigned int wi = 0; wi < KernelRows; wi++)
-        {
-          for (unsigned int wj = 0; wj < KernelCols; wj++)
-          {
-            const auto w = weights[wi][wj];
-            const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
-#ifndef __aarch64__
-            acc_a = vmlal_s16(acc_a, vget_low_s16(w), vget_low_s16(x));
-            acc_b = vmlal_s16(acc_b, vget_high_s16(w), vget_high_s16(x));
-#else
-            asm("smlal  %[acc_a].4s, %[w].4h, %[x].4h\n"
-                "smlal2 %[acc_b].4s, %[w].8h, %[x].8h\n"
-                : [acc_a] "+w"(acc_a), [acc_b] "+w"(acc_b)
-                : [w] "w"(w), [x] "w"(x));
-#endif // __aarch64__
-          }
-        }
-
-        int32x4_t final_accs[2];
-        for (unsigned int i = 0; i < 2; i++)
-        {
-          const int32x4_t y = rounding_divide_by_exp2(
-              saturating_doubling_high_mul((i == 0 ? acc_a : acc_b), requant_multiplier),
-              requant_shift);
-          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
-          final_accs[i] = vaddq_s32(y, offset);
-          final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
-          final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
-        }
-
-#ifndef __aarch64__
-        const int16x8x2_t zelems = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
-                                             vreinterpretq_s16_s32(final_accs[1]));
-        const int8x16_t elems = vreinterpretq_s8_s16(zelems.val[0]);
-
-        const int8x16x2_t zoutput = vuzpq_s8(elems, elems);
-        const uint8x8_t output =
-                vget_low_u8(vreinterpretq_u8_s8(zoutput.val[0]));
-        vst1_u8(get_output_ptr(oi, oj, channel), output);
-#else
-        const int8x16_t elems = vreinterpretq_s8_s16(
-            vuzp1q_s16(vreinterpretq_s16_s32(final_accs[0]),
-                       vreinterpretq_s16_s32(final_accs[1])));
-        const uint8x8_t output =
-            vget_low_u8(vreinterpretq_u8_s8(vuzp1q_s8(elems, elems)));
-        vst1_u8(get_output_ptr(oi, oj, channel), output);
-#endif // __aarch64__
-      }
-    }
-  }
-  for (; n_channels; n_channels--, channel++)
-  {
-    // Load bias
-    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
-    wbptr += sizeof(int32_t);
-
-    // Load weights
-    int16_t weights[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        weights[i][j] = *(wbptr++) - weight_offset;
-      }
-    }
-
-    // Load the input activations
-    int16_t inputs[InnerTileRows][InnerTileCols];
-    for (unsigned int i = 0; i < InnerTileRows; i++)
-    {
-      for (unsigned int j = 0; j < InnerTileCols; j++)
-      {
-        inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
-      }
-    }
-
-    // Perform the convolution
-    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
-    {
-      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
-      {
-        int32_t acc = bias;
-
-        for (unsigned int wi = 0; wi < KernelRows; wi++)
-        {
-          for (unsigned int wj = 0; wj < KernelCols; wj++)
-          {
-            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
-            acc += w * x;
-          }
-        }
-
-        // Requantize
-        acc = rounding_divide_by_exp2(
-            saturating_doubling_high_mul(acc, requant_multiplier),
-            requant_shift);
-        acc += output_offset;
-        acc = std::max(acc, clamp_min);
-        acc = std::min(acc, clamp_max);
-        uint8_t output = static_cast<uint8_t>(acc);
-        *(get_output_ptr(oi, oj, channel)) = output;
-      }
-    }
-  }
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename FInput, typename FOutput
->
-static inline void execute_tilefn(
-  int n_channels,
-  const void* packed_params,
-  const nck::ActivationFunction actfn,
-  FInput &get_input_ptr,
-  FOutput &get_output_ptr,
-  const QAsymm8Params &input_quant,
-  const QAsymm8Params &weight_quant,
-  const QAsymm8Params &output_quant,
-  const QAsymm8RescaleParams &requant
-) {
-  // Compute min/max clamp values
-  int32_t clamp_min = std::numeric_limits<uint8_t>::min();
-  int32_t clamp_max = std::numeric_limits<uint8_t>::max();
-
-  if (actfn == nck::ActivationFunction::ReLU ||
-      actfn == nck::ActivationFunction::ReLU6) {
-    const int32_t bottom_rail = output_quant.offset;
-    clamp_min = std::max(clamp_min, bottom_rail);
-  }
-
-  if (actfn == nck::ActivationFunction::ReLU6) {
-    const int32_t top_rail = output_quant.quantize(6.0f);
-    clamp_max = std::min(clamp_max, top_rail);
-  }
-
-  // Call the tile execution method
-  tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
-         StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr,
-                     clamp_max, clamp_min, input_quant.offset,
-                     weight_quant.offset, output_quant.offset,
-                     requant.multiplier, requant.shift);
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <nck::ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptr,
-  unsigned int in_row_stride,
-  unsigned int in_col_stride,
-  uint8_t* outptr,
-  unsigned int out_row_stride,
-  unsigned int out_col_stride
-) {
-  // Construct methods to get pointers
-  const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
-      const int i, const int j, const int channel) {
-    return inptr + i * in_row_stride + j * in_col_stride + channel;
-  };
-
-  const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
-      const int i, const int j, const int channel) {
-    return outptr + i * out_row_stride + j * out_col_stride + channel;
-  };
-
-  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                 StrideRows, StrideCols>(
-      n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
-      _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <nck::ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-) {
-  // Construct methods to get pointers
-  const auto get_input_ptr = [inptrs](const int i, const int j,
-                                      const int channel) {
-    return inptrs[i][j] + channel;
-  };
-
-  const auto get_output_ptr = [outptrs](const int i, const int j,
-                                        const int channel) {
-    return outptrs[i][j] + channel;
-  };
-
-  // Call the tile execution method
-  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                 StrideRows, StrideCols>(
-      n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
-      _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
-}
-
-}  // namespace depthwise
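
The requantisation step in tilefn above, rounding_divide_by_exp2(
saturating_doubling_high_mul(acc, multiplier), shift), is the usual
fixed-point rescale: the real-valued factor input_scale * weight_scale /
output_scale is represented as a Q31 multiplier plus a right shift, with the
output offset and clamping applied afterwards in the kernel. A plain scalar
rendering of that arithmetic is sketched below, assuming a non-negative
shift; these are standalone illustrative helpers, not the library's actual
implementations (the vector path maps onto vqrdmulhq_s32 and rounding
shifts).

    #include <cstdint>
    #include <limits>

    // Saturating rounding doubling high multiply: the upper 32 bits of
    // 2*a*b with round-to-nearest, saturating the single overflow case
    // (a == b == INT32_MIN).
    inline int32_t sat_doubling_high_mul(int32_t a, int32_t b)
    {
        if (a == b && a == std::numeric_limits<int32_t>::min())
        {
            return std::numeric_limits<int32_t>::max();
        }
        const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
        const int64_t nudge = ab >= 0 ? (1ll << 30) : 1 - (1ll << 30);
        return static_cast<int32_t>((ab + nudge) / (1ll << 31));
    }

    // Round-to-nearest divide by 2^exponent.
    inline int32_t rounding_divide_by_exp2_scalar(int32_t x, int exponent)
    {
        const int32_t mask      = (1 << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    // Rescale an int32 accumulator into the output domain, prior to the
    // offset addition and min/max clamping performed by the kernel.
    inline int32_t requantize(int32_t acc, int32_t multiplier, int32_t shift)
    {
        return rounding_divide_by_exp2_scalar(
            sat_doubling_high_mul(acc, multiplier), shift);
    }
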
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
deleted file mode 100644
index 68e20d9..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
+++ /dev/null
@@ -1,457 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- *          NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include <limits>
-
-#include "arm.hpp"
-#include "impl_base.hpp"
-#include "depthwise_quantized.hpp"
-
-#pragma once
-
-namespace {
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename FInput, typename FOutput
->
-static inline void tilefn_hybrid(
-  int n_channels,
-  const void* packed_params,
-  FInput &get_input_ptr,
-  FOutput &get_output_ptr,
-  int32_t clamp_min,
-  int32_t clamp_max,
-  uint8_t input_offset,
-  uint8_t output_offset
-)
-{
-  constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
-  constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
-
-  // Offset into channels
-  int channel = 0;
-
-  // Byte type pointer to weights and biases
-  const int8_t *wbptr = static_cast<const int8_t *>(packed_params);
-
-  for (; n_channels >= 8; n_channels -= 8, channel += 8)
-  {
-    const int32x4_t biases[2] = {
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
-    };
-    const int32x4_t multipliers[2] = {
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12),
-    };
-    const int32x4_t shifts[2] = {
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 16),
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 20),
-    };
-    wbptr += 24*sizeof(int32_t);
-
-    int16x8_t weights[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        const auto w = vld1_s8(wbptr);
-        weights[i][j] = reinterpret_cast<int16x8_t>(vmovl_s8(w));
-        wbptr += 8;
-      }
-    }
-
-    int16x8_t inputs[InnerTileRows][InnerTileCols];
-    const uint8x8_t ioffset = vdup_n_u8(input_offset);
-    for (unsigned int i = 0; i < InnerTileRows; i++)
-    {
-      for (unsigned int j = 0; j < InnerTileCols; j++)
-      {
-        const auto x = vld1_u8(get_input_ptr(i, j, channel));
-        inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
-      }
-    }
-
-    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
-    {
-      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
-      {
-        int32x4_t accs[2];
-        for (unsigned int i = 0; i < 2; i++)
-        {
-          accs[i] = biases[i];
-        }
-
-        for (unsigned int wi = 0; wi < KernelRows; wi++)
-        {
-          for (unsigned int wj = 0; wj < KernelCols; wj++)
-          {
-            const auto w = weights[wi][wj];
-            const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
-            accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
-            accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
-          }
-        }
-
-        int32x4_t final_accs[2];
-        for (unsigned int i = 0; i < 2; i++)
-        {
-          const int32x4_t y = rounding_divide_by_exp2(
-              saturating_doubling_high_mul(accs[i], multipliers[i]),
-              shifts[i]);
-          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
-          final_accs[i] = vaddq_s32(y, offset);
-          final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
-          final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
-        }
-
-        const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
-                                         vreinterpretq_s16_s32(final_accs[1]));
-        const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
-        const uint8x8_t output =
-                    vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));
-
-        vst1_u8(get_output_ptr(oi, oj, channel), output);
-      }
-    }
-  }
-
-  for (; n_channels; n_channels--, channel++)
-  {
-    // Load bias
-    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
-    const int32_t multiplier = *reinterpret_cast<const int32_t *>(wbptr + sizeof(int32_t));
-    const int32_t shift = *reinterpret_cast<const int32_t *>(wbptr + 2*sizeof(int32_t));
-
-    wbptr += 3*sizeof(int32_t);
-
-    // Load weights
-    int16_t weights[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        weights[i][j] = *(wbptr++);
-      }
-    }
-
-    // Load the input activations
-    int16_t inputs[InnerTileRows][InnerTileCols];
-    for (unsigned int i = 0; i < InnerTileRows; i++)
-    {
-      for (unsigned int j = 0; j < InnerTileCols; j++)
-      {
-        inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
-      }
-    }
-
-    // Perform the convolution
-    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
-    {
-      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
-      {
-        int32_t acc = bias;
-
-        for (unsigned int wi = 0; wi < KernelRows; wi++)
-        {
-          for (unsigned int wj = 0; wj < KernelCols; wj++)
-          {
-            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
-            acc += w * x;
-          }
-        }
-
-        // Requantize
-        acc = rounding_divide_by_exp2(
-            saturating_doubling_high_mul(acc, multiplier),
-            -shift);
-        acc += output_offset;
-        acc = std::max(acc, clamp_min);
-        acc = std::min(acc, clamp_max);
-        uint8_t output = static_cast<uint8_t>(acc);
-        *(get_output_ptr(oi, oj, channel)) = output;
-      }
-    }
-  }
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename FInput, typename FOutput
->
-static inline void execute_tilefn_hybrid(
-  int n_channels,
-  const void* packed_params,
-  const ActivationFunction actfn,
-  const qasymm8::QAsymm8Params &input_quant,
-  const qasymm8::QAsymm8Params &output_quant,
-  FInput &get_input_ptr,
-  FOutput &get_output_ptr) {
-
-  // Compute min/max clamp values
-  int32_t clamp_min = std::numeric_limits<uint8_t>::min();
-  int32_t clamp_max = std::numeric_limits<uint8_t>::max();
-
-  if (actfn == ActivationFunction::ReLU) {
-    clamp_min = output_quant.offset;
-  }
-
-  // Disabling Relu6 for now
-  if (actfn == ActivationFunction::ReLU6) {
-    const int32_t top_rail = output_quant.quantize(6.0f);
-    clamp_max = std::min(clamp_max, top_rail);
-  }
-
-  // Call the tile execution method
-  tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
-         StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr, clamp_min, clamp_max, input_quant.offset, output_quant.offset);
-}
-}
-
-
-
-namespace depthwise {
-using namespace qsymm8;
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QSymm8HybridPerChannelDepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  const ActivationFunction activation,
-  const QSymm8PerChannelParams& weight_quantisation,
-  const qasymm8::QAsymm8Params& input_quantisation,
-  const qasymm8::QAsymm8Params& output_quantisation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : QSymm8HybridPerChannelDepthwiseConvolution(
-    n_batches, n_input_rows, n_input_cols, n_channels,
-    activation, weight_quantisation, input_quantisation, output_quantisation,
-    QSymm8PerChannelRescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
-    padding_top, padding_left, padding_bottom, padding_right
-  )
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QSymm8HybridPerChannelDepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  const ActivationFunction activation,
-  const QSymm8PerChannelParams& weight_quantisation,
-  const qasymm8::QAsymm8Params& input_quantisation,
-  const qasymm8::QAsymm8Params& output_quantisation,
-  const QSymm8PerChannelRescaleParams& rescale_params,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : Base(
-      n_batches, n_input_rows, n_input_cols, n_channels, activation,
-      padding_top, padding_left, padding_bottom, padding_right
-  ),
-  _weights_quant(weight_quantisation),
-  _input_quant(input_quantisation),
-  _output_quant(output_quantisation),
-  _rescale_parameters(rescale_params)
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-uint8_t QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_input_padding_value(void) const
-{
-  return _input_quant.offset;
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-void QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_pack_params(
-  void * const buffer,
-  const void * const weights,
-  const unsigned int weight_row_stride,
-  const unsigned int weight_col_stride,
-  const void * const biases
-) const
-{
-  const int8_t *wptr = static_cast<const int8_t *>(weights);
-  const int32_t *bptr = static_cast<const int32_t *>(biases);
-  const int32_t *mptr = static_cast<const int32_t *>(_rescale_parameters.multipliers.data());
-  const int32_t *sptr = static_cast<const int32_t *>(_rescale_parameters.shifts.data());
-  int8_t *outptr = static_cast<int8_t *>(buffer);
-
-  // We set the vector length to use doubles on both Aarch64 and Aarch32.  NOTE
-  // For SVE set this to half the vector length.
-  unsigned int veclen = 8;
-
-  // While there are channels left to process, pack a vector length of them at
-  // a time and reduce the size of vector used as the size of the tensor
-  // decreases.
-  for (
-    unsigned int n_channels = this->n_channels(); n_channels;
-    n_channels -= veclen,
-    outptr += veclen*(3*sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
-  )
-  {
-    // NOTE Ignore this section if using SVE, the vector length remains the
-    // same and we just don't fill a full register for the tail.
-    while (n_channels < veclen)
-    {
-      // Reduce the vector length to either 8 or 1 (scalar)
-      // TODO Support more vector lengths in `execute_tile`.
-      veclen = (veclen == 16) ? 8 : 1;
-    }
-
-    // Get pointers to bias and weight portions of the output structure.
-    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
-    int32_t *out_mptr = reinterpret_cast<int32_t *>(outptr + veclen*sizeof(int32_t));
-    int32_t *out_sptr = reinterpret_cast<int32_t *>(outptr + 2*veclen*sizeof(int32_t));
-    int8_t  *out_wptr = outptr + 3*veclen*sizeof(int32_t);
-
-    // Copy a vector length of elements
-    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
-    {
-      const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
-      const int32_t multiplier = (mptr != nullptr) ? *(mptr++) : 0;
-      const int32_t shift = (sptr != nullptr) ? *(sptr++) : 0;
-
-      out_bptr[n] = bias;
-      out_mptr[n] = multiplier;
-      out_sptr[n] = -shift;
-
-      for (unsigned int i = 0; i < KernelRows; i++)
-      {
-        int8_t *row_outptr = out_wptr + i*KernelCols*veclen;
-        for (unsigned int j = 0; j < KernelCols; j++)
-        {
-          int8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
-          row_outptr[j*veclen + n] = w;
-        }
-      }
-      wptr++;
-    }
-  }
-}
-
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptr,
-  unsigned int in_row_stride,
-  unsigned int in_col_stride,
-  uint8_t* outptr,
-  unsigned int out_row_stride,
-  unsigned int out_col_stride
-) {
-
-  // Construct methods to get pointers
-  const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
-      const int i, const int j, const int channel) {
-    return inptr + i * in_row_stride + j * in_col_stride + channel;
-  };
-
-  const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
-      const int i, const int j, const int channel) {
-    return outptr + i * out_row_stride + j * out_col_stride + channel;
-  };
-
-  execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                 StrideRows, StrideCols>(
-      n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr);
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-) {
-  // Construct methods to get pointers
-  const auto get_input_ptr = [inptrs](const int i, const int j,
-                                      const int channel) {
-    return inptrs[i][j] + channel;
-  };
-
-  const auto get_output_ptr = [outptrs](const int i, const int j,
-                                        const int channel) {
-    return outptrs[i][j] + channel;
-  };
-
-  // Call the tile execution method
-  execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                 StrideRows, StrideCols>(
-      n_channels, packed_params, Activation,  _input_quant, _output_quant, get_input_ptr, get_output_ptr);
-}
-
-} // namespace depthwise
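
The arithmetic at the core of the deleted per-channel kernels, and of the assembly kernels that replace them, is gemmlowp-style fixed-point requantisation: multiply the int32 accumulator by a Q0.31 multiplier with saturating-doubling-high semantics, divide by a power of two with rounding, add the output offset, then clamp. A scalar reference sketch (illustrative code mirroring the helper names used above, not the library implementation):

    #include <algorithm>
    #include <cstdint>

    // (a * b * 2) keeping the rounded high 32 bits, saturating the single
    // overflow case a == b == INT32_MIN.
    inline int32_t saturating_doubling_high_mul(int32_t a, int32_t b)
    {
        if(a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
        const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
        const int64_t nudge = (ab >= 0) ? (1 << 30) : (1 - (1 << 30));
        return static_cast<int32_t>((ab + nudge) / (int64_t(1) << 31));
    }

    // x / 2^exponent with round-to-nearest; exponent expected in [0, 31).
    inline int32_t rounding_divide_by_exp2(int32_t x, int exponent)
    {
        const int32_t mask      = (int32_t(1) << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
        return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
    }

    // The requantisation step of the scalar tail loop above (note the kernels
    // store shifts negated, hence the -shift at the call site):
    inline uint8_t requantize(int32_t acc, int32_t multiplier, int exponent,
                              int32_t output_offset, int32_t clamp_min, int32_t clamp_max)
    {
        int32_t y = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, multiplier), exponent);
        y += output_offset;
        return static_cast<uint8_t>(std::min(std::max(y, clamp_min), clamp_max));
    }
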
diff --git a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
index 4ddb35f..eac9baa 100644
--- a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
+++ b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
@@ -28,7 +28,6 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
 #include "src/core/NEON/wrapper/traits.h"
 #include "src/core/NEON/wrapper/wrapper.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -98,6 +97,38 @@
     }
 };
 
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b)
+{
+    return vqrdmulhq_n_s32(a, b);
+}
+
+inline int32_t saturating_doubling_high_mul(const int32_t &a, const int32_t &b)
+{
+    return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
+}
+
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t &x, const int exponent)
+{
+    const int32x4_t shift = vdupq_n_s32(-exponent);
+    const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+    const int32x4_t fixed = vqaddq_s32(x, fixup);
+    return vrshlq_s32(fixed, shift);
+}
+
+inline int32x2_t rounding_divide_by_exp2(const int32x2_t &x, const int exponent)
+{
+    const int32x2_t shift = vdup_n_s32(-exponent);
+    const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
+    const int32x2_t fixed = vqadd_s32(x, fixup);
+    return vrshl_s32(fixed, shift);
+}
+
+inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent)
+{
+    const int32x2_t xs = vdup_n_s32(x);
+    return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
+}
+
 inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
 {
     const int32_t current_h  = base_h + h * dilation.y();
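
The helpers added above are the NEON counterparts of the same requantisation. A minimal usage sketch, assuming it sits in the same translation unit as those helpers on an AArch64 build (illustrative only):

    #include <arm_neon.h>

    // Requantise four int32 accumulators with a per-tensor multiplier/shift:
    // y = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, m), shift) + offset
    inline int32x4_t requantize_lanes(int32x4_t acc, int32_t multiplier, int shift, int32_t offset)
    {
        const int32x4_t y = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, multiplier), shift);
        return vaddq_s32(y, vdupq_n_s32(offset));
    }
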
diff --git a/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
new file mode 100644
index 0000000..f5c63b7
--- /dev/null
+++ b/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/AssemblyUtils.h"
+
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+
+#include "depthwise_common.hpp"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+constexpr unsigned int idx_width    = 1;
+constexpr unsigned int idx_height   = 2;
+constexpr unsigned int idx_channels = 0;
+constexpr unsigned int idx_batches  = 3;
+
+template <typename TSrc, typename TWeights, typename TDst>
+void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
+                    const ConvolutionInfo &info, const CPUInfo &cpu_info,
+                    std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel)
+{
+    unsigned int stride_cols{};
+    unsigned int stride_rows{};
+    std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
+
+    const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
+
+    const unsigned int n_batches  = src->dimension(idx_batches);
+    const unsigned int src_rows   = src->dimension(idx_height);
+    const unsigned int src_cols   = src->dimension(idx_width);
+    const unsigned int n_channels = src->dimension(idx_channels);
+    const unsigned int dst_rows   = dst->dimension(idx_height);
+    const unsigned int dst_cols   = dst->dimension(idx_width);
+
+    const unsigned int kernel_cols = weights->dimension(idx_width);
+    const unsigned int kernel_rows = weights->dimension(idx_height);
+
+    const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
+
+    arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
+                                            n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
+                                            padding, activation, nullptr);
+
+    // Configure assembly depthwise convolution kernel
+    auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args);
+    if(dwc_kernel_asm == nullptr)
+    {
+        // Configuration not supported: leave the kernel unconfigured
+        return;
+    }
+
+    kernel = std::move(dwc_kernel_asm);
+}
+
+template <typename TSrc, typename TWeights, typename TDst>
+void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
+                          const ConvolutionInfo &info, const CPUInfo &cpu_info,
+                          std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
+                          std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts)
+{
+    unsigned int stride_cols{};
+    unsigned int stride_rows{};
+    std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
+
+    const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
+
+    const unsigned int n_batches  = src->dimension(idx_batches);
+    const unsigned int src_rows   = src->dimension(idx_height);
+    const unsigned int src_cols   = src->dimension(idx_width);
+    const unsigned int n_channels = src->dimension(idx_channels);
+    const unsigned int dst_rows   = dst->dimension(idx_height);
+    const unsigned int dst_cols   = dst->dimension(idx_width);
+
+    const unsigned int kernel_cols = weights->dimension(idx_width);
+    const unsigned int kernel_rows = weights->dimension(idx_height);
+
+    const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
+
+    arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
+                                            n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
+                                            padding, activation, nullptr);
+
+    const auto src_qinfo     = src->quantization_info().uniform();
+    const auto weights_qinfo = weights->quantization_info();
+    const auto dst_qinfo     = dst->quantization_info().uniform();
+
+    const unsigned int num_filters = weights_qinfo.scale().size();
+
+    multipliers.resize(num_filters);
+    std::vector<int32_t> dst_shifts(num_filters);
+    quantization::compute_quantized_multipliers_and_shifts(src,
+                                                           weights,
+                                                           dst,
+                                                           multipliers.data(),
+                                                           dst_shifts.data());
+
+    // Quantize activation bounds
+    int32_t min_activation = std::numeric_limits<TSrc>::lowest();
+    int32_t max_activation = std::numeric_limits<TSrc>::max();
+    if(info.act_info.enabled())
+    {
+        std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo);
+    }
+
+    // Set quantization parameters for assembly kernels
+    arm_gemm::Requantize32 requant_args{};
+    if(is_data_type_quantized_per_channel(weights->data_type()))
+    {
+        left_shifts.resize(num_filters);
+        right_shifts.resize(num_filters);
+        bool need_left_shift = false; // Select more optimized path if left shift is not needed
+        for(unsigned int i = 0; i < num_filters; ++i)
+        {
+            left_shifts[i]  = std::max(-dst_shifts[i], static_cast<int32_t>(0));
+            right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0));
+            if(dst_shifts[i] < 0 && !need_left_shift)
+            {
+                need_left_shift = true;
+            }
+        }
+
+        requant_args = arm_gemm::Requantize32(nullptr,
+                                              0,
+                                              src_qinfo.offset,
+                                              weights_qinfo.uniform().offset,
+                                              dst_qinfo.offset,
+                                              (need_left_shift) ? left_shifts.data() : nullptr,
+                                              right_shifts.data(),
+                                              multipliers.data(),
+                                              static_cast<TSrc>(min_activation),
+                                              static_cast<TSrc>(max_activation));
+    }
+    else
+    {
+        requant_args = arm_gemm::Requantize32(nullptr,
+                                              0,
+                                              src_qinfo.offset,
+                                              weights_qinfo.uniform().offset,
+                                              dst_qinfo.offset,
+                                              -dst_shifts[0],
+                                              multipliers[0],
+                                              static_cast<TSrc>(min_activation),
+                                              static_cast<TSrc>(max_activation));
+    }
+
+    // Configure assembly depthwise convolution kernel with requantization
+    auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args);
+    if(dwc_kernel_asm == nullptr)
+    {
+        // Configuration not supported: leave the kernel unconfigured
+        return;
+    }
+
+    kernel = std::move(dwc_kernel_asm);
+}
+} // namespace
+
+CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel()
+    : _kernel_asm(nullptr),
+      _multipliers(),
+      _left_shifts(),
+      _right_shifts()
+{
+}
+
+CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default;
+
+void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst,
+                                                        const ConvolutionInfo &info, const CPUInfo &cpu_info)
+{
+    ARM_COMPUTE_UNUSED(cpu_info);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+
+    // Destination initialization if not yet initialized
+    const TensorShape dst_shape = compute_depthwise_convolution_shape(*src, *weights, info);
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
+
+#if defined(__aarch64__)
+    switch(src->data_type())
+    {
+        case DataType::QASYMM8:
+            if(is_data_type_quantized_per_channel(weights->data_type()))
+            {
+                create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
+            }
+            else
+            {
+                create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
+            }
+            break;
+        case DataType::QASYMM8_SIGNED:
+            create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
+            break;
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+        case DataType::F16:
+            create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm);
+            break;
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+        case DataType::F32:
+            create_arm_dwc<float, float, float>(src, weights, dst, info, cpu_info, _kernel_asm);
+            break;
+        default:
+            break;
+    }
+#endif // defined(__aarch64__)
+
+    Window win = calculate_max_window(*dst, Steps());
+    ICpuKernel::configure(win);
+}
+
+Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+
+#if !defined(__aarch64__)
+    ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
+#endif // !defined(__aarch64__)
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.dilation != Size2D(1, 1), "Assembly kernels do not support dilation != (1, 1)");
+
+    if(is_data_type_quantized_per_channel(weights->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+    }
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0));
+
+        if(is_data_type_quantized(src->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+        }
+    }
+
+    if(dst->total_size() > 0)
+    {
+        const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    }
+    return Status{};
+}
+
+void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_UNUSED(window);
+    ARM_COMPUTE_UNUSED(info);
+
+    ARM_COMPUTE_ERROR_ON(tensors.empty());
+
+    const ITensor *src       = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    ITensor       *dst       = tensors.get_tensor(TensorType::ACL_DST);
+    ITensor       *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
+    ITensor       *storage   = tensors.get_tensor(TensorType::ACL_INT_1);
+
+    const auto src_ptr        = src->buffer() + src->info()->offset_first_element_in_bytes();
+    auto       dst_ptr        = dst->buffer() + dst->info()->offset_first_element_in_bytes();
+    auto       working_space  = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
+    auto       parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes();
+
+    const auto src_shape   = src->info()->tensor_shape();
+    const auto dst_shape   = dst->info()->tensor_shape();
+    const auto src_padding = src->info()->padding();
+    const auto dst_padding = dst->info()->padding();
+
+    const size_t ld_src_col   = src_shape[0] + src_padding.left + src_padding.right;
+    const size_t ld_src_row   = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
+    const size_t ld_src_batch = ld_src_row * src_shape[2];
+    const size_t ld_dst_col   = dst_shape[0] + dst_padding.left + dst_padding.right;
+    const size_t ld_dst_row   = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
+    const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
+
+    _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch,
+                         parameters_ptr,
+                         dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
+                         working_space, info.thread_id, info.num_threads);
+}
+
+void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row)
+{
+    _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row);
+}
+
+size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_storage_size() const
+{
+    return _kernel_asm->get_storage_size();
+}
+
+size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads, unsigned int num_input_channels) const
+{
+    return _kernel_asm->get_working_size(num_threads, num_input_channels);
+}
+
+bool CpuDepthwiseConv2dAssemblyWrapperKernel::is_configured() const
+{
+    return _kernel_asm != nullptr;
+}
+
+const char *CpuDepthwiseConv2dAssemblyWrapperKernel::name() const
+{
+    return "CpuDepthwiseConv2dAssemblyWrapperKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
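
One detail worth restating from run_op() above: the leading dimensions handed to the assembly kernel come from the padded NHWC shapes, where Compute Library stores dimensions as [channels, width, height, batches] and X/Y element padding applies to dimensions 0 and 1. In isolation (an illustrative restatement, not library API):

    #include <cstddef>

    // Element strides between columns, rows and batches of an NHWC buffer,
    // including allocated left/right (X) and top/bottom (Y) padding.
    void leading_dims(std::size_t channels, std::size_t width, std::size_t height,
                      std::size_t pad_left, std::size_t pad_right,
                      std::size_t pad_top, std::size_t pad_bottom,
                      std::size_t &ld_col, std::size_t &ld_row, std::size_t &ld_batch)
    {
        ld_col   = channels + pad_left + pad_right;
        ld_row   = ld_col * (width + pad_top + pad_bottom);
        ld_batch = ld_row * height;
    }
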
diff --git a/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
new file mode 100644
index 0000000..8ff4444
--- /dev/null
+++ b/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H
+#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+// Forward declarations
+class IDepthwiseCommon;
+} // namespace depthwise
+} // namespace arm_conv
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** This class is a wrapper for the depthwise convolution assembly kernels. */
+class CpuDepthwiseConv2dAssemblyWrapperKernel final : public ICpuKernel
+{
+public:
+    /** Default constructor */
+    CpuDepthwiseConv2dAssemblyWrapperKernel();
+    ~CpuDepthwiseConv2dAssemblyWrapperKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyWrapperKernel);
+
+    /** Initialise the kernel's src and dst.
+     *
+     * @param[in]  src      Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  weights  Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+     *                      Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  bias     Bias tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                      Data type supported: same as @p src, S32 when @p src is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] dst      Destination tensor info. Data type supported: same as @p src.
+     * @param[in]  info     Depthwise convolution layer meta-data.
+     * @param[in]  cpu_info CPU information needed to select the most appropriate kernel.
+     */
+    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info);
+
+    /** Indicates whether or not this function can be used to process the given parameters.
+     *
+     * Similar to @ref CpuDepthwiseConv2dAssemblyWrapperKernel::configure()
+     *
+     * @return a status.
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
+
+    /** Pack bias and weights into the storage space for the assembly kernel.
+     *
+     * @param[in] parameters_ptr Pointer to storage space.
+     * @param[in] bias_ptr       Pointer to bias buffer.
+     * @param[in] weights_ptr    Pointer to weights buffer.
+     * @param[in] ld_weights_col Columns displacement for the weights tensor.
+     * @param[in] ld_weights_row Rows displacement for the weights tensor.
+     */
+    void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row);
+
+    /** Get the amount of storage space required for the rearranged weights and bias.
+     *
+     * @return size of the storage space
+     */
+    size_t get_storage_size() const;
+
+    /** Get size of the workspace needed by the assembly kernel.
+     *
+     * @param[in] num_threads        Maximum number of threads that are going to be spawned.
+     * @param[in] num_input_channels Number of channels of the input tensor.
+     *
+     * @return size of workspace
+     */
+    size_t get_working_size(unsigned int num_threads, unsigned int num_input_channels) const;
+
+    /** Was the asm kernel successfully configured?
+     *
+     * @return True if the asm kernel is configured and ready to run
+     */
+    bool is_configured() const;
+
+private:
+    std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> _kernel_asm;
+    std::vector<int32_t>                                   _multipliers{};
+    std::vector<int32_t>                                   _left_shifts{};
+    std::vector<int32_t>                                   _right_shifts{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H */
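
A condensed sketch of how a caller would drive this wrapper (hypothetical code; the real operator additionally sets up the ITensorPack and workspace tensors before run_op()):

    #include "src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"

    #include <cstdint>
    #include <vector>

    // Returns true if an assembly kernel exists for this configuration and was
    // configured; `storage` receives the packed-parameter buffer.
    bool try_configure(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *weights,
                       const arm_compute::ITensorInfo *bias, arm_compute::ITensorInfo *dst,
                       const arm_compute::ConvolutionInfo &info, const arm_compute::CPUInfo &cpu_info,
                       arm_compute::cpu::kernels::CpuDepthwiseConv2dAssemblyWrapperKernel &kernel,
                       std::vector<uint8_t> &storage)
    {
        const arm_compute::Status st =
            arm_compute::cpu::kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info);
        if(st.error_code() != arm_compute::ErrorCode::OK)
        {
            return false;
        }
        kernel.configure(src, weights, bias, dst, info, cpu_info);
        if(!kernel.is_configured())
        {
            return false; // no assembly implementation for this configuration
        }
        storage.resize(kernel.get_storage_size());
        // pack_parameters(...) is then called once with the bias/weights buffers
        // and their element strides, before the first run_op().
        return true;
    }
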
diff --git a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
index c78ffb9..89dd27a 100644
--- a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
+++ b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
@@ -43,11 +43,13 @@
 
 void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
 {
+    ARM_COMPUTE_UNUSED(cpu_info);
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
     // dst initialization if not yet initialized
     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info)));
 
+#if defined(__aarch64__)
     const bool requantize = src->quantization_info() != dst->quantization_info();
 
     switch(src->data_type())
@@ -83,6 +85,7 @@
         default:
             break;
     }
+#endif // defined(__aarch64__)
 
     Window win = calculate_max_window(*dst, Steps());
     INEKernel::configure(win);
@@ -192,7 +195,7 @@
     arm_conv::pooling::PoolingStride stride{};
     std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
 
-    const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+    const arm_conv::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
 
     constexpr unsigned int idx_width    = 1;
     constexpr unsigned int idx_height   = 2;
@@ -231,7 +234,7 @@
     arm_conv::pooling::PoolingStride stride{};
     std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
 
-    const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+    const arm_conv::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
 
     constexpr unsigned int idx_width    = 1;
     constexpr unsigned int idx_height   = 2;
diff --git a/src/core/utils/AssemblyUtils.cpp b/src/core/utils/AssemblyUtils.cpp
new file mode 100644
index 0000000..1e8a2a5
--- /dev/null
+++ b/src/core/utils/AssemblyUtils.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/utils/AssemblyUtils.h"
+
+namespace arm_compute
+{
+namespace assembly_utils
+{
+arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
+{
+    arm_gemm::Activation gemm_act;
+
+    // Early exit if the lower bound is non-zero, as this is not yet supported
+    if(act.b() != 0.f)
+    {
+        return gemm_act;
+    }
+
+    switch(act.activation())
+    {
+        case ActivationLayerInfo::ActivationFunction::RELU:
+            gemm_act.type = arm_gemm::Activation::Type::ReLU;
+            break;
+        case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+            gemm_act.type   = arm_gemm::Activation::Type::BoundedReLU;
+            gemm_act.param1 = act.a();
+            gemm_act.param2 = 0.f;
+            break;
+        case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+            gemm_act.type   = arm_gemm::Activation::Type::BoundedReLU;
+            gemm_act.param1 = act.a();
+            gemm_act.param2 = act.b();
+            break;
+        default:
+            gemm_act.type = arm_gemm::Activation::Type::None;
+    }
+
+    return gemm_act;
+}
+
+arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info)
+{
+    return arm_conv::PaddingValues{ pad_stride_info.pad_left(),
+                                    pad_stride_info.pad_top(),
+                                    pad_stride_info.pad_right(),
+                                    pad_stride_info.pad_bottom() };
+}
+} // namespace assembly_utils
+} // namespace arm_compute
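
A concrete example of the mapping (illustrative): ReLU6 written as LU_BOUNDED_RELU with upper bound a = 6 and lower bound b = 0 becomes a BoundedReLU for the assembly back end, while any activation with a non-zero lower bound takes the early exit and keeps the default-constructed activation:

    #include "src/core/utils/AssemblyUtils.h"

    // ReLU6 expressed as LU_BOUNDED_RELU with a = 6 (upper) and b = 0 (lower):
    const arm_compute::ActivationLayerInfo act(
        arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f, 0.f);
    const arm_gemm::Activation gemm_act = arm_compute::assembly_utils::map_to_arm_gemm_activation(act);
    // gemm_act.type == arm_gemm::Activation::Type::BoundedReLU, param1 == 6.f, param2 == 0.f
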
diff --git a/src/core/utils/AssemblyUtils.h b/src/core/utils/AssemblyUtils.h
new file mode 100644
index 0000000..e682973
--- /dev/null
+++ b/src/core/utils/AssemblyUtils.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef UTILS_CORE_ASSEMBLY_UTILS_H
+#define UTILS_CORE_ASSEMBLY_UTILS_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/kernels/assembly/common.hpp"
+#include "src/core/cpu/kernels/assembly/arm_gemm.hpp"
+
+namespace arm_compute
+{
+namespace assembly_utils
+{
+/** Performs a mapping between Compute Library ActivationLayerInfo and the assembly Activation structure.
+ *
+ * @param[in] act Compute Library activation info.
+ *
+ * @return Assembly activation info.
+ */
+arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act);
+
+/** Performs a mapping between Compute Library PadStrideInfo and the assembly PaddingValues structure.
+ *
+ * @param[in] pad_stride_info Compute Library padding and strides info.
+ *
+ * @return Assembly padding values.
+ */
+arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info);
+} // namespace assembly_utils
+} // namespace arm_compute
+#endif /* UTILS_CORE_ASSEMBLY_UTILS_H */
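
And the padding counterpart (illustrative):

    #include "src/core/utils/AssemblyUtils.h"

    // Symmetric pads pad_x = 1, pad_y = 2 become
    // PaddingValues{ left = 1, top = 2, right = 1, bottom = 2 }:
    const arm_compute::PadStrideInfo psi(1 /* stride_x */, 1 /* stride_y */, 1 /* pad_x */, 2 /* pad_y */);
    const arm_conv::PaddingValues pv = arm_compute::assembly_utils::map_to_arm_conv_padding(psi);
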
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 49e39f6..81e813c 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -185,16 +185,15 @@
 void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
                                               const ITensorInfo *weights,
                                               const ITensorInfo *output,
-                                              unsigned int       idx_ofms,
                                               int32_t           *output_multipliers_ptr,
                                               int32_t           *output_shifts_ptr)
 {
-    const unsigned int num_filters = is_data_type_quantized_per_channel(weights->data_type()) ? weights->dimension(idx_ofms) : 1;
-
     const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
     const QuantizationInfo        wq_info = weights->quantization_info();
     const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
 
+    const unsigned int num_filters = wq_info.scale().size();
+
     for(unsigned int i = 0; i < num_filters; ++i)
     {
         int32_t     output_multiplier = 0;
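
With the idx_ofms parameter gone, callers size their output arrays directly from the weights' quantisation info, which covers per-tensor weights (scale().size() == 1) and per-channel weights uniformly. A sketch of the new calling pattern (hypothetical helper):

    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

    #include <cstdint>
    #include <vector>

    void make_requant_params(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *weights,
                             const arm_compute::ITensorInfo *dst,
                             std::vector<int32_t> &multipliers, std::vector<int32_t> &shifts)
    {
        // Per-tensor weights -> one (multiplier, shift) pair; per-channel -> one per channel.
        const size_t num_filters = weights->quantization_info().scale().size();
        multipliers.resize(num_filters);
        shifts.resize(num_filters);
        arm_compute::quantization::compute_quantized_multipliers_and_shifts(
            src, weights, dst, multipliers.data(), shifts.data());
    }
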
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 6467caf..c7520cd 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -307,11 +307,9 @@
         {
             _output_multipliers.map();
             _output_shifts.map();
-            const unsigned int idx_ofms = _needs_permute ? 2 : 0;
             quantization::compute_quantized_multipliers_and_shifts(_input->info(),
                                                                    _original_weights->info(),
                                                                    _output->info(),
-                                                                   idx_ofms,
                                                                    reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
                                                                    reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
             _output_multipliers.unmap();
@@ -513,11 +511,9 @@
         {
             _output_multipliers.map();
             _output_shifts.map();
-            const unsigned int idx_ofms = _is_nhwc ? 0 : 2;
             quantization::compute_quantized_multipliers_and_shifts(_input->info(),
                                                                    _original_weights->info(),
                                                                    _output->info(),
-                                                                   idx_ofms,
                                                                    reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
                                                                    reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
             _output_multipliers.unmap();
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 3184d5d..188f3b8 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -352,7 +352,6 @@
         quantization::compute_quantized_multipliers_and_shifts(input->info(),
                                                                weights->info(),
                                                                output->info(),
-                                                               idx_kernels,
                                                                gemmlowp_output_stage.gemmlowp_multipliers.data(),
                                                                gemmlowp_output_stage.gemmlowp_shifts.data());
         gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
@@ -562,7 +561,6 @@
         quantization::compute_quantized_multipliers_and_shifts(input,
                                                                weights,
                                                                output,
-                                                               idx_kernels,
                                                                gemmlowp_output_stage.gemmlowp_multipliers.data(),
                                                                gemmlowp_output_stage.gemmlowp_shifts.data());
         gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index a561b88..daa5fd5 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -137,9 +137,10 @@
 
     // Allocate memory based on the internal memory requirements
     experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace();
-    _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size }, 1, DataType::S8), mem_req[0].alignment);
-    _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size }, 1, DataType::S8), mem_req[1].alignment);
-
+    _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size + mem_req[0].alignment }, 1, DataType::S8), mem_req[0].alignment);
+    _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size + mem_req[1].alignment }, 1, DataType::S8), mem_req[1].alignment);
+    _memory_group.manage(&_impl->workspace);
+    _memory_group.manage(&_impl->packed_weights);
     _impl->workspace.allocator()->allocate();
     _impl->packed_weights.allocator()->allocate();
 }
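
The workspace and packed-weights buffers are now over-allocated by their required alignment, presumably so that an aligned sub-buffer of the requested size can always be carved out regardless of where the allocator places the base pointer. The standard carving step looks like this (illustrative; assumes a power-of-two alignment):

    #include <cstddef>
    #include <cstdint>

    // With (size + alignment) bytes available, an aligned region of `size`
    // bytes always fits starting at align_up(base, alignment).
    inline void *align_up(void *base, std::size_t alignment)
    {
        const std::uintptr_t p = reinterpret_cast<std::uintptr_t>(base);
        return reinterpret_cast<void *>((p + alignment - 1) & ~(std::uintptr_t(alignment) - 1));
    }
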
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
index 160a9fd..f577e94 100644
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
+++ b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
@@ -62,8 +62,8 @@
 
     ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info));
 
-    //Validate Activation Layer
-    if(info.act_info.enabled())
+    // Validate Activation Layer
+    if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
     }
@@ -95,15 +95,7 @@
     _is_prepared  = false;
 
     // Configure pipeline
-    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
-    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(info.act_info);
-    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(info.act_info);
-    _is_activationlayer_enabled         = info.act_info.enabled() && !(is_relu || is_relu6);
-
-    if(!_is_activationlayer_enabled)
-    {
-        act_info_to_use = info.act_info;
-    }
+    _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
 
     _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
     if(_is_nchw)
@@ -359,7 +351,7 @@
     }
 
     // Validate Activation Layer
-    if(info.act_info.enabled())
+    if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
     }
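
The condition repeated across configure() and the two validate() overloads is the entire fusion policy: a separate CpuActivation pass is needed only when an activation is requested and the assembly dispatch cannot fuse it. Equivalently (illustrative restatement):

    // Run CpuActivation as a separate pass only when fusion is impossible:
    const bool needs_activation_pass =
        info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
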
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
index 049397f..ae9f894 100644
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
+++ b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
@@ -92,9 +92,8 @@
     *
     * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
     * -# @ref CpuDepthwiseConv2d3x3Kernel if 3x3 and no assembly kernel implementation is present
-    * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
-    * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of dst is required
-    * -# @ref NEActivationLayer if fused activation is required
+    * -# @ref CpuDepthwiseConv2dAssemblyDispatch if assembly kernel implementation is present
+    * -# @ref CpuActivation if fused activation is required
     *
     */
     class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index a36ee1d..660ac01 100644
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -24,315 +24,22 @@
 
 #include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
 
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/InfoHelpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
-#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp"
-#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp"
-#include "src/core/helpers/AutoConfiguration.h"
-
+#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-
-#include <set>
+#include "src/core/CPP/Validate.h"
+#include "src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/utils/AssemblyUtils.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-namespace
-{
-std::unique_ptr<depthwise::IDepthwiseConvolution> get_qasymm8_convolver(int kernel_size, int stride_x,
-                                                                        int n_batches, int in_rows, int in_cols, int n_channels,
-                                                                        int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
-                                                                        const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo,
-                                                                        const qasymm8::QAsymm8RescaleParams &rescale_params,
-                                                                        int padding_top, int padding_left, int padding_bottom, int padding_right)
-{
-    switch(kernel_size)
-    {
-        case 3:
-        {
-            switch(stride_x)
-            {
-                case 1:
-                    return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
-                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
-                case 2:
-                    return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
-                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
-                default:
-                    return nullptr;
-            }
-        }
-        case 5:
-        {
-            switch(stride_x)
-            {
-                case 1:
-                    return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>>(
-                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
-                case 2:
-                    return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>>(
-                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
-                default:
-                    return nullptr;
-            }
-        }
-        default:
-            return nullptr;
-    }
-}
-
-std::unique_ptr<depthwise::IDepthwiseConvolution> get_qsymm8_perchannel_convolver(int kernel_size, int stride_x,
-                                                                                  int n_batches, int in_rows, int in_cols, int n_channels,
-                                                                                  neon_convolution_kernels::ActivationFunction activation,
-                                                                                  const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo,
-                                                                                  const qsymm8::QSymm8PerChannelRescaleParams &rescale_params,
-                                                                                  int padding_top, int padding_left, int padding_bottom, int padding_right)
-{
-    switch(kernel_size)
-    {
-        case 3:
-        {
-            switch(stride_x)
-            {
-                case 1:
-                    return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
-                               n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
-                case 2:
-                    return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
-                               n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
-                default:
-                    return nullptr;
-            }
-        }
-        case 5:
-        {
-            switch(stride_x)
-            {
-                case 1:
-                    return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>>(
-                               n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
-                case 2:
-                    return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>>(
-                               n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
-                default:
-                    return nullptr;
-            }
-        }
-        default:
-            return nullptr;
-    }
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp16_convolver(int kernel_size, int stride_x,
-                                                                     int n_batches, int in_rows, int in_cols, int n_channels,
-                                                                     int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
-                                                                     int padding_top, int padding_left, int padding_bottom, int padding_right)
-{
-    switch(kernel_size)
-    {
-        case 3:
-        {
-            switch(stride_x)
-            {
-                case 1:
-                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
-                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
-                case 2:
-                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
-                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
-                default:
-                    return nullptr;
-            }
-        }
-        case 5:
-        {
-            switch(stride_x)
-            {
-                case 1:
-                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>>(
-                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
-                case 2:
-                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>>(
-                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
-                default:
-                    return nullptr;
-            }
-        }
-        default:
-            return nullptr;
-    }
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp32_convolver(int kernel_size, int stride_x,
-                                                                     int n_batches, int in_rows, int in_cols, int n_channels,
-                                                                     int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
-                                                                     int padding_top, int padding_left, int padding_bottom, int padding_right)
-{
-    switch(kernel_size)
-    {
-        case 3:
-        {
-            switch(stride_x)
-            {
-                case 1:
-                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
-                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
-                case 2:
-                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
-                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
-                default:
-                    return nullptr;
-            }
-        }
-        case 5:
-        {
-            switch(stride_x)
-            {
-                case 1:
-                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>>(
-                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
-                case 2:
-                    return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>>(
-                               n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
-                default:
-                    return nullptr;
-            }
-        }
-        default:
-            return nullptr;
-    }
-}
-
-std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensorInfo     *src,
-                                                                   const ITensorInfo     *weights,
-                                                                   ITensorInfo           *output,
-                                                                   const ConvolutionInfo &info)
-{
-    const DataType    data_type = src->data_type();
-    const TensorShape shape     = src->tensor_shape();
-
-    const int n_batches       = shape[3];
-    const int in_rows         = shape.z();
-    const int in_cols         = shape.y();
-    const int n_channels      = shape.x();
-    const int dilation_factor = info.dilation.x();
-    const int padding_top     = info.pad_stride_info.pad_top();
-    const int padding_left    = info.pad_stride_info.pad_left();
-    const int padding_bottom  = info.pad_stride_info.pad_bottom();
-    const int padding_right   = info.pad_stride_info.pad_right();
-
-    const bool is_uniform_quantized    = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8);
-    const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
-
-    const unsigned int stride_x    = info.pad_stride_info.stride().first;
-    const unsigned int kernel_size = weights->tensor_shape().y();
-
-    // Map activation function
-    neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None;
-    if(arm_compute::utils::info_helpers::is_relu(info.act_info))
-    {
-        activation = neon_convolution_kernels::ActivationFunction::ReLU;
-    }
-    else if(arm_compute::utils::info_helpers::is_relu6(info.act_info))
-    {
-        activation = neon_convolution_kernels::ActivationFunction::ReLU6;
-    }
-
-    // Create quantized convolver
-    if(is_uniform_quantized)
-    {
-        const UniformQuantizationInfo input_qinfo   = src->quantization_info().uniform();
-        const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform();
-        const UniformQuantizationInfo output_qinfo  = output->quantization_info().uniform();
-
-        // Check that quantization info are in the range [0, 255]
-        ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
-        ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255);
-        ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
-        const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
-        const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale };
-        const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };
-
-        // Calculate rescale parameters
-        const float fmultipler  = iqinfo.scale * wqinfo.scale / oqinfo.scale;
-        int32_t     qmultiplier = 0;
-        int32_t     qshift      = 0;
-        quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
-        qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler);
-
-        return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation,
-                                     wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
-    }
-    else if(is_perchannel_quantized)
-    {
-        const UniformQuantizationInfo input_qinfo   = src->quantization_info().uniform();
-        const QuantizationInfo        weights_qinfo = weights->quantization_info();
-        const UniformQuantizationInfo output_qinfo  = output->quantization_info().uniform();
-
-        // Check that quantization info are in the range [0, 255]
-        ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
-        ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
-        const qasymm8::QAsymm8Params         iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
-        const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() };
-        const qasymm8::QAsymm8Params         oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };
-
-        // Calculate rescale parameters
-        std::vector<float>   fmultipliers;
-        std::vector<int32_t> qmultipliers;
-        std::vector<int32_t> qshifts;
-
-        for(auto const s : wqinfo.scales)
-        {
-            const float fmultipler  = iqinfo.scale * s / oqinfo.scale;
-            int32_t     qmultiplier = 0;
-            int32_t     qshift      = 0;
-            quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
-            fmultipliers.push_back(fmultipler);
-            qmultipliers.push_back(qmultiplier);
-            qshifts.push_back(qshift);
-        }
-
-        qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers);
-
-        return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation,
-                                               wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
-    }
-    else
-    {
-        // Create float convolver
-        switch(data_type)
-        {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            case DataType::F16:
-            {
-                return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
-            }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            case DataType::F32:
-            {
-                return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
-            }
-            default:
-                return nullptr;
-        }
-    }
-}
-} // namespace
-
 struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl
 {
-    std::unique_ptr<depthwise::IDepthwiseConvolution> dwc_assembly_kernel{ nullptr };
-    NEDepthwiseConvolutionAssemblyKernelWrapper       dwc_acl_kernel{};
-    bool                                              is_prepared{ false };
-    experimental::MemoryRequirements                  mem_req{};
+    std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{ nullptr };
+    bool                                                              is_prepared{ false };
+    experimental::MemoryRequirements                                  mem_req{};
 };
 
 #ifndef DOXYGEN_SKIP_THIS
@@ -350,40 +57,30 @@
                                                    ITensorInfo           *dst,
                                                    const ConvolutionInfo &info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_UNUSED(bias);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dAssemblyDispatch::validate(src,
-                                                                            weights,
-                                                                            bias != nullptr ? bias : nullptr,
-                                                                            dst,
-                                                                            info));
+    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
+    const unsigned int num_threads = NEScheduler::get().num_threads();
+    _pImpl->is_prepared            = false;
 
-    // Output auto inizialitation if not yet initialized
-    const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
-    auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dst_shape).set_quantization_info(dst->quantization_info()));
+    // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
+    if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
+    {
+        return;
+    }
 
-    _pImpl->is_prepared = false;
+    auto dwc_wrapper = std::make_unique<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel>();
+    ARM_COMPUTE_ERROR_ON(dwc_wrapper == nullptr);
+    dwc_wrapper->configure(src, weights, bias, dst, info, ci);
 
-    // Create convolver
-    _pImpl->dwc_assembly_kernel = create_convolver(src, weights, dst, info);
-    ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr);
+    // Compute memory requirements for assembly kernels
+    constexpr size_t alignment = 4096;
+    _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads, src->dimension(0)), alignment });
+    _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment });
+    _pImpl->asm_kernel = std::move(dwc_wrapper);
+}
 
-    // Create assembly kernel wrapper
-    _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get());
-
-    constexpr size_t alignment = 128;
-
-    // Create workspace
-    const unsigned int num_threads    = NEScheduler::get().num_threads();
-    const size_t       workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads);
-    ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !");
-    _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment });
-
-    // Create packing tensor
-    const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size();
-    ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !");
-
-    _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment });
+Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
+{
+    return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info);
 }
 
 experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const
@@ -391,165 +88,40 @@
     return _pImpl->mem_req;
 }
 
-Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo     *src,
-                                                    const ITensorInfo     *weights,
-                                                    const ITensorInfo     *bias,
-                                                    const ITensorInfo     *dst,
-                                                    const ConvolutionInfo &info)
+bool CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
-    }
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
-
-    // Validate convolver
-    ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(src, weights, info));
-
-    // Validate activation
-    const bool is_relu  = arm_compute::utils::info_helpers::is_relu(info.act_info);
-    const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info);
-    ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6));
-
-    // Check bias
-    if(bias != nullptr)
-    {
-        unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx));
-    }
-
-    // Check output
-    if(dst->total_size() != 0)
-    {
-        const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
-    }
-
-    // The uniform quantization case will only have 1 scale value in the weights quantization info
-    const UniformQuantizationInfo src_qinfo     = src->quantization_info().uniform();
-    const QuantizationInfo        weights_qinfo = weights->quantization_info();
-    const UniformQuantizationInfo dst_qinfo     = dst->quantization_info().uniform();
-    for(auto const s : weights_qinfo.scale())
-    {
-        const float fmultipler = src_qinfo.scale * s / dst_qinfo.scale;
-        ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f);
-    }
-
-    return Status{};
-}
-
-bool CpuDepthwiseConv2dAssemblyDispatch::is_optimized_supported(const ITensorInfo     *src,
-                                                                const ITensorInfo     *weights,
-                                                                const ConvolutionInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
-
-    // Reshape input shape if in NHWC format
-    const DataLayout data_layout = src->data_layout();
-    TensorShape      in_shape{ src->tensor_shape() };
-    if(data_layout == DataLayout::NHWC)
-    {
-        in_shape.set(Window::DimX, src->tensor_shape().y());
-        in_shape.set(Window::DimY, src->tensor_shape().z());
-        in_shape.set(Window::DimZ, src->tensor_shape().x());
-    }
-
-    // Check data type
-    const DataType input_type            = src->data_type();
-    const bool     is_input_type_valid   = is_data_type_float(input_type) || input_type == DataType::QASYMM8;
-    const DataType weights_type          = weights->data_type();
-    const bool     is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED
-                                           || weights_type == DataType::QSYMM8_PER_CHANNEL;
-
-    // Check weighs size
-    std::set<unsigned int> supported_kernel_sizes = { 3, 5 };
-    const unsigned int     width_idx              = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int     height_idx             = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const unsigned int     kernel_w               = weights->dimension(width_idx);
-    const unsigned int     kernel_h               = weights->dimension(height_idx);
-    bool                   weights_supported      = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0);
-
-    // Check for supported strides
-    const auto &strides           = info.pad_stride_info.stride();
-    bool        supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
-
-    // Check for supported padding
-    const auto    pad_top           = info.pad_stride_info.pad_top();
-    const auto    pad_right         = info.pad_stride_info.pad_right();
-    const auto    pad_bottom        = info.pad_stride_info.pad_bottom();
-    const auto    pad_left          = info.pad_stride_info.pad_left();
-    PadStrideInfo same_pad          = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation);
-    bool          is_same_padding   = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
-    bool          is_valid_padding  = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
-    bool          supported_padding = is_same_padding || is_valid_padding;
-    // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported
-    bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1));
-
-    if(weights_type == DataType::QSYMM8_PER_CHANNEL)
-    {
-        is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U));
-    }
-
-    return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported;
+    arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation);
+    return act.type != arm_gemm::Activation::Type::None;
 }
 
 void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors)
 {
-    // Prepare assembly kernel
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
     prepare(tensors);
 
-    auto src       = tensors.get_tensor(TensorType::ACL_SRC_0);
-    auto workspace = tensors.get_tensor(TensorType::ACL_INT_0);
-    auto dst       = tensors.get_tensor(TensorType::ACL_DST);
-
-    // Setup inputs/outputs
-    ARM_COMPUTE_ERROR_ON(workspace == nullptr && workspace->buffer() == nullptr);
-    _pImpl->dwc_assembly_kernel->set_working_space(static_cast<void *>(workspace->buffer()));
-
-    ARM_COMPUTE_ERROR_ON(workspace->buffer() == nullptr);
-    const int   input_element_size = src->info()->element_size();
-    const int   input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size;
-    const int   input_row_stride   = src->info()->strides_in_bytes().z() / input_element_size;
-    const int   input_col_stride   = src->info()->strides_in_bytes().y() / input_element_size;
-    const void *input_ptr          = src->buffer() + src->info()->offset_first_element_in_bytes();
-    _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
-
-    ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr);
-    const int output_element_size = dst->info()->element_size();
-    const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size;
-    const int output_row_stride   = dst->info()->strides_in_bytes().z() / output_element_size;
-    const int output_col_stride   = dst->info()->strides_in_bytes().y() / output_element_size;
-    void     *output_ptr          = dst->buffer() + dst->info()->offset_first_element_in_bytes();
-    _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
-
-    // Schedule assembly kernel
-    NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX);
+    NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), Window::DimY, _pImpl->asm_kernel->window(), tensors);
 }
 
 void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)
 {
     if(!_pImpl->is_prepared)
     {
-        auto weights        = tensors.get_const_tensor(TensorType::ACL_SRC_1);
-        auto bias           = tensors.get_const_tensor(TensorType::ACL_SRC_2);
-        auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1);
-
-        ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr);
-
         // Pack weights and bias
-        const int weights_element_size = weights->info()->element_size();
-        const int weights_row_stride   = weights->info()->strides_in_bytes().z() / weights_element_size;
-        const int weights_col_stride   = weights->info()->strides_in_bytes().y() / weights_element_size;
-        _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(),
-                                                 weights->buffer() + weights->info()->offset_first_element_in_bytes(),
-                                                 weights_row_stride,
-                                                 weights_col_stride,
-                                                 (bias != nullptr) ? bias->buffer() : nullptr);
-        _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer());
+        const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+        const ITensor *bias    = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+        ITensor       *storage = tensors.get_tensor(TensorType::ACL_INT_1);
+
+        const auto weights_ptr    = weights->buffer() + weights->info()->offset_first_element_in_bytes();
+        const auto bias_ptr       = (bias) ? bias->buffer() + bias->info()->offset_first_element_in_bytes() : nullptr;
+        auto       parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes();
+
+        const auto weights_shape   = weights->info()->tensor_shape();
+        const auto weights_padding = weights->info()->padding();
+
+        const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right;
+        const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom);
+        _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row);
 
         weights->mark_as_unused();
         if(bias != nullptr)
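
prepare() above derives the weights' leading dimensions from the padded tensor shape, in elements, rather than from byte strides. The same arithmetic as a standalone sketch with a worked example (types simplified, not ACL API):

    #include <cstddef>

    struct PaddingSize { std::size_t left, right, top, bottom; };

    // Elements between consecutive rows: dimension 0 plus its element padding.
    inline std::size_t ld_weights_col(std::size_t dim0, const PaddingSize &pad)
    {
        return dim0 + pad.left + pad.right;
    }

    // Elements between consecutive planes: one padded row length times the
    // padded row count.
    inline std::size_t ld_weights_row(std::size_t dim0, std::size_t dim1, const PaddingSize &pad)
    {
        return ld_weights_col(dim0, pad) * (dim1 + pad.top + pad.bottom);
    }

    // E.g. dim0 = 3, dim1 = 3, pad = {0, 1, 0, 0}:
    //   ld_weights_col = 3 + 0 + 1       = 4 elements
    //   ld_weights_row = 4 * (3 + 0 + 0) = 12 elements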
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
index 195942b..7084516 100644
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
+++ b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H
-#define ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H
+#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H
+#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H
 
 #include "src/core/common/Macros.h"
 #include "src/runtime/cpu/ICpuOperator.h"
@@ -40,15 +40,15 @@
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch);
     /** Default destructor */
     ~CpuDepthwiseConv2dAssemblyDispatch();
-
     /** Initialize the function's source, destination, kernels and border_size.
      *
      * @note Supports only NHWC format
      *
-     * @param[in]  src     Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
-     * @param[in]  weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p src.
+     * @param[in]  src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  weights Weights tensor info. These are 3D tensors with shape [W, H, IFM].
+     *                     Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
      * @param[in]  bias    (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
-     *                     Data type supported: Same as @p src.
+     *                     Data type supported: same as @p src or S32 if @p src is quantized.
      * @param[out] dst     Destination tensor info. Data type supported: same as @p src.
      * @param[in]  info    Depthwise convolution meta-data.
      */
@@ -60,18 +60,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
-    /** Check if the optimized kernel can be used for the given kernel sizes and strides
+    /** Checks if activation is supported by the assembly kernels
      *
-     * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC
+     * @param[in] activation Activation to check
      *
-     * @param[in] src     Input tensor info.
-     * @param[in] weights Weights tensor info.
-     * @param[in] info    Depthwise convolution meta-data.
-     *
-     * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed.
+     * @return True if the activation is supported by the assembly kernels, false otherwise
      */
-    static bool is_optimized_supported(const ITensorInfo *src, const ITensorInfo *weights, const ConvolutionInfo &info);
-
+    static bool is_activation_supported(const ActivationLayerInfo &activation);
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
     void prepare(ITensorPack &tensors) override;
@@ -83,4 +78,4 @@
 };
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H */
+#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H */
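
Putting the reworked interface together, a hypothetical call sequence for the operator declared above (the tensor-pack plumbing and the MemoryInfo field names are assumed, not shown in this patch):

    // CpuDepthwiseConv2dAssemblyDispatch dwc;
    // dwc.configure(src_info, weights_info, bias_info, dst_info, conv_info);
    //
    // // One aligned buffer per workspace() entry: ACL_INT_0 holds the
    // // per-thread working space, ACL_INT_1 the packed weights/bias storage.
    // for(const auto &req : dwc.workspace()) { /* allocate req.size bytes at req.alignment */ }
    //
    // dwc.prepare(tensors); // one-off packing of weights/bias into ACL_INT_1
    // dwc.run(tensors);     // schedules the assembly wrapper kernel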
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index ea3742f..1101e05 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -27,6 +27,7 @@
 #include "src/core/CPP/Validate.h"
 #include "src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
 #include "src/core/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/core/utils/AssemblyUtils.h"
 
 #include <arm_neon.h>
 #include <cstdlib>
@@ -89,38 +90,6 @@
     return p;
 }
 
-arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
-{
-    arm_gemm::Activation gemm_act;
-
-    // Early exit in case lower bound is other than 0, as it's not yet supported
-    if(act.b() != 0.f)
-    {
-        return gemm_act;
-    }
-
-    switch(act.activation())
-    {
-        case ActivationLayerInfo::ActivationFunction::RELU:
-            gemm_act.type = arm_gemm::Activation::Type::ReLU;
-            break;
-        case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
-            gemm_act.type   = arm_gemm::Activation::Type::BoundedReLU;
-            gemm_act.param1 = act.a();
-            gemm_act.param2 = 0.f;
-            break;
-        case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-            gemm_act.type   = arm_gemm::Activation::Type::BoundedReLU;
-            gemm_act.param1 = act.a();
-            gemm_act.param2 = act.b();
-            break;
-        default:
-            gemm_act.type = arm_gemm::Activation::Type::None;
-    }
-
-    return gemm_act;
-}
-
 IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type)
 {
     // Schedule assembly kernel
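
The map_to_arm_gemm_activation() helper removed in the hunk above is not deleted outright: it moves behind the assembly_utils namespace in the newly included src/core/utils/AssemblyUtils.h, so the depthwise and GEMM dispatchers share one definition. Its behaviour, reconstructed from the removed body (namespace qualification of the shared helper is assumed):

    // RELU            -> arm_gemm::Activation::Type::ReLU
    // BOUNDED_RELU    -> BoundedReLU with param1 = a, param2 = 0
    // LU_BOUNDED_RELU -> BoundedReLU with param1 = a, param2 = b, reachable
    //                    only when b == 0 because of the early exit
    // any other function, or a lower bound b != 0 -> Type::None
    arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(act_info);
    const bool fusable = (act.type != arm_gemm::Activation::Type::None); // None == cannot fuse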
@@ -788,14 +757,14 @@
 
 bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
 {
-    arm_gemm::Activation act = map_to_arm_gemm_activation(activation);
+    arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation);
     return act.type != arm_gemm::Activation::Type::None;
 }
 
 void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
-    arm_gemm::Activation act = map_to_arm_gemm_activation(info.activation_info);
+    arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
 
     //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
     if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))